While using dask.distributed, I'm trying to load a Dask DataFrame from a CSV on S3 inside a delayed function, like this:
from dask import delayed
import dask.dataframe as dd

@delayed
def func1():
    ...
    return dd.read_csv(s3_url, ...)
read_csv() does not need to interact with the distributed client, so I assumed this would work. Then, on the client machine, I compute the delayed object returned by func1:
res = func1()
future = client.compute(res)
progress(future)
frame = client.gather(future)
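For completeness, client and progress above come from dask.distributed and are set up roughly like this (the scheduler address here is just a placeholder):

from dask.distributed import Client, progress

client = Client('tcp://scheduler-host:8786')  # placeholder scheduler address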
Up to that point everything looks good, and printing the result gives:
Dask DataFrame Structure:
                 COL1    COL2
npartitions=9
               object  object
                  ...     ...
...               ...     ...
                  ...     ...
                  ...     ...
Dask Name: from-delayed, 27 tasks
However, it fails with

Failed to serialize (<dask.bytes.core.OpenFile object at ...>, ..., ..., '\n'). Exception: can't pickle thread.lock objects

when I try to process it further, e.g.

client.compute(frame)
Is there a way to get this scheme to work, or am I missing some more fundamental limitation here?
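For reference, here is a minimal sketch of the whole flow as I'm running it; the scheduler address, bucket path, and read_csv arguments are placeholders:

from dask import delayed
from dask.distributed import Client, progress
import dask.dataframe as dd

client = Client('tcp://scheduler-host:8786')  # placeholder scheduler address

@delayed
def func1():
    # build the dask dataframe lazily, inside the delayed function
    return dd.read_csv('s3://my-bucket/data/*.csv')  # placeholder S3 path

res = func1()
future = client.compute(res)
progress(future)
frame = client.gather(future)  # frame is a dask DataFrame; printing it gives the structure above

client.compute(frame)          # this is the call that raises the serialization error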
PS. Here is the full error log that I'm getting:
.pickle - Failed to serialize (<dask.bytes.core.OpenFile object at ...>, 20971520, 10485760, '\n'). Exception: can't pickle thread.lock objects
ERROR:2017-11-10 15:31:31:root:Exception while executing graph: can't pickle thread.lock objects
Traceback (most recent call last):
...
client.compute(res.data)
File ".../python2.7/site-packages/distributed/client.py", line 2089, in compute
resources=resources)
File ".../python2.7/site-packages/distributed/client.py", line 1906, in _graph_to_futures
'tasks': valmap(dumps_task, dsk3),
File ".../python2.7/site-packages/toolz-0.8.2-py2.7.egg/toolz/dicttoolz.py", line 84, in valmap
rv.update(zip(iterkeys(d), map(func, itervalues(d))))
File ".../python2.7/site-packages/distributed/worker.py", line 731, in dumps_task
'args': pickle.dumps(task[1:])}
File ".../python2.7/site-packages/distributed/protocol/pickle.py", line 51, in dumps
return cloudpickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 829, in dumps
cp.dump(obj)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 233, in dump
return Pickler.dump(self, obj)
File "...python2.7/pickle.py", line 224, in dump
self.save(obj)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 568, in save_tuple
save(element)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 564, in save_instancemethod
obj=obj)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 709, in save_reduce
save(args)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 554, in save_tuple
save(element)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 692, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 554, in save_tuple
save(element)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File ".../python2.7/site-packages/cloudpickle/cloudpickle.py", line 727, in save_reduce
save(state)
File "...python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "...python2.7/pickle.py", line 655, in save_dict
self._batch_setitems(obj.iteritems())
File "...python2.7/pickle.py", line 687, in _batch_setitems
save(v)
File "...python2.7/pickle.py", line 306, in save
rv = reduce(self.proto)
TypeError: can't pickle thread.lock objects