0

I am trying to use Numba with Dask for a simple groupby operation on a dataset. It works fine on a single system, but when I try to run it on a distributed cluster it raises an error that I am unable to get past. Please help. Thanks in advance.
This is the code.

    import pandas as pd
    import time
    import dask as dask
    import dask.distributed as distributed
    import dask.dataframe as dd
    import dask.delayed as delayed
    from dask.distributed import Client,progress
    from numba import jit

    @jit
    def group_sum(data_frame):
        """Group ``data_frame`` by ``col1`` and sum ``col2`` within each group.

        ``split_out=10`` is forwarded to ``sum()``; the caller below passes a
        dask DataFrame, where it presumably sets the number of output
        partitions (confirm against the dask version in use).

        Returns the (lazy) grouped-sum frame produced by that call.
        """
        # NOTE(review): the body is only DataFrame method calls, with no
        # numeric loops or array math for numba to compile, so ``@jit`` is
        # unlikely to give any speedup here -- confirm it is still wanted.
        return data_frame.groupby(['col1'])[['col2']].sum(split_out=10)

   # Connect to the running scheduler and show what the client sees.
   client = Client('IP:PORT')
   print(client)
   print(client.scheduler_info())

   # Read the CSV locally in fixed-size chunks and scatter each chunk to the
   # cluster, keeping the returned futures in ``f``.
   f = []
   chunksize = 10 ** 6
   for chunk in pd.read_csv('file.csv', chunksize=chunksize):
       f_in = client.scatter(chunk)
       f.append(f_in)

   # Build one dask DataFrame from the scattered chunks and apply the
   # groupby/sum defined above.
   ddf = dd.from_delayed(f)
   ddf = group_sum(ddf)

   # Submit the resulting partitions, show progress, then gather the results
   # back to the client.
   c = ddf.to_delayed()
   future = client.compute(c)
   progress(future)
   result = client.gather(future)
   print(result)

The error traceback is as follows:

    f_in = client.scatter(chunk)
    File "/usr/local/lib/python2.7/site-packages/distributed/client.py", line 1773, in scatter
asynchronous=asynchronous, hash=hash)
    File "/usr/local/lib/python2.7/site-packages/distributed/client.py", line 652, in sync
return sync(self.loop, func, *args, **kwargs)
    File "/usr/local/lib/python2.7/site-packages/distributed/utils.py", line 275, in sync
six.reraise(*error[0])
    File "/usr/local/lib/python2.7/site-packages/distributed/utils.py", line 260, in f
result[0] = yield make_coro()
   File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1099, in run
value = future.result()
   File "/usr/local/lib/python2.7/site-packages/tornado/concurrent.py", line 260, in result
raise_exc_info(self._exc_info)
   File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1107, in run
yielded = self.gen.throw(*exc_info)
   File "/usr/local/lib/python2.7/site-packages/distributed/client.py", line 1641, in _scatter
timeout=timeout)
   File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1099, in run
value = future.result()
   File "/usr/local/lib/python2.7/site-packages/tornado/concurrent.py", line 260, in result
raise_exc_info(self._exc_info)
   File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1107, in run
yielded = self.gen.throw(*exc_info)
   File "/usr/local/lib/python2.7/site-packages/distributed/core.py", line 552, in send_recv_from_rpc
result = yield send_recv(comm=comm, op=key, **kwargs)
   File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1099, in run
value = future.result()
   File "/usr/local/lib/python2.7/site-packages/tornado/concurrent.py", line 260, in result
raise_exc_info(self._exc_info)
   File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1113, in run
yielded = self.gen.send(value)
   File "/usr/local/lib/python2.7/site-packages/distributed/core.py", line 446, in send_recv
six.reraise(*clean_exception(**response))
   File "/usr/local/lib/python2.7/site-packages/distributed/core.py", line 321, in handle_comm
result = yield result
   File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1099, in run
value = future.result()
   File "/usr/local/lib/python2.7/site-packages/tornado/concurrent.py", line 260, in result
raise_exc_info(self._exc_info)
   File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1107, in run
yielded = self.gen.throw(*exc_info)
   File "/usr/local/lib/python2.7/site-packages/distributed/scheduler.py", line 2155, in scatter
report=False)
   File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1099, in run
value = future.result()
  File "/usr/local/lib/python2.7/site-packages/tornado/concurrent.py", line 260, in result
raise_exc_info(self._exc_info)
  File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1107, in run
yielded = self.gen.throw(*exc_info)
  File "/usr/local/lib/python2.7/site-packages/distributed/utils_comm.py", line 128, in scatter_to_workers
for address, v in d.items()])
  File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1099, in run
value = future.result()
  File "/usr/local/lib/python2.7/site-packages/tornado/concurrent.py", line 260, in result
raise_exc_info(self._exc_info)
  File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1107, in run
yielded = self.gen.throw(*exc_info)
  File "/usr/local/lib/python2.7/site-packages/distributed/utils.py", line 208, in All
result = yield tasks.next()
  File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1099, in run
value = future.result()
  File "/usr/local/lib/python2.7/site-packages/tornado/concurrent.py", line 260, in result
raise_exc_info(self._exc_info)
  File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1107, in run
yielded = self.gen.throw(*exc_info)
  File "/usr/local/lib/python2.7/site-packages/distributed/core.py", line 610, in send_recv_from_rpc
result = yield send_recv(comm=comm, op=key, **kwargs)
  File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1099, in run
value = future.result()
  File "/usr/local/lib/python2.7/site-packages/tornado/concurrent.py", line 260, in result
raise_exc_info(self._exc_info)
  File "/usr/local/lib/python2.7/site-packages/tornado/gen.py", line 1113, in run
yielded = self.gen.send(value)
  File "/usr/local/lib/python2.7/site-packages/distributed/core.py", line 446, in send_recv
six.reraise(*clean_exception(**response))
  File "<string>", line 3, in reraise
  TypeError: raise: arg 3 must be a traceback or None
Sweta
  • 63
  • 3
  • 13
  • 2
    Please try to reduce this to a minimal example, also, we don't have your data. Note that your numba function doesn't appear to be numba friendly and likely doesn't achieve any speedup. – mdurant Aug 22 '18 at 12:57
  • how can I improve the numba function and why am I getting this error ? – Sweta Aug 22 '18 at 13:18
  • a) read the numba documentation on its use cases b) I don't know, it looks complicated. – mdurant Aug 22 '18 at 14:05
  • http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports – MRocklin Sep 02 '18 at 12:57

0 Answers0