I am trying to implement a time fold function to be 'map'ed to various partitions of a dask dataframe which in turn changes the shape of the dataframe in question (or alternatively produces a new dataframe with the altered shape). This is how far I have gotten. The result 'res' returned on compute is a list of 3 delayed objects. When I try to compute each of them in a loop (last tow lines of code) this results in a "TypeError: 'DataFrame' object is not callable" After going through the examples for map_partitions, I also tried altering the input DF (inplace) in the function with no return value which causes a similar TypeError with NoneType. What am I missing?
Also, looking at the visualization (attached) I feel like there is a need for reducing the individually computed (folded) partitions into a single DF. How do I do this?
#! /usr/bin/env python
# Start dask scheduler and workers
# dask-scheduler &
# dask-worker --nthreads 1 --nprocs 6 --memory-limit 3GB localhost:8786 --local-directory /dev/shm &
from dask.distributed import Client
from dask.delayed import delayed
import pandas as pd
import numpy as np
import dask.dataframe as dd
import math
foldbucketsecs=30
periodicitysecs=15
secsinday=24 * 60 * 60
chunksizesecs=60 # 1 minute
numts = 5
start = 1525132800 # 01/05
end = 1525132800 + (3 * 60) # 3 minute
c = Client('127.0.0.1:8786')
def fold(df, start, bucket):
return df
def reduce_folds(df):
return df
def load(epoch):
idx = []
for ts in range(0, chunksizesecs, periodicitysecs):
idx.append(epoch + ts)
d = np.random.rand(chunksizesecs/periodicitysecs, numts)
ts = []
for i in range(0, numts):
tsname = "ts_%s" % (i)
ts.append(tsname)
gts.append(tsname)
res = pd.DataFrame(index=idx, data=d, columns=ts, dtype=np.float64)
res.index = pd.to_datetime(arg=res.index, unit='s')
return res
gts = []
load(start)
cols = len(gts)
idx1 = pd.DatetimeIndex(start=start, freq=('%sS' % periodicitysecs), end=start+periodicitysecs, dtype='datetime64[s]')
meta = pd.DataFrame(index=idx1[:0], data=[], columns=gts, dtype=np.float64)
dfs = [delayed(load)(fn) for fn in range(start, end, chunksizesecs)]
from_delayed = dd.from_delayed(dfs, meta, 'sorted')
nfolds = int(math.ceil((end - start)/foldbucketsecs))
cprime = nfolds * cols
gtsnew = []
for i in range(0, cprime):
gtsnew.append("ts_%s,fold=%s" % (i%cols, i/cols))
idx2 = pd.DatetimeIndex(start=start, freq=('%sS' % periodicitysecs), end=start+foldbucketsecs, dtype='datetime64[s]')
meta = pd.DataFrame(index=idx2[:0], data=[], columns=gtsnew, dtype=np.float64)
folded_df = from_delayed.map_partitions(delayed(fold)(from_delayed, start, foldbucketsecs), meta=meta)
result = c.submit(reduce_folds, folded_df)
c.gather(result).visualize(filename='/usr/share/nginx/html/svg/df4.svg')
res = c.gather(result).compute()
for f in res:
f.compute()