This is a textbook question on how to add two DataFrames using Dask (specifically with fold)... I can't seem to get it to work though, so I wanted to reach out to see what I'm doing wrong.
(I'm on Python 3.8.5 with Dask 2021.4.1)
The code below shows my intentions:
from dask import delayed, bag
import pandas as pd
def get_df1():
    """Return the first sample DataFrame (columns 'a' and 'b', three rows).

    Note that the DataFrame is returned bare, not wrapped in a container.
    """
    return pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
def get_df2():
    """Return the second sample DataFrame (columns 'a' and 'b', three rows).

    Its values mirror those of ``get_df1`` so that the element-wise sum of
    the two frames is constant per column (4 for 'a', 10 for 'b').
    """
    return pd.DataFrame({'a': [3, 2, 1], 'b': [6, 5, 4]})
def addit(a, b):
    """Return ``a + b``.

    Works for any pair of operands that support ``+``: DataFrames are added
    element-wise, while strings are concatenated.
    """
    return a + b
if __name__ == '__main__':
    # Without dask: plain element-wise DataFrame addition.
    y = addit(get_df1(), get_df2())
    print(y)
    # The above code prints the desired answer:
    #    a   b
    # 0  4  10
    # 1  4  10
    # 2  4  10

    # With dask/delayed + bag + fold
    xs = [delayed(get_df1)(), delayed(get_df2)()]
    b1 = bag.from_delayed(xs)
    y = b1.fold(addit)
    print(y.compute())
    # This prints an unexpected result
    # abab
    # NOTE(review): from_delayed iterates each delayed result to obtain the
    # bag's elements, and iterating a DataFrame yields its column labels
    # ('a', 'b'), so fold ends up concatenating strings instead of adding
    # DataFrames.
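Answer (per comment below): `bag.from_delayed` treats the result of each delayed object as a sequence of bag elements. Iterating a DataFrame yields its column labels, so the original bag contained the strings 'a', 'b', 'a', 'b', and `fold(addit)` concatenated them into 'abab'. Wrapping each DataFrame in a list makes the DataFrame itself the single element of its partition: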
from dask import delayed, bag
import pandas as pd
def get_df1():
    """Return the first sample DataFrame wrapped in a one-element list.

    The list wrapper matters when the result is fed to ``bag.from_delayed``:
    the bag iterates over whatever is returned here, so the DataFrame must be
    an item of a sequence rather than the sequence itself.
    """
    return [pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})]  # Now a list
def get_df2():
    """Return the second sample DataFrame wrapped in a one-element list.

    The list wrapper matters when the result is fed to ``bag.from_delayed``:
    the bag iterates over whatever is returned here, so the DataFrame must be
    an item of a sequence rather than the sequence itself.
    """
    return [pd.DataFrame({'a': [3, 2, 1], 'b': [6, 5, 4]})]  # Now a list
def addit(a, b):
    """Return ``a + b``.

    Used as the binary operator for the bag fold below; for DataFrame
    operands the addition is element-wise.
    """
    return a + b
if __name__ == '__main__':
    # Without dask: unpack the one-element lists so addit receives the
    # DataFrames themselves rather than the list wrappers.
    y = addit(*get_df1(), *get_df2())
    print(y)
    # The above code prints the desired answer:
    #    a   b
    # 0  4  10
    # 1  4  10
    # 2  4  10

    # With dask/delayed + bag + fold
    # Each delayed call now evaluates to a list holding one DataFrame, so
    # the bag's elements are the DataFrames themselves.
    xs = [delayed(get_df1)(), delayed(get_df2)()]
    b1 = bag.from_delayed(xs)
    y = b1.fold(addit)
    print(y.compute())
    # The above code now also prints the desired answer:
    #    a   b
    # 0  4  10
    # 1  4  10
    # 2  4  10