I have two DataFrames in pandas. I would like to merge them, but I keep running into memory errors. What workaround could I use?
Here is the setup:
import pandas as pd
# Load both CSV files into memory as pandas DataFrames.
df1 = pd.read_csv("first1.csv")
df2 = pd.read_csv("second2.csv")
print(df1.shape) # output: (4757076, 4)
print(df2.shape) # output: (428764, 45)
# Show the first rows of df1; head is a method, so it has to be called.
df1.head()
column1 begin end category
0 class1 10001 10468 third
1 class1 10469 11447 third
2 class1 11505 11675 fourth
3 class2 15265 15355 seventh
4 class2 15798 15849 second
print(df2.shape) # output: (428764, 45); the table below is presumably the first rows of df2
column1 begin ....
0 class1 10524 ....
1 class1 10541 ....
2 class1 10549 ....
3 class1 10565 ...
4 class1 10596 ...
I would simply like to merge these two DataFrames on "column1". However, this always causes a memory error.
Let's try this in pandas first, on a system with approximately 2 TB of RAM and hundreds of threads:
import pandas as pd
# Load both CSV files, then attempt a full outer join on "column1".
df1 = pd.read_csv("first1.csv")
df2 = pd.read_csv("second2.csv")
# Overlapping column names coming from df2 receive the "_repeated" suffix.
merged = pd.merge(df1, df2, on="column1", how="outer", suffixes=("","_repeated"))
Here's the error I get:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/tools/merge.py", line 39, in merge
return op.get_result()
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/tools/merge.py", line 217, in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/tools/merge.py", line 353, in _get_join_info
sort=self.sort, how=self.how)
File "/nfs/sw/python/python-3.5.1/lib/python3.5/site-packages/pandas/tools/merge.py", line 559, in _get_join_indexers
return join_func(lkey, rkey, count, **kwargs)
File "pandas/src/join.pyx", line 160, in pandas.algos.full_outer_join (pandas/algos.c:61256)
MemoryError
That didn't work. Let's try with dask:
import pandas as pd
import dask.dataframe as dd
from numpy import nan
# Wrap the already-loaded pandas frames as two-partition dask dataframes.
ddf1 = dd.from_pandas(df1, npartitions=2)
ddf2 = dd.from_pandas(df2, npartitions=2)
# Describe the outer join lazily; nothing is evaluated until compute().
lazy_join = dd.merge(
    ddf1,
    ddf2,
    on="column1",
    how="outer",
    suffixes=("", "_repeat"),
)
# Materialise the joined result as a pandas DataFrame.
merged = lazy_join.compute(num_workers=60)
Here's the error I get:
Traceback (most recent call last):
File "repeat_finder.py", line 15, in <module>
merged = dd.merge(ddf1, ddf2,on="column1", how="outer", suffixes=("","_repeat")).compute(num_workers=60)
File "/path/python3.5/site-packages/dask/base.py", line 78, in compute
return compute(self, **kwargs)[0]
File "/path/python3.5/site-packages/dask/base.py", line 178, in compute
results = get(dsk, keys, **kwargs)
File "/path/python3.5/site-packages/dask/threaded.py", line 69, in get
**kwargs)
File "/path/python3.5/site-packages/dask/async.py", line 502, in get_async
raise(remote_exception(res, tb))
dask.async.MemoryError:
Traceback
---------
File "/path/python3.5/site-packages/dask/async.py", line 268, in execute_task
result = _execute_task(task, data)
File "/path/python3.5/site-packages/dask/async.py", line 249, in _execute_task
return func(*args2)
File "/path/python3.5/site-packages/dask/dataframe/methods.py", line 221, in merge
suffixes=suffixes, indicator=indicator)
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 59, in merge
return op.get_result()
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 503, in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 667, in _get_join_info
right_indexer) = self._get_join_indexers()
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 647, in _get_join_indexers
how=self.how)
File "/path/python3.5/site-packages/pandas/tools/merge.py", line 876, in _get_join_indexers
return join_func(lkey, rkey, count, **kwargs)
File "pandas/src/join.pyx", line 226, in pandas._join.full_outer_join (pandas/src/join.c:11286)
File "pandas/src/join.pyx", line 231, in pandas._join._get_result_indexer (pandas/src/join.c:11474)
File "path/python3.5/site-packages/pandas/core/algorithms.py", line 1072, in take_nd
out = np.empty(out_shape, dtype=dtype, order='F')
How could I get this to work, even if it were shamelessly inefficient?
EDIT: In response to the suggestion of merging on two columns/indices, I don't think I can do this. Here is the code I am trying to run:
import pandas as pd
import dask.dataframe as dd
# Load both CSV files eagerly, then wrap them as two-partition dask dataframes.
df1 = pd.read_csv("first1.csv")
df2 = pd.read_csv("second2.csv")
ddf1 = dd.from_pandas(df1, npartitions=2)
ddf2 = dd.from_pandas(df2, npartitions=2)
# Outer join on "column1"; overlapping column names from df2 get "_repeat".
merged = dd.merge(ddf1, ddf2, on="column1", how="outer", suffixes=("","_repeat")).compute(num_workers=60)
# Keep only rows where df2's begin ("begin_repeat") lies inside df1's
# [begin, end] interval. Equality on column1 is already guaranteed by the
# merge key, and the mask has to be built from the merged frame's own columns
# so that it aligns with the rows being filtered.
merged = merged[(merged.begin_repeat >= merged.begin) & (merged.begin_repeat <= merged.end)]
# Join the surviving rows back against df2 on column1 and write the result.
merged = dd.merge(ddf2, merged, on = ["column1"]).compute(num_workers=60)
merged.to_csv("output.csv", index=False)