Problem
I used a `defaultdict` to specify the dtype of each column when reading a CSV file with `pd.read_csv`. When I then tried to forward-fill (`ffill`) the DataFrame, I got the error below. I think the error is caused by the dtype, but I have to specify it because my data is too large (1394265 rows × 300 columns) and I don't have enough memory otherwise. How can I solve the problem and `ffill` correctly?
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-42-feb22c34d9cb> in <cell line: 4>()
2 dtype['timestamp']=np.int64
3 table1=pd.read_csv(data_root+'err_reprdc_sample.csv',index_col=0,dtype=dtype)
----> 4 table1.ffill()
14 frames
/usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
329 stacklevel=find_stack_level(),
330 )
--> 331 return func(*args, **kwargs)
332
333 # error: "Callable[[VarArg(Any), KwArg(Any)], Any]" has no
/usr/local/lib/python3.10/dist-packages/pandas/core/frame.py in ffill(self, axis, inplace, limit, downcast)
11781 downcast: dict | None = None,
11782 ) -> DataFrame | None:
> 11783 return super().ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
11784
11785 @overload
/usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
329 stacklevel=find_stack_level(),
330 )
--> 331 return func(*args, **kwargs)
332
333 # error: "Callable[[VarArg(Any), KwArg(Any)], Any]" has no
/usr/local/lib/python3.10/dist-packages/pandas/core/generic.py in ffill(self, axis, inplace, limit, downcast)
6985 Object with missing values filled or None if ``inplace=True``.
6986 """
-> 6987 return self.fillna(
6988 method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
6989 )
/usr/local/lib/python3.10/dist-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
329 stacklevel=find_stack_level(),
330 )
--> 331 return func(*args, **kwargs)
332
333 # error: "Callable[[VarArg(Any), KwArg(Any)], Any]" has no
/usr/local/lib/python3.10/dist-packages/pandas/core/frame.py in fillna(self, value, method, axis, inplace, limit, downcast)
5633 downcast: dict | None = None,
5634 ) -> DataFrame | None:
-> 5635 return super().fillna(
5636 value=value,
5637 method=method,
/usr/local/lib/python3.10/dist-packages/pandas/core/generic.py in fillna(self, value, method, axis, inplace, limit, downcast)
6811 return result
6812
-> 6813 new_data = self._mgr.interpolate(
6814 method=method,
6815 axis=axis,
/usr/local/lib/python3.10/dist-packages/pandas/core/internals/managers.py in interpolate(self, **kwargs)
420
421 def interpolate(self: T, **kwargs) -> T:
--> 422 return self.apply("interpolate", **kwargs)
423
424 def shift(self: T, periods: int, axis: int, fill_value) -> T:
/usr/local/lib/python3.10/dist-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
350 applied = b.apply(f, **kwargs)
351 else:
--> 352 applied = getattr(b, f)(**kwargs)
353 except (TypeError, NotImplementedError):
354 if not ignore_failures:
/usr/local/lib/python3.10/dist-packages/pandas/core/internals/blocks.py in interpolate(self, method, axis, index, inplace, limit, limit_direction, limit_area, fill_value, downcast, **kwargs)
1256 data = cast(np.ndarray, data) # bc overridden by ExtensionBlock
1257
-> 1258 missing.interpolate_array_2d(
1259 data,
1260 method=method,
/usr/local/lib/python3.10/dist-packages/pandas/core/missing.py in interpolate_array_2d(data, method, axis, index, limit, limit_direction, limit_area, fill_value, coerce, downcast, **kwargs)
237 raise ValueError("Cannot pass both fill_value and method")
238
--> 239 interpolate_2d(
240 data,
241 method=m,
/usr/local/lib/python3.10/dist-packages/pandas/core/missing.py in interpolate_2d(values, method, axis, limit, limit_area)
813 # _pad_2d and _backfill_2d both modify tvalues inplace
814 if method == "pad":
--> 815 _pad_2d(tvalues, limit=limit)
816 else:
817 _backfill_2d(tvalues, limit=limit)
/usr/local/lib/python3.10/dist-packages/pandas/core/missing.py in new_func(values, limit, mask)
847 return result.view(values.dtype), mask
848
--> 849 return func(values, limit=limit, mask=mask)
850
851 return cast(F, new_func)
/usr/local/lib/python3.10/dist-packages/pandas/core/missing.py in _pad_2d(values, limit, mask)
879
880 if np.all(values.shape):
--> 881 algos.pad_2d_inplace(values, mask, limit=limit)
882 else:
883 # for test coverage
/usr/local/lib/python3.10/dist-packages/pandas/_libs/algos.pyx in pandas._libs.algos.__pyx_fused_cpdef()
TypeError: No matching signature found
Reproduce the problem
import pandas as pd
import numpy as np
from collections import defaultdict
# Minimal reproduction: read the sample CSV with a defaultdict dtype mapping,
# then forward-fill the resulting DataFrame.

# Directory that contains the sample CSV; adjust to your local path.
data_root='./'
# Columns without an explicit entry fall back to the defaultdict's default,
# which is built from np.float16 (chosen to reduce memory use).
dtype=defaultdict(np.float16)
# The 'timestamp' column is the one explicit override: it is read as int64.
dtype['timestamp']=np.int64
# index_col=0 makes the first CSV column the index
# (presumably 'timestamp', judging by the sample table — verify against the file).
table1=pd.read_csv(data_root+'err_reprdc_sample.csv',index_col=0,dtype=dtype)
# This call raises "TypeError: No matching signature found" from
# pandas._libs.algos (reached via pad_2d_inplace in the traceback).
table1.ffill()
You can download the sample file called 'err_reprdc_sample' and adjust `data_root` so that it points to the directory containing it.
| timestamp | 000001.XSHE | 000002.XSHE | 000596.XSHE |
|---|---|---|---|
| 20220628092500000 | 14.47 | | |
| 20220628092500010 | 18.5 | | |
| 20220628092500020 | 234.1 | | |
| 20220628092500040 | | | |
| 20220628092500050 | | | |
What I have tried
# Attempted workaround: read the same file without passing the dtype mapping
# (no dtype argument is given here), then forward-fill.
table1=pd.read_csv(data_root+'err_reprdc_sample.csv',index_col=0)
table1.ffill()
Alternatively, changing `np.float16` to `np.float32` works fine, but it takes too much memory.