I have a 32GB machine, the csv file is 1 million rows by 4 columns (800MB). When I run the code Python only uses up about 1GB of my memory, but I get a memory error:
MemoryError: Unable to allocate array with shape (23459822,) and data type int64
NOTE: the problem only occurs on Windows; on Ubuntu the problem vanishes with the exact same code.
The code in question:
elif light in entry:
df = pandas.read_csv('maps_android_light_raw_20190909.csv')
for i,g in df.groupby('device_id'):
output_file2 = path+f'{i}/LIGHT/'
if not os.path.exists(output_file2):
os.makedirs(output_file2)
g.to_csv(output_file2 + f'{i}.csv', index = False)
del df
The full traceback:
Traceback (most recent call last):
File "light.py", line 49, in <module>
main()
File "light.py", line 33, in main
for i,g in df2:
File "C:\Python37\lib\site-packages\pandas\core\groupby\ops.py", line 164, in get_iterator
for key, (i, group) in zip(keys, splitter):
File "C:\Python37\lib\site-packages\pandas\core\groupby\ops.py", line 899, in __iter__
sdata = self._get_sorted_data()
File "C:\Python37\lib\site-packages\pandas\core\groupby\ops.py", line 918, in _get_sorted_data
return self.data.take(self.sort_idx, axis=self.axis)
File "pandas/_libs/properties.pyx", line 34, in pandas._libs.properties.CachedProperty.__get__
File "C:\Python37\lib\site-packages\pandas\core\groupby\ops.py", line 896, in sort_idx
return get_group_index_sorter(self.labels, self.ngroups)
File "C:\Python37\lib\site-packages\pandas\core\sorting.py", line 349, in get_group_index_sorter
sorter, _ = algos.groupsort_indexer(ensure_int64(group_index), ngroups)
File "pandas/_libs/algos.pyx", line 173, in pandas._libs.algos.groupsort_indexer
MemoryError: Unable to allocate array with shape (23459822,) and data type int64