I have a huge dataset with millions of entries (it is a normal .csv file, and I get no errors when I load it with pandas). Pandas struggles to load the dataset, so I decided to use modin, which apparently lets you use multiple processes by changing only one line of code. When I run:
train_df = pd.read_csv("train_data.csv")
I get the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-16-03696c0c8326> in <module>
----> 1 train_df = pd.read_csv("train_data.csv")
2 train_df
~\anaconda3\lib\site-packages\modin\logging\logger_function.py in run_and_log(*args, **kwargs)
63 """
64 if LogMode.get() == "disable":
---> 65 return f(*args, **kwargs)
66
67 logger = get_logger()
~\anaconda3\lib\site-packages\modin\pandas\io.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, skipfooter, doublequote, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
138 _, _, _, f_locals = inspect.getargvalues(inspect.currentframe())
139 kwargs = {k: v for k, v in f_locals.items() if k in _pd_read_csv_signature}
--> 140 return _read(**kwargs)
141
142
~\anaconda3\lib\site-packages\modin\pandas\io.py in _read(**kwargs)
59
60 squeeze = kwargs.pop("squeeze", False)
---> 61 pd_obj = FactoryDispatcher.read_csv(**kwargs)
62 # This happens when `read_csv` returns a TextFileReader object for iterating through
63 if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
~\anaconda3\lib\site-packages\modin\core\execution\dispatching\factories\dispatcher.py in read_csv(cls, **kwargs)
183 @_inherit_docstrings(factories.BaseFactory._read_csv)
184 def read_csv(cls, **kwargs):
--> 185 return cls.__factory._read_csv(**kwargs)
186
187 @classmethod
~\anaconda3\lib\site-packages\modin\core\execution\dispatching\factories\factories.py in _read_csv(cls, **kwargs)
215 )
216 def _read_csv(cls, **kwargs):
--> 217 return cls.io_cls.read_csv(**kwargs)
218
219 @classmethod
~\anaconda3\lib\site-packages\modin\logging\logger_function.py in run_and_log(*args, **kwargs)
63 """
64 if LogMode.get() == "disable":
---> 65 return f(*args, **kwargs)
66
67 logger = get_logger()
~\anaconda3\lib\site-packages\modin\core\io\file_dispatcher.py in read(cls, *args, **kwargs)
151 postprocessing work on the resulting query_compiler object.
152 """
--> 153 query_compiler = cls._read(*args, **kwargs)
154 # TODO (devin-petersohn): Make this section more general for non-pandas kernel
155 # implementations.
~\anaconda3\lib\site-packages\modin\logging\logger_function.py in run_and_log(*args, **kwargs)
63 """
64 if LogMode.get() == "disable":
---> 65 return f(*args, **kwargs)
66
67 logger = get_logger()
~\anaconda3\lib\site-packages\modin\core\io\text\text_file_dispatcher.py in _read(cls, filepath_or_buffer, **kwargs)
1053 )
1054 f.seek(old_pos)
-> 1055 splits = cls.partitioned_file(
1056 f,
1057 num_partitions=NPartitions.get(),
~\anaconda3\lib\site-packages\modin\logging\logger_function.py in run_and_log(*args, **kwargs)
63 """
64 if LogMode.get() == "disable":
---> 65 return f(*args, **kwargs)
66
67 logger = get_logger()
~\anaconda3\lib\site-packages\modin\core\io\text\text_file_dispatcher.py in partitioned_file(cls, f, num_partitions, nrows, skiprows, quotechar, is_quoting, encoding, newline, header_size, pre_reading)
270 file_size = cls.file_size(f)
271
--> 272 rows_skipper(header_size)
273
274 if pre_reading:
~\anaconda3\lib\site-packages\modin\core\io\text\text_file_dispatcher.py in skipper(n)
496 return 0
497 else:
--> 498 return cls._read_rows(
499 f,
500 quotechar=quotechar,
~\anaconda3\lib\site-packages\modin\logging\logger_function.py in run_and_log(*args, **kwargs)
63 """
64 if LogMode.get() == "disable":
---> 65 return f(*args, **kwargs)
66
67 logger = get_logger()
~\anaconda3\lib\site-packages\modin\core\io\text\text_file_dispatcher.py in _read_rows(cls, f, nrows, quotechar, is_quoting, outside_quotes, encoding, newline)
392 iterator = f
393
--> 394 for line in iterator:
395 if is_quoting and line.count(quotechar) % 2:
396 outside_quotes = not outside_quotes
TypeError: 'LocalFileOpener' object is not iterable
Here is the full code:
import modin.pandas as pd
train_df = pd.read_csv("train_data.csv")
From my research, this apparently has something to do with the installed fsspec version, as far as I could understand.
I get the same error when I run this code:
import fsspec
file_path = r"./train_data.csv"
file = fsspec.open(file_path).open()
for line in file:
print(line)
Error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-15-3fc27fe187bb> in <module>
4 file = fsspec.open(file_path).open()
5
----> 6 for line in file:
7 print(line)
TypeError: 'LocalFileOpener' object is not iterable
Some version information:
fsspec version: 0.7.4
modin version: 0.15.2
Windows version: 11
Python version: 3.8.3
Can someone help me understand what the problem is and how I can solve it?
Thanks in advance :)