I have the following code to read a gzipped CSV file from bytes. It works with pandas.read_csv, but it fails with dask (dd.read_csv). The file at d['urls'][0] is a link to a file on Amazon S3 provided by a third-party service.
import io

import requests
import pandas as pd
import dask.dataframe as dd

# Download the gzipped CSV into an in-memory buffer
output = io.BytesIO()
output.name = "chunk_1.csv.gz"
with requests.get(d['urls'][0], stream=True) as resp:
    resp.raise_for_status()
    for chunk in resp.iter_content(chunk_size=None):
        if chunk:
            output.write(chunk)
output.seek(0)

dd.read_csv(output, compression='gzip', blocksize=None)  # doesn't work
pd.read_csv(output, compression='gzip')  # works
Traceback:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-13-39441d60668b> in <module>
13 output.seek(0)
14
---> 15 dd.read_csv(output, compression='gzip', blocksize=None) #Doesn't work
16
17 pd.read
~/opt/anaconda3/lib/python3.8/site-packages/dask/dataframe/io/csv.py in read(urlpath, blocksize, lineterminator, compression, sample, enforce, assume_missing, storage_options, include_path_column, **kwargs)
698 **kwargs,
699 ):
--> 700 return read_pandas(
701 reader,
702 urlpath,
~/opt/anaconda3/lib/python3.8/site-packages/dask/dataframe/io/csv.py in read_pandas(reader, urlpath, blocksize, lineterminator, compression, sample, enforce, assume_missing, storage_options, include_path_column, **kwargs)
533 sample = blocksize
534 b_lineterminator = lineterminator.encode()
--> 535 b_out = read_bytes(
536 urlpath,
537 delimiter=b_lineterminator,
~/opt/anaconda3/lib/python3.8/site-packages/dask/bytes/core.py in read_bytes(urlpath, delimiter, not_zero, blocksize, sample, compression, include_path, **kwargs)
93 """
94 if not isinstance(urlpath, (str, list, tuple, os.PathLike)):
---> 95 raise TypeError("Path should be a string, os.PathLike, list or tuple")
96
97 fs, fs_token, paths = get_fs_token_paths(urlpath, mode="rb", storage_options=kwargs)
TypeError: Path should be a string, os.PathLike, list or tuple
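From the traceback, dask's read_bytes only accepts a path-like urlpath (a string, os.PathLike, list, or tuple), so passing a BytesIO buffer can't work the way it does in pandas. A minimal workaround sketch, assuming writing to local disk is acceptable (temp_path is just an illustrative name), is to spill the downloaded bytes to a temporary file and give dask the path:

import shutil
import tempfile

# Spill the in-memory buffer to a named temporary file so that
# dask gets a real filesystem path instead of a file-like object
with tempfile.NamedTemporaryFile(suffix=".csv.gz", delete=False) as tmp:
    output.seek(0)
    shutil.copyfileobj(output, tmp)
    temp_path = tmp.name

# blocksize=None keeps one partition per file, which is required
# for gzip because the format is not splittable
ddf = dd.read_csv(temp_path, compression='gzip', blocksize=None)

I'd prefer to avoid the disk round-trip, though.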
The URL I'm trying to get the file from looks like https://user-ad-revenue.s3.amazonaws.com/data/XXXX/uar/tables/mediation/XXXX%3Dv3/publisher_id%XXXXX/application_id%XXXXX/day%3D2020-12-27/report.csv.gz?AWSAccessKeyId=XXXXX&Expires=1609150335&Signature=XXXXX
Reading directly over HTTP with dask, dd.read_csv(d['urls'][0], compression='gzip', blocksize=None), raises

BadGzipFile: Not a gzipped file (b'<?')

whereas the same URL works with pd.read_csv.
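The b'<?' looks like the start of an XML error document from S3 rather than gzip data, so my assumption is that dask's HTTP layer alters or re-requests the presigned URL in a way that invalidates the signature. Since the plain requests download works, a sketch of a workaround is to wrap the pandas read in dask.delayed and assemble the dask dataframe from those pieces (download_csv is a hypothetical helper name):

import io

import dask
import dask.dataframe as dd
import pandas as pd
import requests

@dask.delayed
def download_csv(url):
    # Fetch one presigned URL with requests (which works here)
    # and let pandas handle the gzip decompression
    resp = requests.get(url)
    resp.raise_for_status()
    return pd.read_csv(io.BytesIO(resp.content), compression='gzip')

# One partition per URL; without an explicit meta, dask may compute
# one partition up front to infer the column types
ddf = dd.from_delayed([download_csv(u) for u in d['urls']])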