For a research project, I need to convert Reddit data from 2018 from .zst to parquet in order to share the data with my partners. To do this, I modified Watchful1's .zst-to-.csv converter. I made the following changes:
- Lines 67-72: create the parquet schema fields that are relevant for the table
- Line 73: create the ParquetWriter using the above schema
- Lines 79-83: take the data from the relevant ndjson fields and put it into a dictionary so that it can be turned into a pyarrow table and then written to the parquet file (condensed sketch right after this list)
- Some miscellaneous print statements for debugging
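Condensed, the parquet-writing path I added looks roughly like this (a trimmed restatement of the full script further down, with pyarrow.parquet imported as pq here and made-up sample values for obj):

import pyarrow as pa
import pyarrow.parquet as pq

# schema for the four fields I care about
my_schema = pa.schema([
    pa.field('author', pa.large_string()),
    pa.field('title', pa.large_string()),
    pa.field('subreddit', pa.large_string()),
    pa.field('selftext', pa.large_string())
])
writer = pq.ParquetWriter("example.parquet", schema=my_schema)

# for each decoded ndjson line: copy the wanted fields into a one-row dict,
# wrap it in a pyarrow table, and append it to the parquet file
obj = {"author": "someone", "title": "a post", "subreddit": "test",
       "selftext": "body text", "created_utc": 1514764800}  # made-up sample line
row = [{field: obj[field] for field in ("author", "title", "subreddit", "selftext")}]
writer.write_table(pa.Table.from_pylist(row, schema=my_schema))
writer.close()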
When I run the code on smaller .zst files, everything seems to work perfectly. However, when I tried running it on one of the actual Reddit backups (which is 2.42 GB), the code stopped printing status updates at around 30-40% and never printed the "Complete" message at the end. I imagine it has something to do with the file size, or the fact that I left it running for a few hours, but I'm honestly not sure. What am I doing wrong?
Here is the code, which is run with the following args:
python3 to_csv.py reddit/submissions/RS_2018-01.zst newOutput/submissions/RS_2018-01.parquet author,title,subreddit,selftext
# this converts a zst file to a parquet file
#
# it's important to note that the resulting file will likely be quite large
# and you probably won't be able to open it in excel or another csv reader
#
# arguments are inputfile, outputfile, fields
# call this like
# python to_csv.py wallstreetbets_submissions.zst wallstreetbets_submissions.parquet author,selftext,title
import zstandard
import os
import json
import sys
import csv
from datetime import datetime
import logging.handlers
import pyarrow.parquet as pd
import pyarrow as pa
log = logging.getLogger("bot")
log.setLevel(logging.DEBUG)
log.addHandler(logging.StreamHandler())
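# read chunk_size bytes from the decompression stream and decode them to text;
# if the chunk ends mid-character, keep reading until it decodes (or give up
# once max_window_size bytes have been read)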
def read_and_decode(reader, chunk_size, max_window_size, previous_chunk=None, bytes_read=0):
    chunk = reader.read(chunk_size)
    bytes_read += chunk_size
    if previous_chunk is not None:
        chunk = previous_chunk + chunk
    try:
        return chunk.decode()
    except UnicodeDecodeError:
        if bytes_read > max_window_size:
            raise UnicodeError(f"Unable to decode frame after reading {bytes_read:,} bytes")
        return read_and_decode(reader, chunk_size, max_window_size, chunk, bytes_read)
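# stream-decompress the .zst dump and yield one ndjson line at a time, along
# with the current position in the compressed file (used for the % progress)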
def read_lines_zst(file_name):
    with open(file_name, 'rb') as file_handle:
        buffer = ''
        reader = zstandard.ZstdDecompressor(max_window_size=2**31).stream_reader(file_handle)
        while True:
            chunk = read_and_decode(reader, 2**27, (2**29) * 2)
            if not chunk:
                break
            lines = (buffer + chunk).split("\n")
            for line in lines[:-1]:
                yield line, file_handle.tell()
            buffer = lines[-1]
        print("donewwhileloop")
        reader.close()
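# parse the arguments, build the parquet schema and writer, then stream the dump
# line by line, writing each submission as its own single-row table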
if __name__ == "__main__":
    input_file_path = sys.argv[1]
    output_file_path = sys.argv[2]
    fields = sys.argv[3].split(",")
    file_size = os.stat(input_file_path).st_size
    file_lines = 0
    file_bytes_processed = 0
    line = None
    created = None
    bad_lines = 0
    output_file = open(output_file_path, "wb")
    my_schema = pa.schema([
        pa.field('author', pa.large_string()),
        pa.field('title', pa.large_string()),
        pa.field('subreddit', pa.large_string()),
        pa.field('selftext', pa.large_string())
    ])
    writer = pd.ParquetWriter(output_file, schema=my_schema)
    try:
        for line, file_bytes_processed in read_lines_zst(input_file_path):
            try:
                obj = json.loads(line)
                output_obj = [{}]
                for field in fields:
                    output_obj[0][field] = obj[field]
                ourTable = pa.Table.from_pylist(output_obj, schema=my_schema)
                writer.write_table(ourTable)
                created = datetime.utcfromtimestamp(int(obj['created_utc']))
            except json.JSONDecodeError as err:
                bad_lines += 1
                print("bad line")
            file_lines += 1
            if file_lines % 100000 == 0:
                log.info(f"{created.strftime('%Y-%m-%d %H:%M:%S')} : {file_lines:,} : {bad_lines:,} : {(file_bytes_processed / file_size) * 100:.0f}%")
    except KeyError as err:
        log.info(f"Object has no key: {err}")
        log.info(line)
    except Exception as err:
        log.info(err)
        log.info(line)
    print("done")
    writer.close()
    output_file.close()
    log.info(f"Complete : {file_lines:,} : {bad_lines:,}")
I tried it on a smaller file, and the resulting parquet opened perfectly. I also added various print statements, which showed that the script was reading the .zst correctly and writing the rows into the parquet file.
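(In case it's relevant, this is roughly how I've been checking the small output file; the path here is just a placeholder:)

import pyarrow.parquet as pq

# read the generated parquet back and eyeball the first few rows
table = pq.read_table("newOutput/submissions/test_small.parquet")
print(table.num_rows)
print(table.slice(0, 5).to_pydict())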
EDIT: I tried it again and got the same result; here is the console output:
D:\Reddit Data>python3 to_csv.py reddit/submissions/RS_2018-01.zst newOutput/submissions/RS_2018-01.parquet author,title,subreddit,selftext
2018-01-01 08:36:07 : 100,000 : 0 : 1%
2018-01-01 17:59:14 : 200,000 : 0 : 2%
2018-01-02 00:13:14 : 300,000 : 0 : 3%
2018-01-02 07:35:53 : 400,000 : 0 : 4%
2018-01-02 15:59:06 : 500,000 : 0 : 5%
2018-01-02 21:20:03 : 600,000 : 0 : 5%
2018-01-03 03:03:10 : 700,000 : 0 : 7%
2018-01-03 11:10:54 : 800,000 : 0 : 8%
2018-01-03 17:44:17 : 900,000 : 0 : 8%
2018-01-03 22:56:23 : 1,000,000 : 0 : 9%
2018-01-04 05:05:15 : 1,100,000 : 0 : 10%
2018-01-04 13:38:00 : 1,200,000 : 0 : 11%
2018-01-04 19:09:57 : 1,300,000 : 0 : 12%
2018-01-05 01:04:43 : 1,400,000 : 0 : 12%
2018-01-05 07:49:19 : 1,500,000 : 0 : 13%
2018-01-05 15:44:16 : 1,600,000 : 0 : 15%
2018-01-05 20:51:32 : 1,700,000 : 0 : 15%
2018-01-06 02:37:28 : 1,800,000 : 0 : 16%
2018-01-06 11:08:28 : 1,900,000 : 0 : 17%
2018-01-06 18:28:24 : 2,000,000 : 0 : 18%
2018-01-07 00:19:18 : 2,100,000 : 0 : 19%
2018-01-07 07:52:00 : 2,200,000 : 0 : 20%
2018-01-07 16:56:06 : 2,300,000 : 0 : 20%