I am trying to read an x-bzip2 file from an S3 streaming object (a botocore StreamingBody). The file is too large to fit in memory, so I need to decompress it, read it line by line, and process it in chunks.
I have managed to read a chunk of a specified size (5 MB, 10 MB), but I need to read the whole file, not just the first chunk. How can I do that?
Class that reads a single chunk of chunk_size bytes:
import bz2
import json

class ParseStreamingZip(object):
    def __init__(self, obj, chunk_size=(1024 * 1024 * 5)):
        "chunk_size: bytes to read per parse() call (default: 5 MB)"
        self.meta = obj
        self.chunk_size = chunk_size
        self.streaming_obj = obj['Body']
        self.streaming_obj.set_socket_timeout(9999999)

    def parse(self, byte_size=None, dec='utf8') -> list:
        "Reads lines totalling roughly `byte_size` bytes from the bz2 stream and JSON-decodes each one with encoding `dec`."
        byte_size = byte_size if byte_size is not None else self.chunk_size
        output = []
        with bz2.BZ2File(self.streaming_obj, 'rb') as f:
            content = f.readlines(byte_size)  # byte_size is only a size hint
            for line in content:
                try:
                    jline = json.loads(line.decode(dec).strip('\n'))
                    output.append(jline)
                except Exception as e:
                    print('Caught:', e)
        return output
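If I understand the docs correctly, readlines(byte_size) treats its argument as a size hint and returns once roughly that many bytes' worth of lines have been collected, which would explain why only the first chunk comes back. A minimal sketch of draining the whole stream instead, assuming the same streaming_obj and byte_size as above (untested):

import bz2

with bz2.BZ2File(streaming_obj, 'rb') as f:
    while True:
        lines = f.readlines(byte_size)  # size hint, not a hard limit
        if not lines:
            break                       # decompressed stream exhausted
        for line in lines:
            ...                         # decode/parse each line as in parse()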
Code that ends up processing only the first chunk (5 MB):
filename = 'wls_day-78.bz2'
# get the data
response_obj = s3.get_object(Bucket=dataset_metadata['bucket'], Key=filename)
print('+ object received.')
# capture object info
content_type = response_obj['ResponseMetadata']['HTTPHeaders']['content-type']
content_length = int(response_obj['ResponseMetadata']['HTTPHeaders']['content-length'])
print('content type & length:', content_type, content_length, "({:.1f} MB)".format(content_length / 1024 / 1024))
streaming_body = response_obj['Body']
streaming_body.set_socket_timeout(999999) # set a long timeout
chunks = 10
chunksize = content_length // chunks
print(chunks,'chunks of size:', chunksize)
print('Leftover bytes: ', content_length - (chunks*chunksize))
>>> + object received.
>>> content type & length: application/x-bzip2 493878811 (471.0 MB)
>>> 10 chunks of size: 49387881
>>> Leftover bytes: 1
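In case it's relevant: since bzip2 can't be decompressed starting from an arbitrary offset, I assume those ranges would have to be fetched in order and fed through a single bz2.BZ2Decompressor. A rough sketch using S3's Range parameter, reusing chunksize and content_length from above (the line-parsing step is elided):

import bz2

decomp = bz2.BZ2Decompressor()
for start in range(0, content_length, chunksize):
    end = min(start + chunksize, content_length) - 1  # Range is inclusive
    part = s3.get_object(Bucket=dataset_metadata['bucket'], Key=filename,
                         Range='bytes={}-{}'.format(start, end))['Body'].read()
    data = decomp.decompress(part)  # decompressed bytes for this piece
    # split `data` into lines and json-decode as in ParseStreamingZip.parse
    # (a trailing partial line must be carried over to the next piece)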
obj = ParseStreamingZip(response_obj)
parsed_data = obj.parse()  # only returns the first ~5 MB of lines
df = pd.DataFrame(parsed_data)
df.info()  # summary ends with the memory usage line
>>> memory usage: 3.8+ MB
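What I think I'm ultimately after is something like the generator below, which pushes the body's iter_chunks() through one bz2.BZ2Decompressor and yields records one at a time, so the DataFrame can be built in bounded batches. This is an untested sketch: iter_chunks and BZ2Decompressor are standard botocore/stdlib calls, but the batch size of 100,000 is arbitrary and it assumes the object is a single bz2 stream:

import bz2
import json

import pandas as pd

def stream_records(body, chunk_size=1024 * 1024 * 5, dec='utf8'):
    "Yield decoded JSON records from a bz2-compressed StreamingBody."
    decomp = bz2.BZ2Decompressor()  # assumes a single bz2 stream
    buf = b''
    for chunk in body.iter_chunks(chunk_size):
        buf += decomp.decompress(chunk)
        *complete, buf = buf.split(b'\n')  # keep any trailing partial line
        for line in complete:
            if line:
                yield json.loads(line.decode(dec))
    if buf:  # last line, if the file doesn't end with a newline
        yield json.loads(buf.decode(dec))

# build the DataFrame in bounded batches instead of all at once
batch, batch_size = [], 100000
for record in stream_records(streaming_body):
    batch.append(record)
    if len(batch) >= batch_size:
        df = pd.DataFrame(batch)
        # ... process / persist df here ...
        batch.clear()
if batch:
    df = pd.DataFrame(batch)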