
I am trying to read an x-bzip2 file from a StreamingBody object. The file is too large to fit in memory, so I need to decompress it, read lines and process them in chunks.

I have managed to read a specified chunk size worth of bytes (5 MB, 10 MB); however, I need to read the whole file, not just the first chunk. How can I do that?

Class that reads one chunk_size:

import bz2
import json

class ParseStreamingZip(object):
    def __init__(self, obj, chunk_size=(1024*1024*5)):
        "Chunk size in bytes (default: 5 MB)."
        self.meta = obj
        self.chunk_size = chunk_size
        self.streaming_obj = obj['Body']
        self.streaming_obj.set_socket_timeout(9999999)

    def parse(self, byte_size=None, dec='utf8') -> list:
        "Opens the bz2 stream in binary, reads roughly `byte_size` of lines and JSON-decodes each with encoding `dec`."
        byte_size = byte_size if byte_size is not None else self.chunk_size
        output = []
        with bz2.BZ2File(self.streaming_obj,'rb') as f:
            content = f.readlines(byte_size)
            for line in content:
                try:
                    jline = json.loads(line.decode(str(dec)).strip('\n'))
                    output.append(jline)
                except Exception as e:
                    print('Caught: ', e)
                    pass
        return output
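
As written, parse() opens the stream and returns only the first readlines(byte_size) batch. For reference, here is a sketch of a generator variant that would keep reading until the stream is exhausted (the parse_all name is mine, not from the original code); it decodes each batch the same way parse does and yields it:

    def parse_all(self, byte_size=None, dec='utf8'):
        "Yields successive lists of JSON-decoded lines until the stream is exhausted."
        byte_size = byte_size if byte_size is not None else self.chunk_size
        with bz2.BZ2File(self.streaming_obj, 'rb') as f:
            while True:
                content = f.readlines(byte_size)  # roughly `byte_size` of decompressed lines
                if not content:
                    break
                yield [json.loads(line.decode(dec)) for line in content]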

Code that processes only the first chunk_size (5 MB):

filename = 'wls_day-78.bz2'    

# get the data
response_obj = s3.get_object(Bucket=dataset_metadata['bucket'], Key=filename)
print('+ object received.')

# capture object info
content_type = response_obj['ResponseMetadata']['HTTPHeaders']['content-type']
content_length = int(response_obj['ResponseMetadata']['HTTPHeaders']['content-length'])
print('content type & length:', content_type, content_length, "({:.1f} MB)".format(int(content_length)/1024/1024))

streaming_body = response_obj['Body']
streaming_body.set_socket_timeout(999999) # set a long timeout


chunks = 10
chunksize = content_length // chunks
print(chunks,'chunks of size:', chunksize)
print('Leftover bytes: ', content_length - (chunks*chunksize))

>>> + object received.
>>> content type & length: application/x-bzip2 493878811 (471.0MB)
>>> 10 chunks of size: 49387881
>>> Leftover bytes:  1

obj = ParseStreamingZip(response_obj)
parsed_data = obj.parse()
df = pd.DataFrame(parsed_data)
df.info()

>>> memory usage: 3.8+ MB
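
For reference, the hypothetical parse_all generator sketched above could be driven like this, consuming the whole object chunk by chunk and writing each piece out instead of keeping everything in memory (the same idea as the answer below):

obj = ParseStreamingZip(response_obj)
for i, chunk in enumerate(obj.parse_all()):
    chunk_df = pd.DataFrame(chunk)
    # append each decoded chunk to a CSV instead of accumulating it in memory
    chunk_df.to_csv(filename.replace('.bz2', '.csv'), mode='a', index=False, header=False)
    print('chunk', i, 'rows:', len(chunk_df))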




1 Answer


Managed to figure this one out without even using the class above: by appending each chunk's result to a CSV file, you can process a file of any size without running out of memory.

Code for anyone:

import bz2
import json
import ibm_boto3
from ibm_botocore.client import Config
import pandas as pd 

def main(filename: str):
    # open creds to access data
    with open('creds.json', 'r') as f:
        dataset_metadata = json.load(f)
    print('+ creds opened.')

    # create a client instance
    s3 = ibm_boto3.client(
        service_name='s3',
        ibm_api_key_id=dataset_metadata['api_key'],
        ibm_service_instance_id=dataset_metadata['resource_instance_id'],
        ibm_auth_endpoint=dataset_metadata['iam_url'],
        config=Config(signature_version='oauth'),
        endpoint_url=dataset_metadata['url']
    )
    print('+ client created.')

    # get the data
    response_obj = s3.get_object(Bucket=dataset_metadata['bucket'], Key=filename)
    print('+ object received.')
    # capture object info
    content_type = response_obj['ResponseMetadata']['HTTPHeaders']['content-type']
    content_length = int(response_obj['ResponseMetadata']['HTTPHeaders']['content-length'])
    print('content type & length:', content_type, content_length)
    
    streaming_body = response_obj['Body']
    streaming_body.set_socket_timeout(999999) # set a long timeout

    chunks = 10
    chunksize = content_length // chunks
    print(chunks,'chunks of size:', chunksize)
    print('Leftover bytes: ', content_length - (chunks*chunksize))

    # read the streaming_body object with the bz2
    bz_file = bz2.BZ2File(streaming_body, 'rb')

    count = 0
    while True:
        print('processing chunk #', count)
        content = bz_file.readlines(chunksize)
        if not content:
            break

        content_json = [json.loads(line.decode('utf8').strip('\n')) for line in content]
        print('> lines read:', len(content_json))
        print('> file position: ', bz_file.tell())
        # process the `content_json` code goes below...

        # NOTE: EXAMPLE: convert to pandas dataframe and append to a file, show info as we process.
        content_df = pd.DataFrame(content_json)
        print('> dataframe made, shape: ', content_df.shape, ' memory: ', content_df.memory_usage().sum())
        content_df.to_csv(filename.replace('.bz2', '.csv'), mode='a', index=False, header=False)
        print('++ appended to CSV file.')

        del content, content_json # hopefully will free up some space
        count += 1 # iter chunk counter

    print(f'[+] processing completed. Total chunks read: {count} of size {int(chunksize/1024/1024)} MB')
    print("** DONE ** ")


if __name__ == "__main__":
    filename = '<your-filename>'  # e.g. 'wls_day-78.bz2'

    # process the file in chunks
    main(filename)
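
Since every chunk is appended with header=False, the resulting CSV has no header row. It can also be read back in pieces rather than loaded whole; a minimal sketch, assuming a 100,000-row chunk size and the example filename from the question:

import pandas as pd

# iterate over the CSV produced by main() in 100k-row pieces
for chunk in pd.read_csv('wls_day-78.csv', header=None, chunksize=100_000):
    print(chunk.shape)  # process each DataFrame piece here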

