I'm working on an image classification project using the Snapshot Serengeti dataset. The dataset comes with a single very large JSON file (5GB+) that contains for top level keys. I specifically need the values contained in the "images": [{...}, {...}, ...] array for training. The file's too large for me to open directly and read from or to store to a dictionary.
The image entries in the file are formatted like this:
{
"id": "S1/B04/B04_R1/S1_B04_R1_PICT0003",
"file_name": "S1/B04/B04_R1/S1_B04_R1_PICT0003.JPG",
"frame_num": 1,
"seq_id": "SER_S1#B04#1#3",
"width": 2048,
"height": 1536,
"corrupt": false,
"location": "B04",
"seq_num_frames": 1,
"datetime": "2010-07-20 06:14:06"
},
I've tried to loop over the file in 100MB chunks, but the file also has formatting issues (single quotes, NaN values) that need to be addressed first or errors are thrown. The code I tried was below
with open(labels_json) as f:
for chunk in iter(lambda: f.read(100*1024*1024), ""):
data = json.loads(chunk)
As the images are organized into 11 seasons, I tried to instead write the data to 11 separate files that can be loaded individually with the script below, but the cloud storage gets eaten up before even a single season is stored. I'm new to data storage issues like this so there could definitely be an issue in my script that's causing the file to be written inefficiently. Any help would be very much appreciated.
import json
labels_json = annotations_directory + "SS_Labels.json"
get_filename = lambda n : f"SS_labels_S{i}.json"
# Define the 11 output files
seasons = {}
started = {}
for i in range(1, 12):
filename = get_filename(i)
seasons[i] = open(filename, "w")
seasons[i].write('[')
started[i] = False
def seperate_seasons(dir):
line_num = 0
decoder = json.JSONDecoder()
with open(dir, 'r') as labels:
begin_writing = False
buffer = []
id = 1
for line in labels:
if not begin_writing: # Begin writing for the line after "images"
if 'images' in line:
begin_writing = True
else:
line.replace('NaN', 'null') # clean NaN values
line.replace("'", '"') # clean incorrect key values
buffer.append(line.strip()) # add line to buffer
getID = lambda l: int(line.split('"')[3].split('/')[0][1])
if '"id"' in line or "'id'" in line:
previous_id = id
id = getID(line) # get id of object
if line.strip() == '},' or line.strip() == '}': # when the object has finished, write it to the appropriate image folder
label = ','.join(buffer)
if label[-1] != ',':
label += ','
if started[id] == False:
print(f'Beginning Season {id}')
started[id] = True
if id != 1:
seasons[previous_id].write(']')
seasons[previous_id].close()
del seasons[previous_id]
seasons[id].write(label) # add label entry to file
seperate_seasons(labels_json)
# Close all remaining label files
for season in seasons.values():
season.write(']')
season.close()