The image dataset contains approximately 70,000 images, which I split across 6 pickle files, so each pickle dump holds around 11,519 pre-processed images. The image folder is roughly 2.5 GB on disk, yet each pickle file comes out at 6.45 GB.
That is 6.45 * 6 = 38.7 GB in total!
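For context, here is a rough size check of the raw arrays each chunk holds (a back-of-the-envelope sketch; the 224x224 resolution, 3 channels, and float32 dtype come from the script below):

# Approximate size of one chunk of decoded images
# (values taken from the script below: 11519 images, 224x224, 3 channels, 4-byte float32)
images_per_chunk = 11519
bytes_per_image = 224 * 224 * 3 * 4                   # ~0.6 MB per image, uncompressed
print(images_per_chunk * bytes_per_image / 1024**3)   # ~6.46 GiB per pickle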
Is this the expected behavior, or am I doing something wrong? I used the following script to generate the pickle files:
import os
import pickle
import numpy as np
from PIL import Image

n = 11519
images = []
DATADIR = r"path/to/images"

def data_preprocessing(samples, img_size):
    # Decode, resize, and normalize each image to a float32 array in [0, 1]
    img_width, img_height, channels = img_size, img_size, 3
    image_list = [np.array((Image.open(os.path.join(DATADIR, filename)).convert('RGB')).resize((img_width, img_height), Image.ANTIALIAS)) for filename in samples]
    images = np.asarray(image_list).astype('float32') / 255.0
    ds_samples = np.reshape(images, (len(samples), img_width, img_height, channels))
    return ds_samples

def divide_chunks(samples, n):
    # Yield successive n-sized chunks of the file list
    for i in range(0, len(samples), n):
        yield samples[i:i + n]

# Appending files from directory to list
for file in os.listdir(DATADIR):
    images.append(file)

# Creating parts
sub_samples = list(divide_chunks(images, n))

# Saving pickle files
for chunk in range(len(sub_samples)):
    outfile = open("sample_{}.pkl".format(chunk), 'wb')
    pickle.dump(data_preprocessing(sub_samples[chunk], 224), outfile, protocol=4)
    outfile.close()
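For reference, a dumped chunk can be loaded back and its raw in-memory size inspected like this (a minimal sketch, assuming the sample_0.pkl naming used in the script above):

import pickle

# Load one dumped chunk and inspect the array that was pickled
with open("sample_0.pkl", "rb") as infile:
    arr = pickle.load(infile)

print(arr.shape)             # expected: (11519, 224, 224, 3)
print(arr.dtype)             # float32
print(arr.nbytes / 1024**3)  # raw array size in GiB, roughly matching the file size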