I created a pipeline using the tf.data API for reading a dataset of images. The dataset is large and the images are high resolution. Each time I try to read the whole dataset, the computer crashes because the code uses all of the available RAM. I tested the code with about 1,280 images and it works without any error, but when I use the full dataset the process crashes. So I am wondering if there is a way to make tf.data read only one or two batches ahead, not more than that.
This is the code I am using to create the pipeline:
import numpy as np
import tensorflow as tf
from PIL import Image

def decode_img(self, img):
    # Convert to float32 in [0, 1] and resize to the model's input size
    img = tf.image.convert_image_dtype(img, tf.float32, saturate=False)
    img = tf.image.resize(img, size=self.input_dim, antialias=False, name=None)
    return img

def get_label(self, label):
    # One-hot encode the integer class label
    y = np.zeros(self.n_class, dtype=np.float32)
    y[label] = 1
    return y

def process_path(self, file_path, label):
    label = self.get_label(label)
    img = Image.open(file_path)
    width, height = img.size
    # Downscale the image to half its original resolution
    new_height = height // 2
    new_width = width // 2
    newsize = (new_width, new_height)
    img = img.resize(newsize)
    if self.aug_img:
        img = self.policy(img)
    img = self.decode_img(np.array(img, dtype=np.float32))
    return img, label

def create_pip_line(self):
    def _fixup_shape(images, labels):
        # tf.numpy_function loses shape information, so restore it here
        images.set_shape([None, None, 3])
        labels.set_shape([self.n_class])
        return images, labels

    tf_ds = tf.data.Dataset.from_tensor_slices((self.df["file_path"].values, self.df["class_num"].values))
    tf_ds = tf_ds.map(lambda img, label: tf.numpy_function(self.process_path,
                                                           [img, label],
                                                           (tf.float32, tf.float32)),
                      num_parallel_calls=tf.data.experimental.AUTOTUNE)
    tf_ds = tf_ds.map(_fixup_shape)
    if not self.is_val:
        tf_ds = tf_ds.shuffle(len(self.df), reshuffle_each_iteration=True)
    tf_ds = tf_ds.batch(self.batch_size).repeat(self.epoch_num)
    self.tf_ds = tf_ds.prefetch(tf.data.experimental.AUTOTUNE)
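
For reference, this is roughly the kind of change I am imagining based on my reading of the tf.data docs: an explicit prefetch buffer of one or two batches instead of AUTOTUNE, and a bounded shuffle buffer. The buffer sizes here are placeholders I have not tested:

# Sketch only, not tested: cap how far ahead the pipeline reads.
# shuffle() keeps buffer_size decoded images in RAM, so a bounded
# buffer (1000 is a placeholder) instead of len(self.df) should
# lower memory use at the cost of weaker shuffling.
if not self.is_val:
    tf_ds = tf_ds.shuffle(1000, reshuffle_each_iteration=True)
tf_ds = tf_ds.batch(self.batch_size).repeat(self.epoch_num)
# prefetch(2) keeps at most two batches ready ahead of the model,
# instead of letting AUTOTUNE grow the buffer dynamically.
self.tf_ds = tf_ds.prefetch(2)

Is this the right way to limit how much data tf.data keeps in memory, or is there a better approach?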