I am using the following sequence to read image files from disk and feed them into a TF Keras model.
# Build the training input pipeline.
def _samples_from_file(file_id, file_name):
    """Turn one (file_id, file_name) pair into a dataset of its samples.

    The pair is handed to `_get_data_for_dataset` through `tf.py_func`, which
    is declared to return two float32 tensors. Those two tensors are then
    sliced along their first axis so that each slice becomes one element.
    """
    loaded = tf.py_func(
        _get_data_for_dataset,
        [file_id, file_name],
        [tf.float32, tf.float32],
    )
    return tf.data.Dataset.from_tensor_slices(tuple(loaded))


# Stage order: per-file load -> cache -> shuffle -> batch -> repeat -> prefetch.
dataset_train = (
    tf.data.Dataset.from_tensor_slices((file_ids_training, file_names_training))
    .flat_map(_samples_from_file)  # one input file expands into many samples
    .cache()  # placed right after loading, ahead of shuffle/batch
    .shuffle(buffer_size=train_buffer_size)  # shuffles samples, not batches
    .batch(train_batch_size)
    .repeat()  # no count given, so the dataset repeats indefinitely
    .prefetch(1)  # buffer one batch ahead of the consumer
)

# One-shot iterator plus the tensor that yields the next training batch.
dataset_train_iterator = dataset_train.make_one_shot_iterator()
get_train_batch = dataset_train_iterator.get_next()
I have questions about whether this is the optimal sequence. For example, should repeat() come after shuffle() and before batch()? Should cache() come after batch()?