I wrote a generator for Keras that uses Pytables for getting images from an HDF5 file (see code below). It works fine, when calling it like so:
self._model.fit_generator(self.training_generator,
epochs=epochs,
validation_data=self.validation_generator,
verbose=1,
callbacks=[model_checkpoint, tensorboard_callback],
use_multiprocessing=True,
# workers=2 # uncommenting this and using more than 1 worker fails
)
However if I use multiple workers (see the commented line above), I get the error shown below. I suspect, that this is related to multiple threads attempting to access the HDF5 file. However, I thought that Pytables and HDF5 is able to handle this for read-only access. So what am I doing wrong?
Bonus-question: Will this code make sure, that during training the model sees a given sample only once for an epoch as mentioned here under Notes?:
Sequence are a safer way to do multiprocessing. This structure guarantees that the network will only train once on each sample per epoch which is not the case with generators.
This is the error that I get using more than one workers:
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/usr/lib/python3.7/multiprocessing/pool.py", line 121, in worker
result = (True, func(*args, **kwds))
File "/project/path/venv/lib/python3.7/site-packages/keras/utils/data_utils.py", line 401, in get_index
return _SHARED_SEQUENCES[uid][i]
File "/project/path/python_package/python_package/training_generators.py", line 41, in __getitem__
images, masks, weights = self.__data_generation(indexes)
File "/project/path/python_package/python_package/training_generators.py", line 52, in __data_generation
images, labels = self.__get_images(indexes)
File "/project/path/python_package/python_package/training_generators.py", line 79, in __get_images
labels[counter] = self.tables.root['labels'][i, ...]
File "/project/path/venv/lib/python3.7/site-packages/tables/array.py", line 662, in __getitem__
arr = self._read_slice(startl, stopl, stepl, shape)
File "/project/path/venv/lib/python3.7/site-packages/tables/array.py", line 766, in _read_slice
self._g_read_slice(startl, stopl, stepl, nparr)
File "tables/hdf5extension.pyx", line 1585, in tables.hdf5extension.Array._g_read_slice
tables.exceptions.HDF5ExtError: HDF5 error back trace
File "H5Dio.c", line 216, in H5Dread
can't read data
File "H5Dio.c", line 587, in H5D__read
can't read data
File "H5Dchunk.c", line 2276, in H5D__chunk_read
error looking up chunk address
File "H5Dchunk.c", line 3022, in H5D__chunk_lookup
can't query chunk address
File "H5Dbtree.c", line 1047, in H5D__btree_idx_get_addr
can't get chunk info
File "H5B.c", line 341, in H5B_find
unable to load B-tree node
File "H5AC.c", line 1763, in H5AC_protect
H5C_protect() failed
File "H5C.c", line 2565, in H5C_protect
can't load entry
File "H5C.c", line 6890, in H5C_load_entry
Can't deserialize image
File "H5Bcache.c", line 181, in H5B__cache_deserialize
wrong B-tree signature
End of HDF5 error back trace
Problems reading the array data.
"""
This is the code of my generator:
class DataGenerator(keras.utils.Sequence):
'Generates data for Keras'
def __init__(self, pytables_file_path=None, batch_size=32, shuffle=True, image_processor: ImageProcessor = None,
augment_params=None, image_type=None):
'Initialization'
self.batch_size = batch_size
self.image_type = image_type
self.pytable_file_path = pytables_file_path
self.tables = tables.open_file(self.pytable_file_path, 'r')
self.number_of_samples = self.tables.root[self.image_type].shape[0]
self.image_size = self.tables.root[self.image_type].shape[1:]
self.indexes = list(range(self.number_of_samples))
self.shuffle = shuffle
self.image_processor = image_processor
self.on_epoch_end()
self.augment_params = augment_params
def __del__(self):
self.tables.close()
def __len__(self):
'Denotes the number of batches per epoch'
return int(np.floor(self.number_of_samples / self.batch_size))
def __getitem__(self, index):
'Generate one batch of data'
# Generate indexes of the batch
indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
# Generate data
images, masks, weights = self.__data_generation(indexes)
mask_wei_arr = np.concatenate((masks, weights[:, :, :, np.newaxis]), axis=-1)
return (images, mask_wei_arr)
def on_epoch_end(self):
"""Run after each epoch."""
if self.shuffle:
np.random.shuffle(self.indexes) # Shuffle indexes after each epoch
def __data_generation(self, indexes):
'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
images, labels = self.__get_images(indexes)
if self.image_processor:
images = self.__process_images(images)
masks, weights = self.generate_masks_and_weights_from_labels(labels)
if self.augment_params:
[images, masks, weights] = self.augment_data(images, masks, weights)
images = images.astype('float32')
masks_new = masks.astype('float32')
weights_new = weights.astype('float32')
weights_new = weights_new[:, :, :, 0]
return images, masks_new, weights_new
def __process_images(self, images):
for ind, image in enumerate(images):
images[ind, ...] = self.image_processor.process(image)
return images
def __get_images(self, indexes):
images = np.empty((self.batch_size, *self.image_size))
labels = np.empty((self.batch_size, *self.image_size))
for counter, i in enumerate(indexes):
current_image = self.tables.root[self.image_type][i, ...]
images[counter] = current_image
labels[counter] = self.tables.root['labels'][i, ...]
return images, labels
def generate_masks_and_weights_from_labels(self, labels):
pass
max_lbl_val = int(np.max(labels))
edges = np.zeros_like(labels).astype(bool)
masks = np.asarray(labels > 0).astype(float)
weights = np.ones_like(labels)
se_size = 3 # use '3': to get 1 pixel dilation; use '5': to get 2 pixel dilation
structure = np.ones((1, se_size, se_size, 1))
for lbl_ind in range(1, max_lbl_val+1): # iterate over labels
label_mask = labels == lbl_ind
label_dilated_edges = scipy.ndimage.morphology.binary_dilation(label_mask, structure) & ~label_mask
label_eroded_edges = ~scipy.ndimage.morphology.binary_erosion(label_mask, structure) & label_mask
label_edges = np.bitwise_or(label_eroded_edges, label_dilated_edges)
edges = np.bitwise_or(edges, label_edges)
weights[edges] *= 10 # weight the edges more by factor 10
return masks, weights
def augment_data(self, images, masks, weights):
# for index, _ in enumerate(images):
# [images[index, :, :, 0], masks[index, :, :, 0], weights[index, :, :, 0]] = data_augmentation(
# [images[index, :, :, 0], masks[index, :, :, 0], weights[index, :, :, 0]], self.augment_params,
# order=[1, 0, 0])
for index, image in enumerate(images):
image = images[index, ...]
mask = masks[index, ...]
weight = weights[index, ...]
[image, mask, weight] = data_augmentation([image, mask, weight], self.augment_params, order=[1, 0, 0])
# fix, ax = plt.subplots(1, 3, figsize=(5, 15))
# ax[0].imshow(image[:, :, 0])
# ax[1].imshow(mask[:, :, 0])
# ax[2].imshow(weight[:, :, 0])
# plt.show()
images[index, ...] = image
masks[index, ...] = mask
weights[index, ...] = weight
return images, masks, weights