I'm trying to build a tf.data.Dataset instance for a medical image instance segmentation pipeline, but the mask outputs keep getting expanded with an extra dimension of 1 at the front.
def createDataset(df, mode="segment"):
    """ Want this to take in a df if we want to train => output Dataset object
    Return:
        - tf.data.Dataset instance where the image and the mask
          are in [0, 255] range, both with shape OUTPUT_SHAPE.
    """
    assert mode in ["classify", "segment"], "Please set mode as one of ['classify', 'segment']."
    AUTOTUNE = tf.data.AUTOTUNE
    if df is not None:
        folder = df.file_path.iloc[0].split(os.path.sep)[-2]
        files_in_folder = glob.glob(os.path.join(DIR, f"{folder}/*"))
        unique_file_paths = df.file_path.unique()
    # create a tf.data.Dataset instance from only the file_path, then
    # read images and masks/labels if folder isn't test
    if df is None:
        dataset = tf.data.Dataset.from_tensor_slices(glob.glob(os.path.join(DIR, "test/*")))
        dataset = dataset.map(readTFImage, num_parallel_calls=AUTOTUNE)
    elif folder == "train":
        path_list = [path for path in files_in_folder if path in unique_file_paths]
        dataset = tf.data.Dataset.from_tensor_slices(path_list)
        if mode == "segment":
            dataset = dataset.map(
                lambda x: (readTFImage(x), tf.numpy_function(getMask, [x], [tf.float32])),
                num_parallel_calls=AUTOTUNE)
        elif mode == "classify":
            dataset = dataset.map(
                lambda x: (readTFImage(x), tf.numpy_function(getClassificationLabel, [x], [tf.float32])),
                num_parallel_calls=AUTOTUNE)
        # After mapping, readTFImage's output follows OUTPUT_SHAPE, but the masks/labels all come out with
        # shape (None, 1, OUTPUT_SHAPE) for the segmentation masks and (3,) -> (None, 1, 3) for the labels.
    else:
        raise Exception(f"Folder {folder} not found.")
    return dataset
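This is roughly how I am inspecting the shapes (train_df here is just a placeholder for my training DataFrame; everything else uses the same imports and globals as the snippet above):

ds = createDataset(train_df, mode="segment")

# static structure tf.data has inferred for each (image, mask) element after map()
print(ds.element_spec)

# concrete shapes of one mapped element
image, mask = next(iter(ds))
print(tf.shape(image), tf.shape(mask))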
Here the shape goes from OUTPUT_SHAPE to (None, 1, OUTPUT_SHAPE) for the masks, and from (class_number,) to (None, 1, class_number) for the labels. However, if I use the same code but read the masks/labels outside of the tf.data.Dataset, the shapes stay the same before and after going through the pipeline:
def createDataset(df, mode="segment"):
    """ Want this to take in a df if we want to train => output Dataset object
    Return:
        - tf.data.Dataset instance where the image and the mask
          are in [0, 255] range, both with shape OUTPUT_SHAPE.
    """
    assert mode in ["classify", "segment"], "Please set mode as one of ['classify', 'segment']."
    AUTOTUNE = tf.data.AUTOTUNE
    if df is not None:
        folder = df.file_path.iloc[0].split(os.path.sep)[-2]
        files_in_folder = glob.glob(os.path.join(DIR, f"{folder}/*"))
        unique_file_paths = df.file_path.unique()
    # create a tf.data.Dataset instance from only the file_path, then
    # read images and masks/labels if folder isn't test
    if df is None:
        dataset = tf.data.Dataset.from_tensor_slices(glob.glob(os.path.join(DIR, "test/*")))
        dataset = dataset.map(readTFImage, num_parallel_calls=AUTOTUNE)
    elif folder == "train":
        path_list = [path for path in files_in_folder if path in unique_file_paths]
        # read the masks/labels eagerly in Python, outside of the tf.data pipeline
        if mode == "segment":
            target_data = [getMask(path) for path in path_list]
        elif mode == "classify":
            target_data = [getClassificationLabel(path) for path in path_list]
        dataset = tf.data.Dataset.from_tensor_slices((path_list, target_data))
        dataset = dataset.map(
            lambda x, y: (readTFImage(x), y),
            num_parallel_calls=AUTOTUNE)
    else:
        raise Exception(f"Folder {folder} not found.")
    return dataset
Here the shape stays (None, OUTPUT_SHAPE) for the masks and (None, class_number) for the labels. I have tested getMask and getClassificationLabel separately and can confirm that they work properly, with no change in dimensions. Both functions look up the file path in a DataFrame to retrieve the masks and labels, since the masks are run-length encoded.
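For reference, getMask is roughly along these lines (a simplified sketch: df, file_path, annotation, HEIGHT and WIDTH are placeholders for what my DataFrame and constants actually hold):

import numpy as np

def getMask(path):
    # tf.numpy_function hands the path over as bytes, so decode it first
    if isinstance(path, bytes):
        path = path.decode("utf-8")
    rows = df[df.file_path == path]        # all annotated instances for this image
    mask = np.zeros((HEIGHT, WIDTH), dtype=np.float32)
    flat = mask.reshape(-1)                # a view into mask, so the writes below fill it
    for rle in rows.annotation:            # one run-length-encoded string per instance
        runs = np.asarray(rle.split(), dtype=np.int64)
        starts, lengths = runs[0::2] - 1, runs[1::2]   # (start, length) pairs with 1-indexed starts
        for start, length in zip(starts, lengths):
            flat[start:start + length] = 1.0
    return mask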