Memory Leak with tf.Dataset

Question

I have built a pipeline that reads my data from files into a tf.data.Dataset. The issue is that memory accumulates with each epoch, and after a while the training process is killed. I have tried reducing the number of images in the shuffle buffer and tweaking the number of parallel calls, but nothing has helped.

On iteration 1 the memory consumption is ~8 GB and after 10 epochs it is ~15 GB.

This is what my pipeline looks like:

Edit: I tried this instead:

def getDataset(data_root_path: Path, is_training: bool) -> tf.data.Dataset:
    """Build one shuffled, batched dataset from every sub-directory of a root.

    For each immediate sub-directory of ``data_root_path`` the files
    ``annotations.json`` and ``classes.json`` are read once and handed to
    ``createDataset`` together with the directory itself. The per-directory
    datasets are then concatenated in the order the directories were listed.

    Args:
        data_root_path: Directory whose immediate sub-directories are loaded.
        is_training: Forwarded to ``createDataset`` to select the split.

    Returns:
        The concatenated dataset, shuffled with a 1000-element buffer,
        batched in groups of 8 and prefetched.

    Raises:
        IndexError: If ``data_root_path`` contains no sub-directories.
    """
    # ``sample_dir`` instead of ``dir`` so the builtin is not shadowed.
    sample_dirs = [entry for entry in data_root_path.iterdir() if entry.is_dir()]
    datasets = []
    for sample_dir in sample_dirs:
        annotation_path = sample_dir / "annotations.json"
        annotation = tf.io.read_file(str(annotation_path))
        classes_path = sample_dir / "classes.json"
        classes = tf.io.read_file(str(classes_path))
        dataset = createDataset(
            sample_dir,
            annotation,
            classes,
            tf.cast(str(annotation_path), dtype=tf.string),
            is_training=is_training,
        )
        datasets.append(dataset)

    if not datasets:
        # Same exception type an empty ``datasets[0]`` lookup would raise,
        # but with a message that points at the actual cause.
        raise IndexError(f"no dataset directories found under {data_root_path}")

    result = datasets[0]
    for extra in datasets[1:]:
        result = result.concatenate(extra)

    # Batch first and prefetch last, so that complete batches (not single
    # elements) are prepared ahead of the consumer.
    return result.shuffle(1000).batch(8).prefetch(tf.data.AUTOTUNE)

That worked for almost 300 epochs, but in the end the memory was still exhausted.

Edit 2: I tried running with LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4 python Python/LineDetection/src/lineextractor.py and it has not crashed yet, but the RAM is almost full. This solution was suggested in: https://github.com/tensorflow/tensorflow/issues/44176#issuecomment-830331981

def getDataset(data_root_path: Path, is_training: bool) -> tf.data.Dataset:
    """Mix the per-directory datasets under a root into one batched dataset.

    Every immediate sub-directory of ``data_root_path`` is turned into its
    own dataset via ``createDataset``; the results are combined with
    ``tf.data.Dataset.sample_from_datasets``, shuffled with a 100-element
    buffer and batched in groups of 8.

    Args:
        data_root_path: Directory whose immediate sub-directories are loaded.
        is_training: Forwarded to ``createDataset`` to select the split.

    Returns:
        The sampled, shuffled and batched dataset.
    """

    def load_folder(folder):
        # Read both JSON files once per folder and pass their raw contents on.
        annotation_file = folder / "annotations.json"
        class_file = folder / "classes.json"
        return createDataset(
            folder,
            tf.io.read_file(str(annotation_file)),
            tf.io.read_file(str(class_file)),
            tf.cast(str(annotation_file), dtype=tf.string),
            is_training=is_training,
        )

    folders = [entry for entry in data_root_path.iterdir() if entry.is_dir()]
    per_folder = [load_folder(folder) for folder in folders]

    mixed = tf.data.Dataset.sample_from_datasets(per_folder)
    shuffled = mixed.shuffle(100)
    return shuffled.batch(8)


def createDataset(
    dir: Path,
    annotation: tf.string,
    classes: tf.string,
    annotation_path: tf.string,
    is_training: bool,
) -> tf.data.Dataset:
    """Create the image/line dataset for a single data directory.

    Image files are collected from ``dir / "images"`` using glob patterns for
    the png, jpg and jpeg extensions in lower and upper case. Files are
    filtered by ``is_in_split``, loaded with ``create_image_and_annotation``,
    resized with ``resize_image`` and finally passed through
    ``rescale_to_image_size``.

    Args:
        dir: Directory that contains the ``images`` folder.
        annotation: Raw annotation JSON, forwarded to the loader.
        classes: Raw classes JSON, forwarded to the loader.
        annotation_path: Path of the annotation file, forwarded to the loader.
        is_training: Selects which split of the files is kept.

    Returns:
        The unbatched dataset produced by the mapping steps above.
    """
    images_folder = dir / "images"
    extensions = ("png", "PNG", "jpg", "JPG", "jpeg", "JPEG")
    glob_patterns = [str(images_folder / f"*.{ext}") for ext in extensions]

    def belongs_to_split(path):
        return is_in_split(path, is_training)

    def load_sample(path):
        return create_image_and_annotation(
            path, annotation, classes, annotation_path
        )

    files = tf.data.Dataset.list_files(glob_patterns)
    selected = files.filter(belongs_to_split)
    samples = selected.map(load_sample, num_parallel_calls=1)
    resized = samples.map(resize_image, num_parallel_calls=1)
    return resized.map(rescale_to_image_size, num_parallel_calls=1)


def create_image_and_annotation(
    image_path: tf.string,
    annotation: tf.string,
    classes: tf.string,
    annotation_path: tf.string,
) -> Annotation:
    """Decode one image file and build its line tensor.

    The file name is taken as the last ``/``-separated component of
    ``image_path`` and its suffix as the last ``.``-separated component of
    that name. Files whose suffix is ``jpg``, ``JPG``, ``jpeg`` or ``JPEG``
    are decoded as JPEG; every other file is decoded as PNG. The line tensor
    is produced by running ``create_lines`` through ``tf.py_function``.

    Args:
        image_path: Scalar string tensor holding the path of the image file.
        annotation: Raw annotation JSON, forwarded to ``create_lines``.
        classes: Raw classes JSON, forwarded to ``create_lines``.
        annotation_path: Path of the annotation file, forwarded to
            ``create_lines``.

    Returns:
        An ``Annotation`` built from the decoded 3-channel image and the
        float32 line tensor.
    """
    bits = tf.io.read_file(image_path)
    image_name = tf.strings.split(image_path, "/")[-1]
    suffix = tf.strings.split(image_name, ".")[-1]

    is_jpeg = [
        tf.math.equal(suffix, extension)
        for extension in ("jpg", "JPG", "jpeg", "JPEG")
    ]
    if tf.math.reduce_any(is_jpeg):
        image_shape = tf.io.extract_jpeg_shape(bits)
        image = tf.io.decode_jpeg(bits, channels=3)
    else:
        image = tf.io.decode_png(bits, channels=3)
        # Take the shape straight from the decoded tensor. Re-encoding the
        # image as JPEG only to read its header yields the same
        # [height, width, 3] int32 values at the cost of a full encode.
        image_shape = tf.shape(image)
    lines = tf.py_function(
        create_lines,
        inp=[annotation, image_name, classes, image_shape, annotation_path],
        Tout=tf.float32,
    )
    return Annotation(image, lines)


def create_lines(
    annotation: tf.string,
    image_name: tf.string,
    classes: tf.string,
    image_shape: tf.Tensor,
    annotation_path: tf.string,
) -> tf.Tensor:
    """Build the normalized line table for one image.

    Called with eager tensors (``.numpy()`` is used on every input except
    ``annotation_path``). Both JSON documents are parsed on every call.

    Each annotated instance of the image is mapped through the class-id
    translation table, its points are reordered with ``reorder_points``,
    scaled and offset according to ``rescaled_size(image_shape)`` and divided
    by ``parameters.input_shape``. Instances labelled ``head_side`` or
    ``leg_side`` are collected and written to consecutive rows starting at
    their ``row_order`` entry; every other label is written to its own
    ``row_order`` row.

    Args:
        annotation: Annotation JSON; indexed by image name, each entry
            providing an ``"instances"`` list with ``"points"`` and
            ``"classId"``.
        image_name: File name used as the key into the annotation JSON.
        classes: Classes JSON; a sequence of entries that each have an
            ``"id"``.
        image_shape: Shape of the source image, passed to ``rescaled_size``.
        annotation_path: Only printed when an unknown class id is found.

    Returns:
        A float32 tensor of shape ``(parameters.nbr_of_lines, 5)``. Columns
        0-3 hold the normalized points and column 4 is 1 for rows that were
        filled in; all other rows stay zero.
    """
    annotation_json = json.loads(annotation.numpy())
    key_py = image_name.numpy().decode("utf-8")
    im_shape = image_shape.numpy()
    class_json = json.loads(classes.numpy())

    # Map the class ids used in the annotation file to their position in
    # the classes file.
    translate_classes_id = {
        entry["id"]: index for index, entry in enumerate(class_json)
    }
    lines = np.zeros((parameters.nbr_of_lines, 5))
    present_labels = getPresentLabels(annotation_json, key_py, translate_classes_id)
    head_sides = []
    leg_sides = []

    if key_py in annotation_json:
        # Scale and offset depend only on the image shape, so they are
        # computed once instead of once per instance.
        (w, h, scale) = rescaled_size(im_shape)
        offset = (
            (parameters.input_shape[0] - h) / 2,
            (parameters.input_shape[1] - w) / 2,
        )
        # ``instance`` rather than ``annotation`` so the parameter of the
        # same name is not shadowed.
        for instance in annotation_json[key_py]["instances"]:
            points = instance["points"]
            if instance["classId"] not in translate_classes_id:
                # Report and skip unknown class ids. No debugger breakpoint
                # here: it would stall the input pipeline.
                tf.print("Invalid class Id: {}".format(instance["classId"]))
                tf.print(annotation_path)
                continue
            label = label_order[translate_classes_id[instance["classId"]]]
            points = reorder_points(points, label, present_labels)
            normalized_points = [
                (points[0] * scale + offset[1]) / parameters.input_shape[1],
                (points[1] * scale + offset[0]) / parameters.input_shape[0],
                (points[2] * scale + offset[1]) / parameters.input_shape[1],
                (points[3] * scale + offset[0]) / parameters.input_shape[0],
            ]
            if label == "head_side":
                head_sides.append(normalized_points)
            elif label == "leg_side":
                leg_sides.append(normalized_points)
            else:
                row = row_order[label]
                lines[row, 0:4] = normalized_points
                lines[row, 4] = 1

    # Labels that may occur several times occupy consecutive rows starting
    # at their ``row_order`` entry.
    for k, head_side in enumerate(head_sides):
        lines[k + row_order["head_side"], 0:4] = head_side
        lines[k + row_order["head_side"], 4] = 1

    for k, leg_side in enumerate(leg_sides):
        lines[k + row_order["leg_side"], 0:4] = leg_side
        lines[k + row_order["leg_side"], 4] = 1

    return tf.convert_to_tensor(lines, tf.float32)


def resize_image(annotation: Annotation):
    """Resize the sample's image, with padding, to the configured input size.

    The image is passed to ``tf.image.resize_with_pad`` with
    ``parameters.input_shape[0]`` as target height, ``parameters.input_shape[1]``
    as target width and bilinear interpolation. The lines are passed through
    unchanged.

    Args:
        annotation: Sample providing ``image`` and ``lines``.

    Returns:
        A new ``Annotation`` with the resized image and the original lines.
    """
    target_height = parameters.input_shape[0]
    target_width = parameters.input_shape[1]
    padded = tf.image.resize_with_pad(
        annotation.image,
        target_height,
        target_width,
        method=ResizeMethod.BILINEAR,
    )
    return Annotation(padded, annotation.lines)


def is_in_split(image_path: tf.string, is_training: bool) -> bool:
    """Tell whether an image path belongs to the requested split.

    The path is hashed into one of 10 buckets with
    ``tf.strings.to_hash_bucket_fast``. Buckets 0-7 form the training split
    and buckets 8-9 form the other split.

    Args:
        image_path: Path of the image file.
        is_training: ``True`` to test for the training split, ``False`` for
            the complementary one.

    Returns:
        The result of comparing the bucket index against 8.
    """
    bucket = tf.strings.to_hash_bucket_fast(image_path, 10)
    return bucket < 8 if is_training else bucket >= 8


def rescale_points(tf_pts) -> tf.Tensor:
    """Scale normalized line coordinates up to the configured input size.

    Columns 0 and 2 are multiplied by ``parameters.input_shape[1]`` and
    columns 1 and 3 by ``parameters.input_shape[0]``. Rows whose column 4 is
    0 get -100 in their first four columns.

    Args:
        tf_pts: Eager tensor with at least five columns; ``.numpy()`` is
            called on it.

    Returns:
        A float32 constant tensor holding the rescaled values.
    """
    pts = tf_pts.numpy()
    height = parameters.input_shape[0]
    width = parameters.input_shape[1]
    # Coordinate columns alternate between the two axes.
    for column, factor in enumerate((width, height, width, height)):
        pts[:, column] *= factor
    # Rows flagged with 0 in column 4 are overwritten with a sentinel value.
    pts[pts[:, 4] == 0, 0:4] = -100
    return tf.constant(pts, dtype=tf.float32)


def rescale_to_image_size(annotation: Annotation) -> Annotation:
    """Convert the sample's lines with ``rescale_points``.

    ``rescale_points`` is run through ``tf.py_function`` on
    ``annotation.lines`` with a float32 output.

    Args:
        annotation: Sample providing ``image`` and ``lines``.

    Returns:
        A plain ``(image, rescaled_lines)`` tuple, not an ``Annotation``
        instance.
    """
    image, lines = annotation.image, annotation.lines
    pixel_lines = tf.py_function(rescale_points, inp=[lines], Tout=tf.float32)
    return (image, pixel_lines)

Memory Leak with tf.Dataset

0 Answers