MultiClass Object Detection and Classification using Fast R-CNN

Question

I'm trying to make model (VGG-16) that uses Fast R-CNN for object detection. In short, I want to find object on image and put bounding box where object is.

I already tried multiple way's of getting that, but all the time I'm getting some error's, basically most of them are with RoiPoolingLayer and loss function's.

Can you guys guide what I'm doing wrong?

So let me introduce you:

This is my code atm:

import pickle
import numpy
import tensorflow
from keras import Input, Model
from keras.initializers.initializers_v1 import RandomNormal
from keras.layers import Flatten, TimeDistributed, Dense, Dropout

from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.optimizers import Adam
from tensorflow.python.keras.regularizers import l2

from data import get_data, get_train_data
from rcnn.config import Config


import tensorflow as tf
from tensorflow.keras.layers import Layer


class RoiPoolingConv(Layer):

    def __init__(self, pool_size, **kwargs):
        self.pool_size = pool_size
        super(RoiPoolingConv, self).__init__(**kwargs)

    def build(self, input_shape):
        self.nb_channels = input_shape[0][3]
        super(RoiPoolingConv, self).build(input_shape)

    def compute_output_shape(self, input_shape):
        return None, None, self.pool_size, self.pool_size, self.nb_channels

    def crop_and_resize(self, image, boxes):
        box_ind = tf.range(tf.shape(boxes)[0])
        box_ind = tf.reshape(box_ind, (-1, 1))
        box_ind = tf.tile(box_ind, [1, tf.shape(boxes)[1]])
        boxes = tf.keras.backend.cast(
            tf.reshape(boxes, (-1, 4)), "float32"
        )
        box_ind = tf.reshape(box_ind, (1, -1))[0]
        result = tf.image.crop_and_resize(image, boxes, box_ind, [self.pool_size, self.pool_size])
        result = tf.reshape(result, (tf.shape(image)[0], -1, self.pool_size, self.pool_size, self.nb_channels))
        return result

    def call(self, x, mask=None):
        assert (len(x) == 2)
        img = x[0]
        rois = x[1]

        print(x)
        print(img)
        print(rois)

        x1 = rois[:, 0]
        y1 = rois[:, 1]
        x2 = rois[:, 2]
        y2 = rois[:, 3]

        boxes = tf.stack([y1, x1, y2, x2], axis=-1)
        print(boxes)
        rs = self.crop_and_resize(img, boxes)
        print(rs)
        return rs

    def get_config(self):
        config = {'pool_size': self.pool_size}
        base_config = super(RoiPoolingConv, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


PROPERTIES = Config()


def prepare_model(
        model_path="model\\FastRCNN.h5"
):
    roi_input = Input(shape=(None, 4), name="input_2")
    model_cnn = tensorflow.keras.applications.VGG16(
        include_top=True,
        weights='imagenet'
    )

    model_cnn.trainable = True
    x = model_cnn.layers[17].output
    x = RoiPoolingConv(7)([x, roi_input])
    x = TimeDistributed(Flatten())(x)

    softmaxhead = Dense(4096, activation='relu', kernel_initializer=RandomNormal(stddev=0.01), kernel_regularizer=l2(0.0005), bias_regularizer=l2(0.0005))(x)
    softmaxhead = Dropout(0.5)(softmaxhead)
    softmaxhead = Dense(4096, activation='relu', kernel_initializer=RandomNormal(stddev=0.01), kernel_regularizer=l2(0.0005), bias_regularizer=l2(0.0005))(softmaxhead)
    softmaxhead = Dropout(0.5)(softmaxhead)
    softmaxhead = Dense(20, activation='softmax', kernel_initializer='zero', name='class_label')(softmaxhead)

    bboxhead = Dense(128, activation='relu')(x)
    bboxhead = Dense(64, activation='relu')(bboxhead)
    bboxhead = Dense(32, activation='relu')(bboxhead)
    bboxhead = Dense(4, activation='sigmoid', name='bounding_box')(bboxhead)

    model_final = Model(inputs=[model_cnn.input, roi_input], outputs=(bboxhead, softmaxhead))
    opt = Adam(learning_rate=0.0001)
    losses = {
        "class_label": PROPERTIES.CLASS_LABEL_LOSSES,
        "bounding_box": PROPERTIES.BOUNDING_BOX_LOSSES
    }
    lossWeights = {
        "class_label": PROPERTIES.LOSS_WEIGHTS,
        "bounding_box": PROPERTIES.LOSS_WEIGHTS
    }

    model_final.compile(
        loss=losses,
        optimizer=opt,
        metrics=["accuracy"],
        loss_weights=lossWeights
    )
    tensorflow.keras.utils.plot_model(
        model_final,
        "model.png",
        show_shapes=True,
        show_layer_names=False,
        rankdir='TB'
    )
    model_final.save(model_path)
    return model_final


def train_RCNN_VGG(path):
    # get voc data
    all_data, classes_count, class_mapping = get_data(path)
    tr_images, tr_labels_rois, tr_bboxes_rois, tr_bboxes_gt = get_train_data(all_data)
    #val_images, val_labels, val_bboxes = get_validation_data(all_data)

    # delete unnecessary data
    del classes_count
    del class_mapping
    del all_data

    # convert to numpy array
    tr_images = numpy.array(tr_images, dtype="float32")
    tr_bboxes_rois = numpy.array(tr_bboxes_rois, dtype="float32")
    tr_bboxes_gt = numpy.array(tr_bboxes_gt, dtype="float32")
    tr_labels_rois = numpy.array(tr_labels_rois)
    print(tr_images.shape)
    print(tr_bboxes_rois.shape)
    print(tr_bboxes_gt.shape)
    print(tr_labels_rois.shape)
    # same for validation data
    #val_images = numpy.array(val_images, dtype="float32")
    #val_bboxes = numpy.array(val_bboxes, dtype="float32")
    #val_labels = numpy.array(val_labels)

    # use label binarizer for signing which class/label if for image
    labelBinarizer = LabelBinarizer()
    tr_labels_rois = labelBinarizer.fit_transform(tr_labels_rois)
    #val_labels = labelBinarizer.fit_transform(val_labels)
    classes = len(labelBinarizer.classes_)

    # load model, provide number of classes
    #model_vgg = load_model_or_construct(classes)
    model_vgg = prepare_model()

    # define a dictionary to set the loss methods
    losses = {
        "class_label": PROPERTIES.CLASS_LABEL_LOSSES,
        "bounding_box": PROPERTIES.BOUNDING_BOX_LOSSES
    }

    # define a dictionary that specifies the weights per loss
    lossWeights = {
        "class_label": PROPERTIES.LOSS_WEIGHTS,
        "bounding_box": PROPERTIES.LOSS_WEIGHTS
    }

    # initialize the optimizer, compile the model, and show the model
    opt = Adam(learning_rate=PROPERTIES.LEARNING_RATE)
    model_vgg.compile(loss=losses, optimizer=opt, metrics=["accuracy"], loss_weights=lossWeights)

    # construct a dictionary for our target training outputs, for our target testing
    trainTargets = {
        "class_label": tr_labels_rois,
        "bounding_box": tr_bboxes_gt
    }
    #validationTargets = {
    #    "class_label": val_labels,
    #    "bounding_box": val_bboxes
    #}

    # train the network for bounding box regression and class label
    H = model_vgg.fit(
        [tr_images, tr_bboxes_rois], trainTargets,
    #    validation_data=(val_images, validationTargets),
        batch_size=PROPERTIES.BATCH_SIZE,
        epochs=PROPERTIES.EPOCHS,
        verbose=PROPERTIES.VERBOSE)

    # save model, print summary
    model_vgg.save(PROPERTIES.RCNN_MODEL_NAME, save_format=PROPERTIES.RCNN_MODEL_FORMAT)
    model_vgg.summary()

    # save binarizer
    f = open(PROPERTIES.BINARIZER_NAME, "wb")
    f.write(pickle.dumps(labelBinarizer))
    f.close()


if __name__ == '__main__':
    # load rcnn
    train_RCNN_VGG(PROPERTIES.DATASET_PATH)

I'm creating RoiPooling Layer, VGG-16 architecture, loading pre-trained weights, making my own output layers, cause I have 20 classes (basing on VOC Data from 2012) that's why first output has 20, second has 4 - cause of bounding box's coordinates.

In train method, you can see I'm printing shape's of data I'm delivering, they are:

(1048, 224, 224, 3)
(1048, 4)
(1048, 4)
(1048,)

First one, it's 1048 images of 224x224 rgb Second, it's 1048 rois coordinates prepared for 224x224 Third, it's 1048 ground truth's bboxes Fourth, it's 1048 times 20 labels. Label's are like this: [[0, 0, 0, 0, 0, 0, ... 1, 0, 0,](19's zeros, and one 1 - correct label), [0, ....]]

I was basing on this: https://www.pyimagesearch.com/2020/10/12/multi-class-object-detection-and-bounding-box-regression-with-keras-tensorflow-and-deep-learning/

Currently I have this error:

Traceback (most recent call last):
  File "C:\Users\Karol\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "C:\Users\Karol\anaconda3\lib\site-packages\tensorflow\python\framework\func_graph.py", line 1129, in autograph_handler
    raise e.ag_error_metadata.to_exception(e)
ValueError: in user code:
    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\engine\training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\engine\training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\engine\training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\engine\training.py", line 809, in train_step
        loss = self.compiled_loss(
    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\losses.py", line 141, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\losses.py", line 245, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\losses.py", line 1664, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\backend.py", line 4994, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)
    ValueError: Shapes (None, 20) and (None, None, 20) are incompatible
python-BaseException

So, my question is: What am I missing, is my preprocessing-data incorrect? I'm trying to teach my model recognition 20 classes and pointing where on image this object probably is. But I have to make wrong data delivering I guess.

Just to make something clear, I'm using categorical cross entropy and mean average precision for "class label" and "bounding boxes".

Maybe I'm just using wrong loss function's? Please help.

score 0 · Answer 1 · answered Nov 29 '21 at 18:49

0

Try to use the loss tf.keras.losses.SparseCategoricalCrossEntropy instead, and make sure you have labels in one hot encoding format, for the reasons pointed here:

Getting a ValueError in tensorflow saying that my shapes are incompatible

answered Nov 29 '21 at 18:49

Guinther Kovalski

1,629
1
7
15

1

Well, first of all I want to thank you for your answer, but... It didn't solve my problem or rather solved one thing, but second is still not good. Now I've got an error with with crop_and_resize: ` tensorflow.python.framework.errors_impl.InvalidArgumentError: box_index has incompatible shape [[node model/roi_pooling_conv/CropAndResize (defined at C:/Users/Karol/PycharmProjects/RCNN-Fast/train.py:68) ]] ` I have labels in one hot encoding, here are the shapes: (1048, 224, 224, 3) (1048, 4) (1048, 4) (1048, 20) What am I doing wrong, then? Something with data providing? – Karol E. Mikołajczuk Nov 29 '21 at 21:15
Hi Karol E. Mikołajczuk, this seens to be the issue: https://github.com/matterport/Mask_RCNN/issues/1458 One possible solution is: config = tensorflow.ConfigProto() config.inter_op_parallelism_threads = 1 keras.backend.set_session(tensorflow.Session(config=config)) I would recommend you to accept this answer if it solved the problem you exposed here, and ask other question if you now have other problem, with a different traceback. Try to help you here in the comments will be of no help for others facing the same problems that you have now. – Guinther Kovalski Nov 30 '21 at 12:18

score 0 · Answer 2 · answered Feb 19 '22 at 15:43

The explanation of RoiPooligLayer said that Shape of inputs must be: [(batch_size, pooled_height, pooled_width, n_channels), for featur map and (batch_size, num_rois, 4)] for region of interest but in your work you did not add the batch_size dimension try with this:

model_cnn.trainable = True
x = model_cnn.layers[17].output
x = np.expand_dims(x, axis=0) 
x = RoiPoolingConv(7)([x, roi_input])
x = TimeDistributed(Flatten())(x)

MultiClass Object Detection and Classification using Fast R-CNN

2 Answers2