I'm trying to build a model (VGG-16) that uses Fast R-CNN for object detection. In short, I want to find an object in an image and put a bounding box where the object is.
I have already tried multiple ways of getting this to work, but I keep getting errors, mostly related to the RoI pooling layer and the loss functions.
Can you guys guide me on what I'm doing wrong?
So let me introduce the setup.
This is my code at the moment:
import pickle
import numpy
import tensorflow
from keras import Input, Model
from keras.initializers.initializers_v1 import RandomNormal
from keras.layers import Flatten, TimeDistributed, Dense, Dropout
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.optimizers import Adam
from tensorflow.python.keras.regularizers import l2
from data import get_data, get_train_data
from rcnn.config import Config
import tensorflow as tf
from tensorflow.keras.layers import Layer
class RoiPoolingConv(Layer):
    def __init__(self, pool_size, **kwargs):
        self.pool_size = pool_size
        super(RoiPoolingConv, self).__init__(**kwargs)

    def build(self, input_shape):
        self.nb_channels = input_shape[0][3]
        super(RoiPoolingConv, self).build(input_shape)

    def compute_output_shape(self, input_shape):
        return None, None, self.pool_size, self.pool_size, self.nb_channels

    def crop_and_resize(self, image, boxes):
        box_ind = tf.range(tf.shape(boxes)[0])
        box_ind = tf.reshape(box_ind, (-1, 1))
        box_ind = tf.tile(box_ind, [1, tf.shape(boxes)[1]])
        boxes = tf.keras.backend.cast(
            tf.reshape(boxes, (-1, 4)), "float32"
        )
        box_ind = tf.reshape(box_ind, (1, -1))[0]
        result = tf.image.crop_and_resize(image, boxes, box_ind, [self.pool_size, self.pool_size])
        result = tf.reshape(result, (tf.shape(image)[0], -1, self.pool_size, self.pool_size, self.nb_channels))
        return result

    def call(self, x, mask=None):
        assert (len(x) == 2)
        img = x[0]
        rois = x[1]
        print(x)
        print(img)
        print(rois)
        x1 = rois[:, 0]
        y1 = rois[:, 1]
        x2 = rois[:, 2]
        y2 = rois[:, 3]
        # tf.image.crop_and_resize expects boxes as [y1, x1, y2, x2] in normalized coordinates
        boxes = tf.stack([y1, x1, y2, x2], axis=-1)
        print(boxes)
        rs = self.crop_and_resize(img, boxes)
        print(rs)
        return rs

    def get_config(self):
        config = {'pool_size': self.pool_size}
        base_config = super(RoiPoolingConv, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
PROPERTIES = Config()
def prepare_model(
        model_path="model\\FastRCNN.h5"
):
    roi_input = Input(shape=(None, 4), name="input_2")
    model_cnn = tensorflow.keras.applications.VGG16(
        include_top=True,
        weights='imagenet'
    )
    model_cnn.trainable = True
    x = model_cnn.layers[17].output
    x = RoiPoolingConv(7)([x, roi_input])
    x = TimeDistributed(Flatten())(x)
    softmaxhead = Dense(4096, activation='relu', kernel_initializer=RandomNormal(stddev=0.01), kernel_regularizer=l2(0.0005), bias_regularizer=l2(0.0005))(x)
    softmaxhead = Dropout(0.5)(softmaxhead)
    softmaxhead = Dense(4096, activation='relu', kernel_initializer=RandomNormal(stddev=0.01), kernel_regularizer=l2(0.0005), bias_regularizer=l2(0.0005))(softmaxhead)
    softmaxhead = Dropout(0.5)(softmaxhead)
    softmaxhead = Dense(20, activation='softmax', kernel_initializer='zero', name='class_label')(softmaxhead)
    bboxhead = Dense(128, activation='relu')(x)
    bboxhead = Dense(64, activation='relu')(bboxhead)
    bboxhead = Dense(32, activation='relu')(bboxhead)
    bboxhead = Dense(4, activation='sigmoid', name='bounding_box')(bboxhead)
    model_final = Model(inputs=[model_cnn.input, roi_input], outputs=(bboxhead, softmaxhead))
    opt = Adam(learning_rate=0.0001)
    losses = {
        "class_label": PROPERTIES.CLASS_LABEL_LOSSES,
        "bounding_box": PROPERTIES.BOUNDING_BOX_LOSSES
    }
    lossWeights = {
        "class_label": PROPERTIES.LOSS_WEIGHTS,
        "bounding_box": PROPERTIES.LOSS_WEIGHTS
    }
    model_final.compile(
        loss=losses,
        optimizer=opt,
        metrics=["accuracy"],
        loss_weights=lossWeights
    )
    tensorflow.keras.utils.plot_model(
        model_final,
        "model.png",
        show_shapes=True,
        show_layer_names=False,
        rankdir='TB'
    )
    model_final.save(model_path)
    return model_final
def train_RCNN_VGG(path):
    # get voc data
    all_data, classes_count, class_mapping = get_data(path)
    tr_images, tr_labels_rois, tr_bboxes_rois, tr_bboxes_gt = get_train_data(all_data)
    # val_images, val_labels, val_bboxes = get_validation_data(all_data)
    # delete unnecessary data
    del classes_count
    del class_mapping
    del all_data
    # convert to numpy arrays
    tr_images = numpy.array(tr_images, dtype="float32")
    tr_bboxes_rois = numpy.array(tr_bboxes_rois, dtype="float32")
    tr_bboxes_gt = numpy.array(tr_bboxes_gt, dtype="float32")
    tr_labels_rois = numpy.array(tr_labels_rois)
    print(tr_images.shape)
    print(tr_bboxes_rois.shape)
    print(tr_bboxes_gt.shape)
    print(tr_labels_rois.shape)
    # same for validation data
    # val_images = numpy.array(val_images, dtype="float32")
    # val_bboxes = numpy.array(val_bboxes, dtype="float32")
    # val_labels = numpy.array(val_labels)
    # use label binarizer to one-hot encode which class/label belongs to each image
    labelBinarizer = LabelBinarizer()
    tr_labels_rois = labelBinarizer.fit_transform(tr_labels_rois)
    # val_labels = labelBinarizer.fit_transform(val_labels)
    classes = len(labelBinarizer.classes_)
    # load model, provide number of classes
    # model_vgg = load_model_or_construct(classes)
    model_vgg = prepare_model()
    # define a dictionary to set the loss methods
    losses = {
        "class_label": PROPERTIES.CLASS_LABEL_LOSSES,
        "bounding_box": PROPERTIES.BOUNDING_BOX_LOSSES
    }
    # define a dictionary that specifies the weights per loss
    lossWeights = {
        "class_label": PROPERTIES.LOSS_WEIGHTS,
        "bounding_box": PROPERTIES.LOSS_WEIGHTS
    }
    # initialize the optimizer, compile the model, and show the model
    opt = Adam(learning_rate=PROPERTIES.LEARNING_RATE)
    model_vgg.compile(loss=losses, optimizer=opt, metrics=["accuracy"], loss_weights=lossWeights)
    # construct a dictionary for our target training outputs, for our target testing
    trainTargets = {
        "class_label": tr_labels_rois,
        "bounding_box": tr_bboxes_gt
    }
    # validationTargets = {
    #     "class_label": val_labels,
    #     "bounding_box": val_bboxes
    # }
    # train the network for bounding box regression and class label
    H = model_vgg.fit(
        [tr_images, tr_bboxes_rois], trainTargets,
        # validation_data=(val_images, validationTargets),
        batch_size=PROPERTIES.BATCH_SIZE,
        epochs=PROPERTIES.EPOCHS,
        verbose=PROPERTIES.VERBOSE)
    # save model, print summary
    model_vgg.save(PROPERTIES.RCNN_MODEL_NAME, save_format=PROPERTIES.RCNN_MODEL_FORMAT)
    model_vgg.summary()
    # save binarizer
    f = open(PROPERTIES.BINARIZER_NAME, "wb")
    f.write(pickle.dumps(labelBinarizer))
    f.close()


if __name__ == '__main__':
    # load rcnn
    train_RCNN_VGG(PROPERTIES.DATASET_PATH)
I'm creating a RoI pooling layer on top of the VGG-16 architecture, loading pre-trained weights, and adding my own output layers. I have 20 classes (based on the VOC 2012 data), which is why the first output has 20 units; the second has 4, for the bounding box coordinates.
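For context on the shapes this architecture produces, here is a minimal standalone sketch (my own, not part of the training code; the 7 * 7 * 512 feature size is just my assumption of what the flattened RoI pooling output looks like). Any Dense layer applied after TimeDistributed(Flatten()) keeps the per-RoI axis, so the class head ends up 3-D:

import tensorflow as tf

# Minimal sketch: a Dense layer applied to a (batch, num_rois, features)
# tensor operates on the last axis and keeps the RoI axis.
x = tf.keras.Input(shape=(None, 7 * 7 * 512))  # like the TimeDistributed(Flatten()) output
y = tf.keras.layers.Dense(20, activation="softmax")(x)
print(tf.keras.Model(x, y).output_shape)  # (None, None, 20)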
In the train method, you can see I'm printing the shapes of the data I'm feeding in; they are:
(1048, 224, 224, 3)
(1048, 4)
(1048, 4)
(1048,)
The first is 1048 images of 224x224 RGB. The second is 1048 RoI coordinates prepared for 224x224. The third is 1048 ground-truth bboxes. The fourth is 1048 labels, which become 1048 x 20 after binarization. The labels look like this: [[0, 0, 0, 0, 0, 0, ... 1, 0, 0,] (19 zeros and a single 1 for the correct label), [0, ....]]
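Just to illustrate that encoding, here is a small sketch with made-up class names (not my real VOC labels):

from sklearn.preprocessing import LabelBinarizer

# Toy example of the one-hot encoding described above (class names are made up)
lb = LabelBinarizer()
one_hot = lb.fit_transform(["dog", "cat", "person", "dog"])
print(lb.classes_)  # ['cat' 'dog' 'person']
print(one_hot)      # one row per sample, with a single 1 in the correct column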
I based my approach on this: https://www.pyimagesearch.com/2020/10/12/multi-class-object-detection-and-bounding-box-regression-with-keras-tensorflow-and-deep-learning/
Currently I'm getting this error:
Traceback (most recent call last):
  File "C:\Users\Karol\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "C:\Users\Karol\anaconda3\lib\site-packages\tensorflow\python\framework\func_graph.py", line 1129, in autograph_handler
    raise e.ag_error_metadata.to_exception(e)
ValueError: in user code:

    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\engine\training.py", line 878, in train_function *
        return step_function(self, iterator)
    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\engine\training.py", line 867, in step_function **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\engine\training.py", line 860, in run_step **
        outputs = model.train_step(data)
    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\engine\training.py", line 809, in train_step
        loss = self.compiled_loss(
    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\losses.py", line 141, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\losses.py", line 245, in call **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\losses.py", line 1664, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "C:\Users\Karol\anaconda3\lib\site-packages\keras\backend.py", line 4994, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 20) and (None, None, 20) are incompatible
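The mismatch can be reproduced in isolation (a minimal sketch; I picked the batch size and RoI count arbitrarily to mirror the shapes in the trace):

import tensorflow as tf

# Reproduces the same shape check failure: targets are (batch, 20) while the
# class head emits one prediction per RoI, i.e. (batch, num_rois, 20).
y_true = tf.zeros((8, 20))
y_pred = tf.zeros((8, 5, 20))
tf.keras.losses.categorical_crossentropy(y_true, y_pred)
# ValueError: Shapes (8, 20) and (8, 5, 20) are incompatible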
So, my question is: what am I missing? Is my data preprocessing incorrect? I'm trying to teach my model to recognize 20 classes and to point at where in the image the object probably is, but I suspect I'm delivering the data in the wrong shape.
Just to make something clear: I'm using categorical cross-entropy for "class_label" and mean average precision for "bounding_box".
Maybe I'm just using the wrong loss functions? Please help.