Tensorflow Object Detection model doesnt detect the images in a video

Question

I developed a TensorFlow model with one class (it had a loss of 0.03 and was trained on 680 labelled images.) I am trying to use this model to detect the object on every video frame. However, whenever I run my code, it detects something in the top left of the screen in the black border surrounding the video. I tried changing the model from one trained with mobile net to one with efficientdet D3, and the same issue persisted. I then tried changing my code to require a minisize and a higher score. I have also tried letting it make multiple detections and use the one with the highest score. However, with all of these conditions, it didn't detect anything that fulfilled the requirements. my code is as follows:

import os
import time
import tensorflow as tf
import cv2
import scipy
import math
import pandas as pd
import numpy as np
from PIL import Image
from object_detection.utils import label_map_util
from object_detection.utils import visualization_utils as viz_utils
from base64 import b64encode

#os.chdir("C:\\Users\\Ibrahim\\desktop")
PATH_TO_SAVED_MODEL = "C:/Users/Ibrahim/Desktop/fine_tuned_model/content/fine_tuned_model/saved_model"
# Load label map and obtain class names and ids
#label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
category_index=label_map_util.create_category_index_from_labelmap("C:\\Users\\Ibrahim\\Desktop\\customTF2-20221225T123609Z-001\\customTF2\\data\\label_map.pbtxt",use_display_name=True)

file = "CL_1_S0003.mp4"
video = cv2.VideoCapture(file)
ret,frame=video.read()

#getting the walls bbox
wall_bbox = cv2.selectROI(frame)
(x_wall,y_wall,x2_wall,y2_wall) = wall_bbox
print(wall_bbox)
num_cont_frames=0

#video = cv.VideoCapture(path)
ball_size = 0.22 #diameter of a regulation ball in meters
fps_cam = 10000 # Change this to the required fps of the video 
fps_vid =video.get(cv2.CAP_PROP_FPS)
fps_time= fps_vid / fps_cam 
#print(fps_time)


model = tf.saved_model.load(PATH_TO_SAVED_MODEL)
signature = list(model.signatures.values())[0]

# Initialize variable to track state of ball (in contact with wall or not)
in_contact = False

# Initialize variable to track whether in_contact has ever been True
in_contact_ever = False

# Initialize lists for inbound and outbound velocities
inbound_velocities = []
outbound_velocities = []

# Calculate time interval between frames in seconds
time_interval = 1 / fps_cam

scale  = []
x_list = []
y_list = []
x_def=[]
inbound_x = []
inbound_y = []
outbound_x = []
outbound_y = []
w1=[]
score_thresh = 0.8   # Minimum threshold for object detection
max_detections = 20


while True:
# Read frame from video
    ret, frame = video.read()
    if not ret:
        break

    # Add a batch dimension to the frame tensor
    frame_tensor = tf.expand_dims(frame, axis=0)

    # Get detections for image
    detections = signature(frame_tensor)  # Replace this with a call to your TensorFlow model's predict method
    scores = detections['detection_scores'][0, :max_detections].numpy()
    bboxes = detections['detection_boxes'][0, :max_detections].numpy()
    labels = detections['detection_classes'][0, :max_detections].numpy().astype(np.int64)
    labels = [category_index[n]['name'] for n in labels]
 

    # Initialize variables to keep track of the maximum score and corresponding bounding box
    max_score = 0
    selected_bbox = None

    # Loop through all bounding boxes
    for bbox, score in zip(bboxes, scores):
        # Check if the score is greater than the current maximum score
        if score > max_score:
            # Update the maximum score and corresponding bounding box
            max_score = score
            selected_bbox = bbox
    
    # Check if a bounding box was selected
    if selected_bbox is not None:
        # Extract bounding box coordinates
        (x, y, w, h) = selected_bbox
        
        # Filter out bounding boxes that are too small (smaller than a minimum size)
        if w >= 10 and h >= 10:
            # Draw bounding box on frame
            cv2.rectangle(frame, (int(x), int(y)), (int(x+w), int(y+h)), (0,255,0), 20, 1)
            cv2.imshow('Frame', frame)
            cv2.waitKey(1)
            x2=x+w
            y2=y+h

        
            # Calculate center point of bounding box
            x_center = (x + x2) / 2
            y_center = (y + y2) / 2
            
            # Append x and y center points to lists
            x_list.append(x_center)
            y_list.append(y_center)
            w1.append(w)

            # Calculate other variables and metrics using bbox
            scale.append(ball_size/h)  #meters per pixel.diameter in pixels or coordinate value / real diameter in m to give pixel per m for a scale factor  
            #x_list.append(x2) #list of x positions of right edge
            #y_list.append(y2) 
         
            if (x_center - w) < max(x2_wall, x_wall): #sometimes the bbox is the wrong way around
                # Set in_contact to True
                in_contact = True
                # Set in_contact_ever to True
                in_contact_ever = True
                # Increment counter
                num_cont_frames = num_cont_frames + 1
                x_defe = x2-x2_wall
                x_def.append(x_defe) 
            else: 
                in_contact = False 

            if in_contact == False and in_contact_ever==False:
                inbound_x.append(x_center) #list of x positions at center of ball
                inbound_y.append(y_center) #list of y positions at center of ball  
            
            if in_contact == False and in_contact_ever==True:
                outbound_x.append(x_center) #list of x positions of right edge
                outbound_y.append(x_center)
                print(outbound_x)
        else:
            cv2.putText(frame,'Error',(100,0),cv2.FONT_HERSHEY_SIMPLEX,1,(0,0,255),2)
        cv2.imshow('Tracking',frame)
        if cv2.waitKey(1) & 0XFF==27:
            break
    
cv2.destroyAllWindows()

scale_ave=scipy.stats.trim_mean(scale, 0.2) #trim_mean 20% either way to remove some extrainious results

x_diff=[]
y_diff=[]
x_len=len(x_list)-1 #minus 1 as python starts with 0 so we dont overflow

for i in range(x_len): 
    x_diff.append(x_list[i]-x_list[i+1]) #find x distance per frame


for i in range(x_len): 
    y_diff.append(y_list[i]-y_list[i+1])  #find y distance per frame


pyth_dist=[]
pyth_sub=[]
x2_len=len(x_diff)-1
x_speed=[]
y_speed=[]

for i in range(x2_len):
    x_speeds=x_diff[i]*scale_ave*fps_cam
    x_speed.append(x_speeds)
    y_speeds=y_diff[i]*scale_ave*fps_cam
    y_speed.append(y_speeds)
    pyth_sub=math.hypot(x_diff[i] , y_diff[i])
    pyth_dist.append(pyth_sub) #do pythagoras to find pixel distance per frame

realdist=[]
speed=[]
for i in range(x2_len):
    realdistcalc=(pyth_dist[i]*scale_ave)
    realdist.append(realdistcalc) # change from pixels to meters

for item in realdist:
    if item > 1:
        realdist.remove(item)

distlen=len(realdist)-1



for i in range(distlen):
    speedcalc=realdist[i]*fps_cam
    speed.append(speedcalc)

contact_time=num_cont_frames/fps_cam
print(contact_time)

if x_def:
    realxdef = min(x_def)*scale_ave
else:
    realxdef = 0

print(realxdef)

# Calculate inbound velocities
inbound_x_diff = []
inbound_y_diff = []
# Calculate inbound x- and y-velocities
inbound_x_velocities = []
inbound_y_velocities = []
inbound_len = len(inbound_x) - 1

# Calculate differences between consecutive x and y coordinates
for i in range(inbound_len):
    inbound_x_diff.append(inbound_x[i] - inbound_x[i + 1])
    inbound_y_diff.append(inbound_y[i] - inbound_y[i + 1])

# Calculate inbound velocities in meters per second
inbound_velocities = []
for i in range(inbound_len):
    inbound_x_velocity = inbound_x_diff[i] * scale_ave * fps_cam
    inbound_x_velocities.append(inbound_x_velocity)
    inbound_y_velocity = inbound_y_diff[i] * scale_ave * fps_cam
    inbound_y_velocities.append(inbound_y_velocity)
    inbound_velocity = math.hypot(inbound_x_diff[i], inbound_y_diff[i]) * scale_ave * fps_cam
    inbound_velocities.append(inbound_velocity)

# Calculate outbound velocities
outbound_x_diff = []
outbound_y_diff = []

outbound_len = len(outbound_x) - 1

# Calculate differences between consecutive x and y coordinates
for i in range(outbound_len):
    outbound_x_diff.append(outbound_x[i] - outbound_x[i + 1])
    outbound_y_diff.append(outbound_y[i] - outbound_y[i + 1])

# Calculate outbound velocities in meters per second
outbound_velocities = []
outbound_x_velocities = []
outbound_y_velocities = []

for i in range(outbound_len):
    outbound_x_velocity = outbound_x_diff[i] * scale_ave * fps_cam
    outbound_x_velocities.append(outbound_x_velocity)
    outbound_y_velocity = outbound_y_diff[i] * scale_ave * fps_cam
    outbound_y_velocities.append(outbound_y_velocity)
    outbound_velocity = math.hypot(outbound_x_diff[i], outbound_y_diff[i]) * scale_ave * fps_cam
    outbound_velocities.append(outbound_velocity)e

I expected a bounding box around the ball. I tried changing the model, increasing the number of maximum detections, adding a minimum size to the detections, and increasing the required score for detection.

Tensorflow Object Detection model doesnt detect the images in a video

0 Answers0