I am creating a waste segregation robot. I am implementing a delta robot made from MG946R RC hobby servos and a Raspberry Pi 4B+ 4GB (headless, via VNC Viewer). I am using the official power supply and an acrylic case with a fan. Object detection is run using tflite, with the SSD MobileNet V2 FPNLite 320x320 model. I followed this tutorial to create my object detection code and train the model. The detect.py is mostly an exact copy from the tutorial, except that I included pickup() to move the delta arm and ReleaseServo() to stop the PWM to the servos (as well as some code to calculate FPS). I did as much as I could to make the title specific to the problem.
How the code functions (or is supposed to) / my objective:
- A frame is captured, processed, and object detection is run on it.
- The coordinates of the object's center, as well as its class ID, are sent to the pickup() function.
- The end effector moves to the object, closes the gripper, moves to a designated spot, and opens the gripper (see the interface sketch after this list).
- The process then repeats.
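For context, detect.py assumes roughly this contract for the two functions imported from Delta_Arm_Controller (an interface sketch only, not the real code; the actual implementation is in the repo linked below):

def pickup(xmid, ymid, prev_xmid, prev_ymid, class_id):
    """Move the end effector to (xmid, ymid), the object's centre in
    normalised 0-1 frame coordinates, close the gripper, carry the item
    to the drop zone chosen by class_id, open the gripper, and return
    the pair the caller stores as the new (prev_xmid, prev_ymid)."""
    ...

def ReleaseServo():
    """Stop the PWM signal to the servos."""
    ...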
The problem:
Once an object is placed, the end effector goes to it, closes its gripper, and drops the item at the designated spot. The issue is that the end effector then keeps going back to the object's original location (even though the object is no longer there). The cycle repeats several times before it stops. (To troubleshoot, whenever the pickup() function is called, I print the target location it received and the current coordinates of the end effector.) My assumption is that, perhaps because of the slow frame rate (roughly 1 FPS, decaying to about 0.01), frames pile up in a buffer, older frames are used for detection, and those stale results are sent to the pickup() function. So the arm keeps moving back to the location seen in the buffered old frames?
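To illustrate what I mean by old frames sitting in a buffer: I imagine something like this (untested sketch, not part of my code) would be needed so that detection always runs on the newest frame. cv2.CAP_PROP_BUFFERSIZE is only honoured by some capture backends, hence the grab() loop as a fallback:

cap = cv2.VideoCapture(0)
cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)  # request a 1-frame buffer; some backends ignore this

def read_latest(cap, discard=4):
    # Throw away frames that queued up while detection/pickup was running,
    # then return the newest one.
    for _ in range(discard):
        cap.grab()
    return cap.read()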
The solution I attempted:
To fix this issue I tried to implement multithreading, but then it visually seems to run even slower (although the measured FPS is 0.68). The cv2.putText doesn't work for some reason, and the pickup() function never gets called. (See the sketch after this paragraph for the pattern I was attempting.)
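The pattern I was aiming for is roughly this simplified sketch: a single "latest frame" slot that the capture thread keeps overwriting and the detection thread copies from, so stale frames are dropped rather than queued (the full detect2.py is at the bottom of this post):

import threading

lock = threading.Lock()
latest_frame = None  # single slot holding only the newest frame

def capture_loop(cap):
    # Producer: keep overwriting the slot so consumers never see stale frames
    global latest_frame
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        with lock:
            latest_frame = frame

def detect_loop():
    # Consumer: copy the newest frame out of the slot, then work on the copy
    while True:
        with lock:
            frame = None if latest_frame is None else latest_frame.copy()
        if frame is not None:
            pass  # detect_objects() and pickup() would go here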
This is the git repository for all the code I used; I have only included the detect.py modules in this post, as the problem seems to stem from there as far as I can tell.
The initial detect.py:
# Based on https://github.com/tensorflow/examples/blob/master/lite/examples/object_detection/raspberry_pi/README.md
import re
import cv2
from tflite_runtime.interpreter import Interpreter
import numpy as np
from Delta_Arm_Controller import pickup, ReleaseServo

CAMERA_WIDTH = 640
CAMERA_HEIGHT = 480

def load_labels(path='labels.txt'):
    """Loads the labels file. Supports files with or without index numbers."""
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        labels = {}
        for row_number, content in enumerate(lines):
            pair = re.split(r'[:\s]+', content.strip(), maxsplit=1)
            if len(pair) == 2 and pair[0].strip().isdigit():
                labels[int(pair[0])] = pair[1].strip()
            else:
                labels[row_number] = pair[0].strip()
    return labels
def set_input_tensor(interpreter, image):
    """Sets the input tensor."""
    tensor_index = interpreter.get_input_details()[0]['index']
    input_tensor = interpreter.tensor(tensor_index)()[0]
    input_tensor[:, :] = np.expand_dims((image - 255) / 255, axis=0)

def get_output_tensor(interpreter, index):
    """Returns the output tensor at the given index."""
    output_details = interpreter.get_output_details()[index]
    tensor = np.squeeze(interpreter.get_tensor(output_details['index']))
    return tensor

def detect_objects(interpreter, image, threshold):
    """Returns a list of detection results, each a dictionary of object info."""
    set_input_tensor(interpreter, image)
    interpreter.invoke()
    # Get all output details
    boxes = get_output_tensor(interpreter, 1)
    classes = get_output_tensor(interpreter, 3)
    scores = get_output_tensor(interpreter, 0)
    count = int(get_output_tensor(interpreter, 2))

    results = []
    for i in range(count):
        if scores[i] >= threshold:
            result = {
                'bounding_box': boxes[i],
                'class_id': classes[i],
                'score': scores[i]
            }
            results.append(result)
    return results
def main():
    # Initialize frame rate calculation
    frame_rate_calc = 1
    freq = cv2.getTickFrequency()
    labels = load_labels()
    # Start timer (for calculating frame rate)
    t1 = cv2.getTickCount()
    # var to save the last position
    prev_xmid = 0
    prev_ymid = 0
    interpreter = Interpreter('detect.tflite')
    interpreter.allocate_tensors()
    _, input_height, input_width, _ = interpreter.get_input_details()[0]['shape']
    cap = cv2.VideoCapture(0)
    while cap.isOpened():
        ret, frame = cap.read()
        img = cv2.resize(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), (320, 320))
        res = detect_objects(interpreter, img, 0.8)
        print(len(res))
        if len(res) > 0:
            print(res[0]['bounding_box'])
            print("entered loop")
            ymin, xmin, ymax, xmax = res[0]['bounding_box']
            xmid = (xmin + xmax) / 2
            ymid = (ymin + ymax) / 2
            prev_xmid, prev_ymid = pickup(xmid, ymid, prev_xmid, prev_ymid, res[0]['class_id'])
            #print(f'X={xmid} Y={ymid}')
        for result in res:
            ymin, xmin, ymax, xmax = result['bounding_box']
            xmin = int(max(1, xmin * CAMERA_WIDTH))
            xmax = int(min(CAMERA_WIDTH, xmax * CAMERA_WIDTH))
            ymin = int(max(1, ymin * CAMERA_HEIGHT))
            ymax = int(min(CAMERA_HEIGHT, ymax * CAMERA_HEIGHT))
            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 3)
            cv2.putText(frame, labels[int(result['class_id'])], (xmin, min(ymax, CAMERA_HEIGHT - 20)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2, cv2.LINE_AA)
        # Draw framerate in corner of frame
        cv2.putText(frame, 'FPS: {0:.2f}'.format(frame_rate_calc), (30, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 2, cv2.LINE_AA)
        cv2.imshow('Pi Feed', frame)
        # Calculate framerate
        t2 = cv2.getTickCount()
        time1 = (t2 - t1) / freq
        frame_rate_calc = 1 / time1
        if cv2.waitKey(10) & 0xFF == ord('q'):
            ReleaseServo()
            cap.release()
            cv2.destroyAllWindows()

if __name__ == "__main__":
    main()
The detect2.py code which uses multithreading:
# Based on https://github.com/tensorflow/examples/blob/master/lite/examples/object_detection/raspberry_pi/README.md
import re
import cv2
from tflite_runtime.interpreter import Interpreter
import numpy as np
from Delta_Arm_Controller import pickup, ReleaseServo, sleep
import threading

CAMERA_WIDTH = 640
CAMERA_HEIGHT = 480

# Create a lock for synchronization
lock = threading.Lock()
# Variable to indicate if the threads should continue running
running = True
labels = None
frame = None
res = None
cap = None
capture_stat = False
# Initialize the frame buffer
frame_buffer = None

def load_labels(path='labels.txt'):
    """Loads the labels file. Supports files with or without index numbers."""
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        labels = {}
        for row_number, content in enumerate(lines):
            pair = re.split(r'[:\s]+', content.strip(), maxsplit=1)
            if len(pair) == 2 and pair[0].strip().isdigit():
                labels[int(pair[0])] = pair[1].strip()
            else:
                labels[row_number] = pair[0].strip()
    return labels
def set_input_tensor(interpreter, image):
    """Sets the input tensor."""
    tensor_index = interpreter.get_input_details()[0]['index']
    input_tensor = interpreter.tensor(tensor_index)()[0]
    input_tensor[:, :] = np.expand_dims((image - 255) / 255, axis=0)

def get_output_tensor(interpreter, index):
    """Returns the output tensor at the given index."""
    output_details = interpreter.get_output_details()[index]
    tensor = np.squeeze(interpreter.get_tensor(output_details['index']))
    return tensor

def detect_objects(interpreter, image, threshold):
    """Returns a list of detection results, each a dictionary of object info."""
    set_input_tensor(interpreter, image)
    interpreter.invoke()
    # Get all output details
    boxes = get_output_tensor(interpreter, 1)
    classes = get_output_tensor(interpreter, 3)
    scores = get_output_tensor(interpreter, 0)
    count = int(get_output_tensor(interpreter, 2))

    results = []
    for i in range(count):
        if scores[i] >= threshold:
            result = {
                'bounding_box': boxes[i],
                'class_id': classes[i],
                'score': scores[i]
            }
            results.append(result)
    return results
# Function to capture the frames
def capture_frame():
    global cap
    global frame_buffer
    global running
    global frame_buffer2
    global capture_stat
    try:
        cap = cv2.VideoCapture(0)
        while cap.isOpened() and running:
            ret, frame = cap.read()
            if ret:
                with lock:
                    frame_buffer = frame.copy()
                    capture_stat = True
            else:
                break
    except Exception as e:
        print(f"Error in capture_frame: {e}")
    finally:
        if cap is not None:
            cap.release()

# Function to perform object detection
def perf_obj_det():
    global frame_buffer
    global running
    interpreter = Interpreter('detect.tflite')
    interpreter.allocate_tensors()
    _, input_height, input_width, _ = interpreter.get_input_details()[0]['shape']
    #print(type(frame_buffer))
    while running and frame_buffer is not None:
        with lock:
            img = cv2.resize(cv2.cvtColor(frame_buffer, cv2.COLOR_BGR2RGB), (320, 320))
            res = detect_objects(interpreter, img, 0.8)
# Function to operate the delta arm
def robot_arm():
    global res
    global running
    # var to save the last position
    prev_xmid = 0
    prev_ymid = 0
    while running:
        with lock:
            if res is not None and len(res) > 0:
                #print(res[0]['bounding_box'])
                ymin, xmin, ymax, xmax = res[0]['bounding_box']
                xmid = (xmin + xmax) / 2
                ymid = (ymin + ymax) / 2
                prev_xmid, prev_ymid = pickup(xmid, ymid, prev_xmid, prev_ymid, res[0]['class_id'])
# Function to display the frames with overlays
def display_feed():
    global res
    global labels
    global running
    global frame_buffer
    frame_count = 0
    frame_rate = 0.00
    start_time = cv2.getTickCount()
    #print(type(frame_buffer))
    while running:
        with lock:
            if frame_buffer is not None:
                frame_to_display = frame_buffer.copy()
            else:
                frame_to_display = np.zeros((CAMERA_HEIGHT, CAMERA_WIDTH, 3), dtype=np.uint8)
            if res is not None:
                print("there is a result")
                for result in res:
                    ymin, xmin, ymax, xmax = result['bounding_box']
                    xmin = int(max(1, xmin * CAMERA_WIDTH))
                    xmax = int(min(CAMERA_WIDTH, xmax * CAMERA_WIDTH))
                    ymin = int(max(1, ymin * CAMERA_HEIGHT))
                    ymax = int(min(CAMERA_HEIGHT, ymax * CAMERA_HEIGHT))
                    cv2.rectangle(frame_to_display, (xmin, ymin), (xmax, ymax), (0, 255, 0), 3)
                    print("box printing")
                    cv2.putText(frame_to_display, labels[int(result['class_id'])], (xmin, min(ymax, CAMERA_HEIGHT - 20)),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2, cv2.LINE_AA)
                    print("item name printing")
        # Display the frame with visualizations
        cv2.imshow('Pi Feed', frame_to_display)
        # Calculate and display the FPS
        frame_count += 1
        if frame_count >= 10:  # Calculate FPS every 10 frames
            end_time = cv2.getTickCount()
            time_diff = (end_time - start_time) / cv2.getTickFrequency()
            frame_rate = frame_count / time_diff
            print(f'FPS: {frame_rate:.2f}')
            # Draw framerate in corner of frame
            cv2.putText(frame_to_display, 'FPS: {0:.2f}'.format(frame_rate), (30, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 2, cv2.LINE_AA)
            frame_count = 0
            start_time = end_time
        cv2.putText(frame_to_display, 'FPS: {0:.2f}'.format(frame_rate), (30, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 2, cv2.LINE_AA)
        key = cv2.waitKey(10) & 0xFF
        if key == ord('q'):
            running = False
            break
def main():
    try:
        # Create and start the threads
        capture_thread = threading.Thread(target=capture_frame)
        detection_thread = threading.Thread(target=perf_obj_det)
        robot_thread = threading.Thread(target=robot_arm)
        display_thread = threading.Thread(target=display_feed)
        # Wait until the first frame is captured and frame_buffer is initialised, not read as NoneType
        capture_thread.start()
        while capture_stat == False:
            sleep(0.0001)
        if capture_stat == True:
            display_thread.start()
            detection_thread.start()
            robot_thread.start()
        # Wait for threads to finish
        capture_thread.join()
        detection_thread.join()
        display_thread.join()
        robot_thread.join()
    except Exception as e:
        # Handle exceptions at the highest level (e.g., log the error)
        print(f"Error in main: {e}")
    finally:
        global running
        running = False  # Set the running flag to False to stop all threads gracefully
        if cap is not None:
            cap.release()  # Release the video capture resource
        # Release the robot arm and resources (e.g., camera) when finished
        ReleaseServo()
        cv2.destroyAllWindows()

if __name__ == "__main__":
    main()