how to make object detection with voice feedback where the voice is saying the object name just once without repeating

Question

I'm making an object detection using Opencv and yolo and voice feedback that can say any object seen by the camera using pyttsx3 library , the problem is that when there is an object in the frame like "person " it keep saying person without stopping , what i want is to say "person" just one time and when another object came into the frame like "bottle" it says pottle just one time. I'm struggling in this problem for many days now.

import cv2
import numpy as np
import pyttsx3

net = cv2.dnn.readNet('yolov3-tiny.weights', 'yolov3-tiny.cfg')

classes = []
with open("coco.names.txt", "r") as f:
   classes = f.read().splitlines()

cap = cv2.VideoCapture(0)
font = cv2.FONT_HERSHEY_PLAIN
colors = np.random.uniform(0, 255, size=(100, 3))

while True:
    _, img = cap.read()
    height, width, _ = img.shape

    blob = cv2.dnn.blobFromImage(img, 1/255, (416, 416), (0,0,0), swapRB=True, crop=False)
    net.setInput(blob)
    output_layers_names = net.getUnconnectedOutLayersNames()
    layerOutputs = net.forward(output_layers_names)

    boxes = []
    confidences = []
    class_ids = []

    for output in layerOutputs:
        for detection in output:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > 0.2:
                center_x = int(detection[0]*width)
                center_y = int(detection[1]*height)
                w = int(detection[2]*width)
                h = int(detection[3]*height)

                x = int(center_x - w/2)
                y = int(center_y - h/2)

                boxes.append([x, y, w, h])
                confidences.append((float(confidence)))
                class_ids.append(class_id)

    indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.2, 0.4)

    if len(indexes)>0:
        for i in indexes.flatten():
            x, y, w, h = boxes[i]
            label = str(classes[class_ids[i]])
            confidence = str(round(confidences[i],2))
            color = colors[i]
            cv2.rectangle(img, (x,y), (x+w, y+h), color, 2)
            cv2.putText(img, label + " " + confidence, (x, y+20), font, 2, (255,255,255), 2)
            engine = pyttsx3.init()
            engine.say(label)
            engine.runAndWait()

    cv2.imshow('Image', img)
    key = cv2.waitKey(1)
    if key==27:
        break
cap.release()
cv2.destroyAllWindows()

One way would be to keep track of the objects seen last frame, and don't say any objects which appear in the last frame. Instead of saying each object, you would be saying only new objects. — Nick ODell, Oct 30 '22 at 19:44
So how to catch just the new object in the last frame in variable — Aya Mohammed, Oct 30 '22 at 20:59
You could define a variable `prev_frame_labels = [str(classes[class_ids[i]]) for i in indexes.flatten()]`, after the `cv2.imshow('Image', img)` line, and use the `in` operator to check whether a label is present in this list. — Nick ODell, Oct 30 '22 at 23:07

how to make object detection with voice feedback where the voice is saying the object name just once without repeating

0 Answers0