
I have code that detects objects and announces them via speech. However, when several objects are detected in the same frame, the voice output gets mixed up and starts saying all the object names at once, which makes no sense to the user. I'm developing this application for the visually impaired, so I want to announce one object's name, finish speaking it, and only then move on to the next object.

Here is the code:

import UIKit
import AVKit
import Vision
import CoreML
import AVFoundation

class ViewController: UIViewController, AVCaptureVideoDataOutputSampleBufferDelegate {

    @IBOutlet weak var innerView: UIView!
    @IBOutlet weak var viewLable: UILabel!
    
    var previewLayer: AVCaptureVideoPreviewLayer?
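
    // Build the Core ML model once; creating it on every frame is expensive
    lazy var visionModel: VNCoreMLModel? = try? VNCoreMLModel(for: Resnet50().model)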
    
    override func viewDidLoad() {
        super.viewDidLoad()

        updateLabel(newLabel: "new label")
        
        //Start the Camera
        let captureSession = AVCaptureSession()
        captureSession.sessionPreset = .photo
    
        // get back camera as Video Capture Device
        guard let captureDevice = AVCaptureDevice.default(for: .video)
            else { self.quickErr(myLine: #line,inputStr: "") ; return }
        
        try? captureDevice.lockForConfiguration()
        // Throttle capture to ~2 frames per second so Vision isn't flooded
        captureDevice.activeVideoMinFrameDuration = CMTimeMake(value: 1, timescale: 2)
        captureDevice.activeVideoMaxFrameDuration = CMTimeMake(value: 1, timescale: 2)
        captureDevice.unlockForConfiguration()
        
        guard let input = try? AVCaptureDeviceInput(device: captureDevice)
            else { self.quickErr(myLine: #line,inputStr: "") ; return }
        captureSession.addInput(input)
        
        self.previewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
        self.previewLayer?.videoGravity = .resizeAspectFill
        self.previewLayer?.frame = self.innerView.bounds
        self.innerView.layer.addSublayer(self.previewLayer!)
    
        //get access to video frames
        let dataOutput = AVCaptureVideoDataOutput()
        dataOutput.setSampleBufferDelegate(self, queue: DispatchQueue(label: "VideoQueue"))
        captureSession.addOutput(dataOutput)

        // Start the session only after inputs and outputs are configured
        captureSession.startRunning()
        
    }
    
    override func viewDidAppear(_ animated: Bool) {
        super.viewDidAppear(animated)
        self.previewLayer?.frame = self.innerView.bounds
    }
    
    func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
        
        guard let pixelBuffer: CVPixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer)
            else { self.quickErr(myLine: #line,inputStr: "") ; return }
        guard let model = self.visionModel
            else { self.quickErr(myLine: #line,inputStr: "") ; return }
        let request = VNCoreMLRequest(model: model) { (finishedReq, err) in
            guard let results = finishedReq.results as? [VNClassificationObservation]
                else { self.quickErr(myLine: #line,inputStr: "") ; return }
            guard let firstObservation = results.first
                else { self.quickErr(myLine: #line,inputStr: "") ; return }

            var myMessage = ""
            var myConfidence = 0

            if firstObservation.confidence > 0.2 {
                myConfidence = Int(firstObservation.confidence * 100)
                // The identifier can be a comma-separated list of names; use the first
                let myIdentifier = firstObservation.identifier.split(separator: ",")
                myMessage = "I am \(myConfidence)% confident this object is: \(myIdentifier[0])"
            } else {
                myMessage = "I am not confident about this object"
            }

            print(myMessage)
            self.updateLabel(newLabel: myMessage)
            if myConfidence >= 70 {
                self.readyMe(myText: myMessage, myLang: "en-US")
            }
        }

        // Analyze the frame
        try? VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:]).perform([request])
    }

    func readyMe(myText: String, myLang: String) {
        let utterance = AVSpeechUtterance(string: myText)
        utterance.voice = AVSpeechSynthesisVoice(language: myLang)
        utterance.rate = 0.5

        // A new synthesizer is created on every call
        let synthesizer = AVSpeechSynthesizer()
        synthesizer.speak(utterance)
    }
    
    
    func quickErr(myLine: Int, inputStr: String = "") {
        print("===> Guard Error \(inputStr) :\n    file:\(#file)\n    line:\(myLine)\n    function:\(#function) ")
    }
    
    func updateLabel(newLabel: String) {
        // UI updates must happen on the main thread
        DispatchQueue.main.async {
            self.viewLable?.text = "[ " + newLabel + " ]"
        }
    }
    
}
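
What I've been considering (untested sketch below; SpeechQueue and announce are just names I made up): readyMe creates a brand-new AVSpeechSynthesizer on every call, so several synthesizers can end up speaking at the same time. A single shared synthesizer queues utterances one after another, and an isSpeaking check could skip new announcements while one is still in progress:

import AVFoundation

// Sketch: one shared synthesizer so announcements are spoken one at a time.
// AVSpeechSynthesizer queues utterances passed to speak(_:), so keeping a
// single instance alive should already prevent overlapping voices.
final class SpeechQueue {
    static let shared = SpeechQueue()
    private let synthesizer = AVSpeechSynthesizer()

    // Announce only when nothing is currently being spoken, so a burst of
    // detections doesn't build up a long backlog of stale announcements.
    func announce(_ text: String, language: String = "en-US") {
        guard !synthesizer.isSpeaking else { return }
        let utterance = AVSpeechUtterance(string: text)
        utterance.voice = AVSpeechSynthesisVoice(language: language)
        utterance.rate = 0.5
        synthesizer.speak(utterance)
    }
}

The idea would be to call SpeechQueue.shared.announce(myMessage) from captureOutput instead of readyMe. Is this the right way to serialize the speech, or is there a better pattern for announcing one detection at a time?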