
Is there a way to pause and resume speech recognition in webkitSpeechRecognition while the computer is playing audio? At the moment the recognizer seems to confuse user input coming from the microphone with audio output played from a WAV file.

Right now I have the following:

var speechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
var recognition = new speechRecognition();

window.addEventListener('DOMContentLoaded', function() {
    document.getElementById("speak_button").addEventListener('click', function() {
        recognition.start();
        setInterval(updateCountDown, 1000); /* countdown timer starts 1 second after
                                               being clicked */
        updateCountDown(); // this function counts down from 2 minutes to 0
    });
});


var transcript; // transcript variable will store what the user says to the computer

recognition.addEventListener('result', e => {
    transcript = Array.from(e.results)
       .map(result => result[0])
       .map(result => result.transcript)
       .join('');
   console.log(transcript);
    communicateToUser();
  
});


function communicateToUser() {

    var audio_age = new Audio("age_20.wav");

    var age_regular_expression = /(?=.*\bhow\b)(?=.*\bold\b)(?=.*\byou\b)|(?=.*\bgrab\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\btell\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bshare\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bshare\b)(?=.*\bhow\b)(?=.*\bold\b)|(?=.*\byou\b)(?=.*\bhow\b)(?=.*\bold\b)/gi;

    // if the regular expression matches all of the words, the reply below is played

    if (age_regular_expression.test(transcript)) {
        recognition.stop();  /* wanting the speech recognition to stop here so that it
                                doesn't capture the contents of audio_age */
        audio_age.play();    // audio will play "I am 20 years old"
        recognition.start(); /* wanting the speech recognition to start again
                                after audio_age is played */
    }

}
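
As an aside on the pattern: each `(?=.*\bword\b)` lookahead only asserts that the word occurs somewhere in the transcript, so the words can appear in any order. A small standalone example, kept separate from the page script (and without the `g` flag, since `.test()` becomes stateful when `g` is set):

var ask_age = /(?=.*\bhow\b)(?=.*\bold\b)(?=.*\byou\b)/i; // one branch of the pattern above

console.log(ask_age.test("how old are you"));                    // true
console.log(ask_age.test("you never told me how old you are"));  // true - word order doesn't matter
console.log(ask_age.test("what is your name"));                  // false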

The problem is that the recognition.stop() call isn't working, which means the microphone keeps capturing the contents of audio_age.wav and converting it to text. So when I want to speak to the computer again and ask it a question, the transcript that gets analysed still includes the transcript from what was said just before.

Any advice would be appreciated.

I was thinking of a possible solution, but I'm not sure how to implement it:
SOLUTION: stop recognition, delay for roughly as long as the audio file plays (for example 5 seconds), and then start recognition again once those 5 seconds are up.
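
Something along these lines is what I have in mind, except that instead of a hard-coded delay the restart could probably be tied to the clip's `ended` event (a rough, untested sketch reusing `recognition` and `audio_age` from above):

if (age_regular_expression.test(transcript)) {
    recognition.stop();                          // stop listening before the reply plays
    audio_age.addEventListener('ended', function() {
        recognition.start();                     // resume listening once the clip has finished
    }, { once: true });                          // remove this handler after it runs once
    audio_age.play();
}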

Thanks!

EDIT FOR CESARE:

// SPEECH RECOGNITION SET UP 

    var speechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
    var recognition = new speechRecognition();


    window.addEventListener('DOMContentLoaded', function() {
        document.getElementById("speak_button").addEventListener('click', function() {
            recognition.start();
            setInterval(updateCountDown, 1000);
            updateCountDown();
        });
    });

// ALL OF THE AUDIO FILES --> WILL BE PLAYED IF REGEX MATCHES TRUE
    
    const audio_name = new Audio("name_harry.wav");
    
    const audio_age = new Audio("age_20.wav");
    
    const audio_date_of_birth = new Audio("15_nov_1999.wav");
    
    const audio_occupation = new Audio("grocery_store.wav");


// ON SPEECH START --> WHEN SPEECH IS DETECTED ON THE MICROPHONE, PAUSE ANY AUDIO THAT IS PLAYING
    
    recognition.onspeechstart = () => {
        console.log("SPEECH STARTED");
        if (!audio_age.paused) {audio_age.pause()}
        else if (!audio_name.paused) {audio_name.pause()}
        else if (!audio_date_of_birth.paused) {audio_date_of_birth.pause()}
        else if (!audio_occupation.paused) {audio_occupation.pause()}
  
    };
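
// A variant suggested in the comments on the answer below (sketch, untested):
// pause every clip rather than only the first one found playing. The handler
// above could call this helper instead of the if/else chain.

    function pauseAllReplies() {
        [audio_name, audio_age, audio_date_of_birth, audio_occupation].forEach((clip) => {
            if (!clip.paused) clip.pause();
        });
    }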
    
// ON SPEECH END --> WHEN MICROPHONE INPUT STOPS, SPEECH RECOGNITION SHOULD END 

    recognition.onspeechend = () => {
        console.log("SPEECH ENDED");
        recognition.stop();
    
    };
    
// I have included this because I want the computer to continue listening to the user, but only after the audio is finished playing 

    recognition.addEventListener('end', recognition.start);

// After audio is ended, speech recognition will start again
    
    audio_name.addEventListener('ended', recognition.start);
    audio_age.addEventListener('ended', recognition.start);
    audio_date_of_birth.addEventListener('ended', recognition.start);
    audio_occupation.addEventListener('ended', recognition.start);
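
// Sketch (untested): passing recognition.start directly as a listener here, combined
// with the 'end' listener above, is likely what triggers the "recognition has already
// started" error mentioned in the comments below. Funnelling every restart through a
// small guard is one way around it:

    function safeStartRecognition() {
        try {
            recognition.start();
        } catch (err) {
            console.warn(err.message); // ignore "recognition has already started"
        }
    }
    // e.g. audio_age.addEventListener('ended', () => safeStartRecognition());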
    
    
// USED TO OBTAIN THE USER TRANSCRIPT/ACTUAL SPEECH CONTENT

    var transcript;
    
    recognition.addEventListener('result', e => {
        transcript = Array.from(e.results)
           .map((result) => result[0])
           .map((result) => result.transcript)
           .join('');
       console.log(transcript);
       communicateToUser();
      
    });
    
 




// ALL OF THE REGULAR EXPRESSIONS

    const name_regex = /what is your name|(?=.*\byour\b)(?=.*\bfull\b)(?=.*\bname\b)|(?=.*\btell\b)(?=.*\bme\b)(?=.*\byour\b)(?=.*\bname\b)|(?=.*\bcan\b)(?=.*\btell\b)(?=.*\bme\b)(?=.*\byour\b)(?=.*\bname\b)|(?=.*\btell\b)(?=.*\bme\b)(?=.*\byour\b)(?=.*\bfull\b)(?=.*\bname\b)|(?=.*\blet\b)(?=.*\bknow\b)(?=.*\bfull\b)(?=.*\bname\b)|(?=.*\bgrab\b)(?=.*\byour\b)(?=.*\bname\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bname\b)|(?=.*\bshare\b)(?=.*\bme\b)(?=.*\bfull\b)(?=.*\bname\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bfirst\b)(?=.*\band\b)(?=.*\blast\b)(?=.*\bname\b)/ig;

    const age_regex = /(?=.*\bhow\b)(?=.*\bold\b)(?=.*\byou\b)|(?=.*\bgrab\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\btell\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bshare\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bshare\b)(?=.*\bhow\b)(?=.*\bold\b)|(?=.*\byou\b)(?=.*\bhow\b)(?=.*\bold\b)/gi;

    const date_of_birth_regex = /(?=.*\bdate\b)(?=.*\bof\b)(?=.*\bbirth\b)|(?=.*\byour\b)(?=.*\bdate\b)(?=.*\bof\b)(?=.*\bbirth\b)|(?=.*\bshare\b)(?=.*\byour\b)(?=.*\bdate\b)(?=.*\bof\b)(?=.*\bbirth\b)|(?=.*\bshare\b)(?=.*\bdate\b)(?=.*\bof\b)(?=.*\bbirth\b)|(?=.*\bwhen\b)(?=.*\byou\b)(?=.*\bborn\b)|(?=.*\bwhat\b)(?=.*\bdate\b)(?=.*\byou\b)(?=.*\bborn\b)/gi;

    const occupation_regex = /do you have a job|(?=.*\bdo\b)(?=.*\byou\b)(?=.*\bwork\b)|(?=.*\byou\b)(?=.*\bhave\b)(?=.*\bjob\b)|(?=.*\byou\b)(?=.*\bwork\b)(?=.*\bwhere\b)|(?=.*\banything\b)(?=.*\bfor\b)(?=.*\bwork\b)|(?=.*\byou\b)(?=.*\bwork\b)(?=.*\banywhere\b)|(?=.*\bwhat\b)(?=.*\boccupation\b)|(?=.*\byour\b)(?=.*\boccupation\b)|(?=.*\byou\b)(?=.*\boccupation\b)|(?=.*\byour\b)(?=.*\bjob\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bjob\b)|(?=.*\byou\b)(?=.*\bjob\b)|(?=.*\bjob\b)/ig;

// COMMUNICATE BACK TO USER FUNCTION
 

function communicateToUser() {

    if (name_regex.test(transcript)) {
        audio_name.play();
    }
    if (age_regex.test(transcript)) {
        audio_age.play();
    }
    if (date_of_birth_regex.test(transcript)) {
        audio_date_of_birth.play();
    }
    if (occupation_regex.test(transcript)) {
        audio_occupation.play();
    }

}
         

The updateCountDown function:

function updateCountDown() {

    const minutes = Math.floor(time / 60);
    let seconds = time % 60;

    seconds = seconds < 10 ? '0' + seconds : seconds; // pad single-digit seconds

    document.getElementById("countdown").innerHTML = `${minutes}:${seconds}`;

    time--;

    time = time < 0 ? 0 : time;

    if (minutes == 0 && seconds == 0) {
        document.getElementById('tableStyle').style.display = "block";
        recognition.stop(); // ADDING IN RECOGNITION.STOP ONCE MINUTES AND SECONDS == 0!
    }

}
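
As suggested in the comments below, if the isListening flag from the answer is adopted, ending the session for good once the countdown reaches zero could look roughly like this (a sketch; `interval` and `isListening` are the variables declared in the answer's code):

function endSession() {
    isListening = false;      // keeps the answer's 'end' handler from restarting recognition
    recognition.abort();      // abort() discards the audio instead of returning a final result
    clearInterval(interval);  // stop the countdown updates
    document.getElementById('tableStyle').style.display = "block";
}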
sperfume

1 Answer


EDIT:

I made a working example, https://stackblitz.com/edit/web-platform-ppcuh9?file=index.html:

let isListening = false; // use this flag to toggle the recognition
let interval;
const button = document.getElementById('speak_button');

// MakeSpeechSynth is a small wrapper around the browser's SpeechSynthesis API;
// it is defined in the linked StackBlitz, not part of the Web Speech API itself.
const speaker = new MakeSpeechSynth({
  pitch: 0.5,
  rate: 0.8,
  language: 'en-US',
});

const SpeechRecognition =
  window.SpeechRecognition || window.webkitSpeechRecognition;
const recognition = new SpeechRecognition();

button.addEventListener('click', function() {
  if (isListening) {
    console.log('ABORTING RECOGNITION');
    isListening = false;
    recognition.abort();
    clearInterval(interval);
    button.innerText = 'Click Me To Speak';
  } else {
    console.log('STARTING RECOGNITION');
    recognition.start();
    interval = setInterval(updateCountDown, 1000);
    updateCountDown();
    button.innerText = 'Stop Recognition';
    isListening = true;
  }
});

recognition.onaudiostart = () => {
  console.log('RECOGNITION STARTED');
};

recognition.onaudioend = () => {
  console.log('RECOGNITION FINISHED');
};

// 'end' fires whenever the recognition service disconnects (after a silence
// timeout, stop(), or abort()); restart only while isListening is still set,
// so that the button can actually stop it.
recognition.onend = () => {
  console.log('RECOGNITION DISCONNECTED');
  if (isListening) recognition.start();
};

recognition.onspeechstart = () => {
  console.log('SPEECH STARTED');
  // You can stop the bot speaking if you want when you speak over him:
  // Comment if you want him to keep speaking

  //Object.values(data).forEach((d) => d.audio.pause());
  if (speaker.isSpeaking) speaker.cancel();
};

recognition.onspeechend = () => {
  console.log('SPEECH ENDED');
};

recognition.addEventListener('result', (e) => {
  const transcript = Array.from(e.results)
    .map((result) => result[0])
    .map((result) => result.transcript)
    .join('');
  console.log(transcript);
  speakBackToMe(transcript);
});

function speakBackToMe(str) {
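  // "data" maps each question to a { regex, message, audio } entry; it is defined
  // in the linked StackBlitz example.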
  Object.values(data).forEach((d) => {
    if (d.regex.test(str)) {
      // d.audio.play();
      speaker.speak(d.message);
      console.log(d.message);
    }
  });
}
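
// A variant sketched in the comments: to add an "I'm not sure" fallback, track
// whether any pattern matched and speak the fallback once, outside the loop.
// lastIndex is reset in case the regexes carry the g flag (as the ones in the
// question do), which makes .test() stateful between calls.
function speakBackToMeWithFallback(str) {
  let matched = false;
  Object.values(data).forEach((d) => {
    d.regex.lastIndex = 0;
    if (d.regex.test(str)) {
      matched = true;
      speaker.speak(d.message);
      console.log(d.message);
    }
  });
  if (!matched) speaker.speak("I'm not sure");
}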

// UPDATE COUNTDOWN
const startingMinutes = 2;
let time = startingMinutes * 60;

function updateCountDown() {
  const minutes = Math.floor(time / 60);
  let seconds = time % 60;
  seconds = seconds < 10 ? '0' + seconds : seconds;
  document.getElementById('countdown').innerHTML = `${minutes}:${seconds}`;
  time--;
  time = time < 0 ? 0 : time;

  if (minutes == 0 && seconds == 0) {
    document.getElementById('tableStyle').style.display = 'table-cell';
  }
}
<div id="app"></div>
<button id="speak_button">Click Me to Speak</button>
<p id="countdown"></p>
Cesare Polonara
  • Hey! Thanks so much for your help. I seem to be getting the following error message though: `Uncaught DOMException: Failed to execute 'start' on 'SpeechRecognition': recognition has already started. at Audio. (http://127.0.0.1:5500/script1.js:189:60)`. Any thoughts? – sperfume Apr 17 '22 at 07:09
  • I couldn't test the code thoroughly, so I tried to make a working example now; check if that's what you need. – Cesare Polonara Apr 17 '22 at 10:29
  • Hey @Cesare Polonara - this looks really good. I've implemented a lot of your suggestions to fit my code, and thankfully that error I mentioned before isn't coming anymore. But it still seems like the computer doesn't stop recognition after I stop talking and only when the audio finishes (perhaps because the audio being played is in my voice so the computer can't distinguish between microphone input and audio output?). I have made an edit in my original post with some new code - could you see where I could improve on it to achieve what I'm after? Thanks so much for all your help!!!! – sperfume Apr 18 '22 at 00:55
  • I think you are messing up the if/else if in onspeechstart; you should stop all audio files when you start to speak, whereas with if/else if you stop just one audio file. You should not have noise or sound while you are speaking, or of course you will disturb the speech recognition service :) – Cesare Polonara Apr 18 '22 at 01:31
  • That makes sense @Cesare Polonara!! :) What are your thoughts on (1) creating – sperfume Apr 18 '22 at 05:03
  • Hey @Cesare Polonara - would you be able to point me in the right direction of how I might be able to `delay` `start speech recognition` by 500ms or 1000ms after the audio output plays? I think that might ensure that the microphone won't pick up the audio output and only the microphone (my voice) input? Thanks! – sperfume Apr 18 '22 at 05:08
  • If you could fork that stackblitz and place your trials there it would be easier for me to show you! – Cesare Polonara Apr 18 '22 at 11:02
  • here you go :) Link: https://codepen.io/sperfume/pen/RwxEZbq! If you would like to test the code to see what I mean, ask the computer the following questions "What is your name?", "What is your age?", "do you work?" and "What is your date of birth" and what will happen is that the audio output will be picked up by the microphone! (actually I just realised there won't be audio in the codepen!). Let me know if this works for you! – sperfume Apr 18 '22 at 12:15
  • Okay, I made some experiments, and instead of using audio to play files, I put in a utility I wrote some time ago that makes use of Speech Synth to speak messages. I tried to tidy up the code a bit, and used several recognition listeners to increase the control over the recognition life cycle. If I understood correctly, you wished to have an infinite recognition service with a sort of conversational behaviour. I think this could work, check the edit. – Cesare Polonara Apr 18 '22 at 14:31
  • Hey @Cesare Polonara - it still seems like when I try your code, the recognition still continues whilst the speech synthesis is active. I have an else statement in the speech recognition function where if the regular expressions are not matched for any of the conditions, the bot will speak "I'm not sure". And so, the program seems to be in a constant loop of "I'm not sure", "I'm not sure", "I'm not sure", because I didn't use "I'm not sure" as one of the regex so the else statement is true :/ I'll have another crack at trying to resolve this issue and then get back to you in a bit!! :) – sperfume Apr 20 '22 at 00:39
  • In that speech synth class I made, if you notice there are two optional callbacks you may pass, `onStart` and `onEnd`. I think you could just stop and start the recognition in those callbacks if that's a problem for you. On my PC the speech synth doesn't disturb the recognition, but if it does for you, check if this works: https://stackblitz.com/edit/web-platform-ppcuh9?file=script.js – Cesare Polonara Apr 20 '22 at 00:53
  • When I add in the following else statement into the function `function speakBackToMe(str) { Object.values(data).forEach((d) => { if (d.regex.test(str)) { speaker.speak(d.message); console.log(d.message); } else { speaker.speak("I'm not sure"); } }); }` The computer seems to repeat "I'm not sure" on loop for me! The reason why I'm adding in the else statement is just in case the user (me) asks a question (like "who is the president?") that doesn't match any of the regular expressions, so I want to be able to give the user some sort of response! – sperfume Apr 20 '22 at 02:00
  • Because you are putting that inside a forEach loop. You can only place the speaker.speak() inside the loop if you are sure there can be one and only one match; if there could be more than one match, you should place the `speaker.speak()` outside the loop after passing the match you prefer to a flag variable. Check: https://stackblitz.com/edit/web-platform-ppcuh9?file=script.js – Cesare Polonara Apr 20 '22 at 02:40
  • Hey @Cesare Polonara - sorry to bother you again, but I was wondering if there was a way to end speech recognition entirely once my updateCountDown function is finished (ie. once minutes and seconds == 0). I've edited my question with the function and added in my trial (recognition.stop();) if you wanted to take a look at it! Thanks again – sperfume Apr 21 '22 at 10:08
  • Just call recognition.abort() and change isListening to false. – Cesare Polonara Apr 21 '22 at 11:59
  • Hey @Cesare Polonara - I was exploring this a bit more and I was wondering if you knew how I could change the voice of the speech synthesis from a male one to a female one (eg. the Microsoft Zira voice)? Would it involve the statement `this.voice = this.voices.filter((el) => el.lang === this.language)[0]` and changing the 0 to a different number? Thanks! – sperfume Apr 25 '22 at 02:10
  • Hey @Cesare Polonara - don't worry I figured it out!!! In the speak(text, delay) function, I just assigned this.utterance.voice = window.speechSynthesis.getVoices()[number], where number corresponds to where the voice is in the array! :) – sperfume Apr 25 '22 at 23:36
  • Yes right way to do it, good job :) – Cesare Polonara Apr 25 '22 at 23:38
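
A small sketch of what the last few comments describe, using the standard SpeechSynthesis API directly rather than the MakeSpeechSynth helper: pick a specific voice by name and pause recognition while the utterance is speaking. The voice name, the global `recognition` instance and the `isListening` flag come from the discussion above, so treat this as an outline rather than drop-in code:

function speakAndPauseRecognition(text) {
  const utterance = new SpeechSynthesisUtterance(text);

  // getVoices() can be empty until the 'voiceschanged' event has fired at least once
  const voices = window.speechSynthesis.getVoices();
  utterance.voice = voices.find((v) => v.name.includes('Zira')) || voices[0];

  // Stop listening while the bot is talking, resume once it has finished.
  // If the answer's auto-restart in recognition.onend is in place, clear and
  // re-set isListening here as well so the two mechanisms don't fight.
  utterance.onstart = () => recognition.abort();
  utterance.onend = () => recognition.start();

  window.speechSynthesis.speak(utterance);
}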