0

I'm trying to modify my first Python program. I'm trying to use this repository to do some rudimentary text-to-speech. It does fine, but I want to improve it.

From the looks of it, there is a 0.145 second delay between samples played. Not all the samples of my voice will be 0.145 seconds, however, and I want to have each sample play one after the other with no delays or skips.

import re
import wave
import pyaudio
import _thread
import time

class TextToSpeech:
    """Rudimentary text-to-speech that plays one recorded wav per phoneme.

    Words are looked up in a pronunciation dictionary file (by default
    ``cmudict-0.7b.txt``) and every resulting phoneme ``X`` is played from
    ``sounds/X.wav`` on its own thread.
    """

    # Number of frames read from the wav file and written to the stream per step.
    CHUNK = 1024

    # Seconds between the scheduled start times of consecutive phonemes.
    SAMPLE_DELAY = 0.145

    def __init__(self, words_pron_dict:str = 'cmudict-0.7b.txt'):
        """Load the pronunciation dictionary found at *words_pron_dict*."""
        # Maps an upper-case word to its list of phoneme names.
        self._l = {}
        self._load_words(words_pron_dict)

    def _load_words(self, words_pron_dict:str):
        """Fill ``self._l`` from the dictionary file at *words_pron_dict*.

        Each entry is expected as ``WORD<two spaces>PHONEMES``. Lines starting
        with ``;;;`` and lines without the two-space separator (for example
        blank lines) are skipped instead of aborting the whole load.
        """
        with open(words_pron_dict, 'r') as file:
            for line in file:
                if line.startswith(';;;'):
                    continue
                # partition() splits on the first separator only, so a second
                # double space inside the pronunciation cannot break unpacking.
                key, sep, val = line.partition('  ')
                if not sep:
                    continue
                # Only runs of upper-case letters are kept; digits and any
                # other characters in the pronunciation are dropped.
                self._l[key] = re.findall(r"[A-Z]+", val)

    def get_pronunciation(self, str_input):
        """Print the phonemes for *str_input* and schedule their playback.

        The input is upper-cased and split into words; words missing from the
        dictionary are ignored. One thread is started per phoneme, each
        delayed ``SAMPLE_DELAY`` seconds more than the previous one, so this
        method returns before playback has finished.
        """
        list_pron = []
        for word in re.findall(r"[\w']+", str_input.upper()):
            list_pron += self._l.get(word, [])
        print(list_pron)
        for index, pron in enumerate(list_pron):
            delay = index * self.SAMPLE_DELAY
            _thread.start_new_thread(TextToSpeech._play_audio, (pron, delay))

    @staticmethod
    def _play_audio(sound, delay):
        """Wait *delay* seconds, then play ``sounds/<sound>.wav`` to the end.

        Playback stays best-effort: a missing or unreadable sample is reported
        and skipped rather than raising, but the failure is no longer silent.
        """
        time.sleep(delay)
        path = "sounds/" + sound + ".wav"
        try:
            with wave.open(path, 'rb') as wf:
                p = pyaudio.PyAudio()
                try:
                    stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                                    channels=wf.getnchannels(),
                                    rate=wf.getframerate(),
                                    output=True)
                    try:
                        data = wf.readframes(TextToSpeech.CHUNK)
                        while data:
                            stream.write(data)
                            data = wf.readframes(TextToSpeech.CHUNK)
                        stream.stop_stream()
                    finally:
                        # Release the stream even if writing failed part-way.
                        stream.close()
                finally:
                    p.terminate()
        except (OSError, EOFError, ValueError, wave.Error) as err:
            # NOTE(review): assumes PyAudio reports device/stream failures as
            # OSError and unsupported sample widths as ValueError - confirm
            # against the installed PyAudio version.
            print("Could not play {!r}: {}".format(path, err))




if __name__ == '__main__':
    # Interactive loop: read a phrase, speak it, repeat until input ends.
    tts = TextToSpeech()
    while True:
        try:
            phrase = input('Enter a word or phrase: ')
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C at the prompt: leave quietly instead of ending
            # the session with a traceback.
            print()
            break
        tts.get_pronunciation(phrase)

I've tried getting rid of the threading and delay, but there is some delay still between samples. I'm thinking that I should, instead of incrementing delay by 0.145, increment it by the length of the sample in seconds, but I've looked at the pyaudio documentation, and I have no idea how to do that.

Can someone help?

  • The length of a sample can be read with the `wave` module; PyAudio has no way to know the wav length. See the `getnframes` method in the wave module (duration in seconds = `getnframes() / getframerate()`): https://docs.python.org/3/library/wave.html – dkato Dec 06 '17 at 23:16
  • Thanks dkato, but how would I apply this to my code to get each thread to be delayed only as much to play one after the other? I've tried defining self.delay in the class, and modifying that instead of delay, but it's not working. – Josiah Winslow Dec 07 '17 at 00:32

1 Answer

1

Here is a modified version of the code that plays the wav files back to back on a single output stream.

import re
import wave
import pyaudio

class TextToSpeech:
    """Rudimentary text-to-speech that plays phoneme wavs back to back.

    Words are looked up in a pronunciation dictionary file (by default
    ``cmudict-0.7b.txt``) and every resulting phoneme ``X`` is played from
    ``sounds/X.wav`` through one shared output stream, so no per-sample
    delay has to be guessed.
    """

    # Number of frames read from a wav file and written to the stream per step.
    CHUNK = 1024

    def __init__(self, words_pron_dict='cmudict-0.7b.txt'):
        """Load the pronunciation dictionary found at *words_pron_dict*."""
        # Maps an upper-case word to its list of phoneme names.
        self._l = {}
        self._load_words(words_pron_dict)

    def _load_words(self, words_pron_dict: str):
        """Fill ``self._l`` from the dictionary file at *words_pron_dict*.

        Each entry is expected as ``WORD<two spaces>PHONEMES``. Lines starting
        with ``;;;`` and lines without the two-space separator (for example
        blank lines) are skipped instead of aborting the whole load.
        """
        with open(words_pron_dict, 'r') as file:
            for line in file:
                if line.startswith(';;;'):
                    continue
                # partition() splits on the first separator only, so a second
                # double space inside the pronunciation cannot break unpacking.
                key, sep, val = line.partition('  ')
                if not sep:
                    continue
                # Only runs of upper-case letters are kept; digits and any
                # other characters in the pronunciation are dropped.
                self._l[key] = re.findall(r"[A-Z]+", val)

    def get_pronunciation(self, str_input):
        """Print the phonemes for *str_input* and play them in order.

        The input is upper-cased and split into words; words missing from the
        dictionary are ignored. The call blocks until playback has finished.
        No audio device is opened when there is nothing to play.
        """
        list_pron = []
        for word in re.findall(r"[\w']+", str_input.upper()):
            list_pron += self._l.get(word, [])
        print(list_pron)
        if list_pron:
            self._play_sounds(list_pron)

    def _play_sounds(self, list_pron):
        """Play ``sounds/<name>.wav`` for every name in *list_pron*, gaplessly.

        The output stream is opened from the first wav file's own parameters
        and reused while the following files share them; it is reopened only
        if a file's format differs. Unreadable files are reported and skipped.
        """
        p = pyaudio.PyAudio()
        stream = None
        stream_params = None
        try:
            for pron in list_pron:
                path = "sounds/" + pron + ".wav"
                try:
                    wf = wave.open(path, 'rb')
                except (OSError, EOFError, wave.Error) as err:
                    print("Skipping {!r}: {}".format(path, err))
                    continue
                with wf:
                    params = (wf.getsampwidth(), wf.getnchannels(), wf.getframerate())
                    if params != stream_params:
                        # First file, or a file in a different format: the
                        # stream must match the data written to it.
                        if stream is not None:
                            stream.stop_stream()
                            stream.close()
                            stream = None
                        stream = p.open(format=p.get_format_from_width(params[0]),
                                        channels=params[1],
                                        rate=params[2],
                                        output=True,
                                        frames_per_buffer=self.CHUNK)
                        stream_params = params
                    data = wf.readframes(self.CHUNK)
                    while data:
                        stream.write(data)
                        data = wf.readframes(self.CHUNK)
        finally:
            # Always release the audio resources, even if playback failed.
            try:
                if stream is not None:
                    stream.stop_stream()
                    stream.close()
            finally:
                p.terminate()

if __name__ == '__main__':
    # Interactive loop: read a phrase, speak it, repeat until input ends.
    tts = TextToSpeech()
    while True:
        try:
            phrase = input('Enter a word or phrase: ')
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C at the prompt: leave quietly instead of ending
            # the session with a traceback.
            print()
            break
        tts.get_pronunciation(phrase)
dkato
  • 895
  • 10
  • 28