trying to convert audio to text using DeepSpeech, it works fine with the default audio files from Mozilla/DeepSpeech. but when i try to record audio from my PC's microphone and feed it to the model, it raises an error( 'wave.Error: unknown format: 3'). I am using sounddevice library to record audio. I tried to change sample rate and channels number but didn't work. I know that the error is caused by the 'wave' function, but i couldn't figure it out. Please Help
My code:
from deepspeech import Model
import numpy as np
import os
import wave
from playsound import playsound
import sounddevice as sd
from scipy.io.wavfile import write
model_file_path = 'deepspeech-0.8.2-models.pbmm'
lm_file_path = 'deepspeech-0.9.3-models.scorer'
beam_width = 500
lm_alpha = 0.93
lm_beta = 1.18
model = Model(model_file_path)
model.enableExternalScorer(lm_file_path)
model.setScorerAlphaBeta(lm_alpha, lm_beta)
model.setBeamWidth(beam_width)
def read_wav_file(filename):
with wave.open(filename, 'rb') as w:
rate = w.getframerate()
frames = w.getnframes()
buffer = w.readframes(frames)
print("Rate:", rate)
print("Frames:", frames)
print("Buffer Len:", len(buffer))
return buffer, rate
def transcribe_batch(audio_file):
buffer, rate = read_wav_file(audio_file)
data16 = np.frombuffer(buffer, dtype=np.int16)
return model.stt(data16)
fs = 48000 # Sample rate
seconds = 5 # Duration of recording
myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=1)
sd.wait() # Wait until recording is finished
write('output.wav', fs, myrecording) # Save as WAV file
playsound('output.wav')
trans = transcribe_batch('output.wav')
print("trancribed message: ", trans)
with open('subtitle.txt', 'w') as f:
f.write(trans)
print("executed successfully")
Error:
TensorFlow: v2.3.0-6-g23ad988fcd
DeepSpeech: v0.9.3-0-gf2e9c858
2021-11-14 00:57:24.539394: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Traceback (most recent call last):
File "C:/Users/Lenovo/PycharmProjects/deepspeech-wenv/test.py", line 48, in <module>
trans = transcribe_batch('output.wav')
File "C:/Users/Lenovo/PycharmProjects/deepspeech-wenv/test.py", line 35, in transcribe_batch
buffer, rate = read_wav_file(audio_file)
File "C:/Users/Lenovo/PycharmProjects/deepspeech-wenv/test.py", line 23, in read_wav_file
with wave.open(filename, 'rb') as w:
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python36\lib\wave.py", line 499, in open
return Wave_read(f)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python36\lib\wave.py", line 163, in __init__
self.initfp(f)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python36\lib\wave.py", line 143, in initfp
self._read_fmt_chunk(chunk)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python36\lib\wave.py", line 260, in _read_fmt_chunk
raise Error('unknown format: %r' % (wFormatTag,))
wave.Error: unknown format: 3