I am working with VOSK to recognise some speech. Until now I have just needed the final output and could get this from the below:
def text_from_audio_orig(path, file, lang, location):
    """Transcribe a WAV file with VOSK and return the recognised text.

    Side effects: changes the process working directory to the VOSK
    folder for ``location`` (the model is loaded relative to it) and
    pretty-prints the collected results.

    Args:
        path: Prefix of the audio file's location. It is concatenated with
            ``file`` as-is, so it must already end with a path separator.
        file: Name of the WAV file to transcribe.
        lang: Language of the recording. Only ``"English"`` is supported.
        location: Sub-directory of the hard-coded VOSK folder that holds
            the ``model`` directory.

    Returns:
        A list of strings: the ``"text"`` field of every result produced
        while ``AcceptWaveform`` returned true, followed by the ``"text"``
        of the final result.

    Raises:
        ValueError: If ``lang`` is not ``"English"``.
    """
    # Fail early and clearly: no recognizer is configured for any other
    # language, so continuing would only hit an unbound ``rec`` later on.
    if lang != "English":
        raise ValueError("Unsupported language: {!r}".format(lang))

    os.chdir("D:/OneDrive/DataSci/Tennis/02_Preprocessing/Voice/VSOK/" + location)
    model = Model("model")

    results = []
    # The context manager guarantees the audio file is closed, even if
    # recognition raises part-way through.
    with wave.open(path + file, "rb") as wf:
        # The third argument is a JSON list of phrases passed to the
        # recognizer as its grammar.
        rec = KaldiRecognizer(model, wf.getframerate(),
                              '["fifteen","love", "mistake" ]')
        while True:
            data = wf.readframes(4000)
            if not data:
                break
            if rec.AcceptWaveform(data):
                results.append(json.loads(rec.Result())['text'])
        # Collect whatever is still buffered once the audio is exhausted.
        results.append(json.loads(rec.FinalResult())['text'])

    pprint.pprint(results)
    return results
This returns a list, which I added as a column to a dataframe. Now, however, I want to refine the output and work with the words and their probabilities. According to the docs, I can get these by removing the ["text"] in the last 4 lines of code.
def text_from_audio_v4(path, file, lang, location):
    """Transcribe a WAV file with VOSK and return the full result objects.

    Unlike a text-only transcription, each returned item is the complete
    dictionary parsed from the recognizer's JSON output, so word-level
    details can be inspected by the caller.

    Side effects: changes the process working directory to the VOSK
    folder for ``location`` (the model is loaded relative to it) and
    pretty-prints the collected results.

    Args:
        path: Prefix of the audio file's location. It is concatenated with
            ``file`` as-is, so it must already end with a path separator.
        file: Name of the WAV file to transcribe.
        lang: Language of the recording. Only ``"English"`` is supported.
        location: Sub-directory of the hard-coded VOSK folder that holds
            the ``model`` directory.

    Returns:
        A list of dicts: one per result produced while ``AcceptWaveform``
        returned true, followed by the final result. Entries with
        recognised words are expected to carry a ``"result"`` list of
        per-word dicts next to ``"text"`` (per the VOSK docs; verify
        against the installed version).

    Raises:
        ValueError: If ``lang`` is not ``"English"``.
    """
    # Fail early and clearly: no recognizer is configured for any other
    # language, so continuing would only hit an unbound ``rec`` later on.
    if lang != "English":
        raise ValueError("Unsupported language: {!r}".format(lang))

    os.chdir("D:/OneDrive/DataSci/Tennis/02_Preprocessing/Voice/VSOK/" + location)
    model = Model("model")

    results = []
    # The context manager guarantees the audio file is closed, even if
    # recognition raises part-way through.
    with wave.open(path + file, "rb") as wf:
        # The third argument is a JSON list of phrases passed to the
        # recognizer as its grammar.
        rec = KaldiRecognizer(model, wf.getframerate(),
                              '["fifteen","love", "mistake" ]')
        # Newer VOSK releases only include per-word details when asked to.
        # The guard keeps releases without this method working; presumably
        # those always include the details.
        if hasattr(rec, "SetWords"):
            rec.SetWords(True)
        while True:
            data = wf.readframes(4000)
            if not data:
                break
            if rec.AcceptWaveform(data):
                results.append(json.loads(rec.Result()))
        # Collect whatever is still buffered once the audio is exhausted.
        results.append(json.loads(rec.FinalResult()))

    pprint.pprint(results)
    return results
However, the output is now in a format that I am really struggling to interpret. I don't think it is pure JSON, though I know VOSK does output JSON. I think I am doing something wrong in creating "results".
What I would like is a dataframe where conf, end, start and word are columns, and rows are added to it iteratively. Can you help me? If you want to reproduce my results on the examples I have, the audio files are here
UPDATE:
I have a solution but it requires iterating over data which I know is inefficient. Any improved solutions are welcome.
# Keep the word lists of every recognizer output that has one; outputs
# without a "result" key are skipped.
res = [entry["result"] for entry in out if "result" in entry]
# Flatten the list of per-output word lists into one list of word dicts.
flattenedres = [val for sublist in res for val in sublist]
# One row per word dict; the columns are taken from the dict keys.
pd.DataFrame(flattenedres)