The funny thing is that Speech Recognition
uses Google Speech-To-Text
and it requires WAV input,
but the documentation for the Google Speech-To-Text API
shows that it can also work with MP3
and a few other formats. See all supported audio encodings.
When I checked the source code of Speech Recognition,
I saw that it accepts WAV
but converts it to FLAC
before sending it to Google Speech-To-Text.
You can try to use the Speech-To-Text API directly,
but this may require registering your own application with Google to get an API key.
See more: Speech-To-Text
EDIT:
I took the source code in which Speech Recognition
uses Google Speech-To-Text,
and I took some code from the Google documentation, and I created my own version which can send MP3 directly.
It uses the API key from Speech Recognition
- 'AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw'
import requests
import base64
# Standalone example: send an audio file straight to Google Speech-To-Text
# and print the recognized text.

#filename = 'test/audio/audio2-hello-world-of-python.wav'
filename = 'test/audio/audio2-hello-world-of-python.mp3'

# Read the whole audio file as raw bytes.
with open(filename, 'rb') as fh:
    file_data = fh.read()

# --- Google Speech-To-Text ---

data = {
    "audio": {
        # b64encode() returns bytes, which the json encoder cannot serialize;
        # decode it to text so the request body can be built.
        "content": base64.b64encode(file_data).decode('ascii'),
    },
    "config": {
        "enableAutomaticPunctuation": True,
        # "encoding": "LINEAR16",  # WAV
        "encoding": "MP3",  # MP3
        "languageCode": "en-US",
        "model": "video",
    },
}

# The API key is sent as the `key` query parameter.
payload = {
    'key': 'AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw',
}

url = 'https://speech.googleapis.com/v1p1beta1/speech:recognize'

response = requests.post(url, params=payload, json=data)
#print(response.status_code)
#print(response.text)

# Take the transcript of the first alternative of the first result.
data = response.json()
text = data['results'][0]['alternatives'][0]['transcript']
print(text)
In this code I read the file from disk, but by using io.BytesIO
you can probably get the data from the bot without writing it to disk.
# Fetch the voice message and download it into memory instead of onto disk.
file = bot.get_file(update.message.voice.file_id)

# The in-memory binary buffer class is io.BytesIO (there is no io.Bytes).
with io.BytesIO() as fh:
    # Pass the buffer as `out=`; the first positional argument of download()
    # is a destination path, not a file object.
    file.download(out=fh)

    # getvalue() returns the entire buffer contents regardless of the current
    # position, so no fh.seek(0) + fh.read() is needed.
    file_data = fh.getvalue()
EDIT:
Minimal working bot code - which I tested with uploaded .mp3 files
(not with voice messages).
import os
import telegram
from telegram.ext import Updater, MessageHandler, CommandHandler, Filters
import requests
import base64
import io
# --- functions ---
def speech_to_text(file_data, encoding='LINEAR16', lang='en-US'):
    """Send raw audio bytes to Google Speech-To-Text and return the transcript.

    Args:
        file_data: Raw bytes of the audio file.
        encoding: Value for the request's ``config.encoding`` field,
            e.g. ``'LINEAR16'`` (WAV) or ``'MP3'``.
        lang: Value for the request's ``config.languageCode`` field.

    Returns:
        The transcript of the first alternative of the first result, or
        ``None`` when the response is not valid JSON or contains no such
        entry (the problem and the raw response are printed in that case).
    """
    data = {
        "audio": {
            # b64encode() returns bytes, which the json encoder cannot
            # serialize; decode it to text so the request body can be built.
            "content": base64.b64encode(file_data).decode('ascii'),
        },
        "config": {
            "enableAutomaticPunctuation": True,
            # "encoding": "LINEAR16",  # WAV
            # "encoding": "MP3",       # MP3
            "encoding": encoding,
            "languageCode": lang,
            "model": "video",
        },
    }

    # The API key is sent as the `key` query parameter.
    payload = {
        'key': 'AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw',
    }

    url = 'https://speech.googleapis.com/v1p1beta1/speech:recognize'

    response = requests.post(url, params=payload, json=data)
    #print('response:', response.text)

    try:
        data = response.json()
        return data['results'][0]['alternatives'][0]['transcript']
    except (ValueError, LookupError, TypeError) as ex:
        # ValueError: body is not JSON.  LookupError/TypeError: the JSON does
        # not have the results -> alternatives -> transcript structure
        # (e.g. an error response or nothing recognized).
        print('Exception:', ex)
        print('response:', response.text)

    return None
# --- init ---
# The bot token comes from the TELEGRAM_TOKEN environment variable
# (None when the variable is not set).
TOKEN = os.environ.get('TELEGRAM_TOKEN')

# Standalone Bot object built from the same token.
bot = telegram.Bot(token=TOKEN)

# The updater and its dispatcher; handlers are registered on the dispatcher.
updater = Updater(token=TOKEN, use_context=True)
dispatcher = updater.dispatcher
# --- commands ---
# - upload audio file -
def translate_audio(update, context):
    """Transcribe an uploaded audio file and reply with the recognized text.

    The attachment is downloaded into an in-memory buffer, sent to
    ``speech_to_text`` as MP3, and the result (or a fallback message when
    nothing was recognized) is sent back as a reply.
    """
    print('translate_audio')

    with io.BytesIO() as buffer:
        #context.bot.get_file(update.message.voice.file_id).download(out=buffer)
        telegram_file = context.bot.get_file(update.message.audio.file_id)
        telegram_file.download(out=buffer)
        # Copy the bytes out while the buffer is still open.
        audio_bytes = buffer.getvalue()

    # Fall back to a fixed message when no transcript came back.
    reply = speech_to_text(audio_bytes, 'MP3') or "I don't understand this file"

    update.message.reply_text(reply)


# Run translate_audio for every message that carries an audio file.
dispatcher.add_handler(MessageHandler(Filters.audio, translate_audio))
# - record voice -
def translate_voice(update, context):
    """Transcribe a recorded voice message and reply with the recognized text.

    The voice recording is downloaded into an in-memory buffer, sent to
    ``speech_to_text``, and the result (or a fallback message when nothing
    was recognized) is sent back as a reply.
    """
    print('translate_voice')

    with io.BytesIO() as buffer:
        telegram_file = context.bot.get_file(update.message.voice.file_id)
        #telegram_file = context.bot.get_file(update.message.audio.file_id)
        telegram_file.download(out=buffer)
        # Copy the bytes out while the buffer is still open.
        voice_bytes = buffer.getvalue()

    # NOTE(review): the data is declared as 'MP3' here, same as for uploaded
    # files; voice recordings may use a different encoding - confirm.
    reply = speech_to_text(voice_bytes, 'MP3') or "I don't understand this file"

    update.message.reply_text(reply)


# Run translate_voice for every message that carries a voice recording.
dispatcher.add_handler(MessageHandler(Filters.voice, translate_voice))
# --- start ---
# Announce startup on stdout.
print('starting ...')
# Start receiving updates by polling.
updater.start_polling()
# presumably blocks here until the process is asked to stop - see Updater.idle docs
updater.idle()