I am building a simple PDF Extractor and mp3 converter. The first part that is not posted reads in a PDF and extracts the body and saves it in a .txt.
The second part takes the .txt file and uses gTTS to convert it to mp3. I have read that there is an upper limit of 5000 characters per query and a maximum of 1000 queries per minute. I have made sure that each query does not exceed 5000 characters.
However, the SSL/TLS connection to translate.google.com keeps closing. I am not that strong with SSL and have no idea what TLS is and would gladly get some material to read up on.
def convert_to_mp3(path, output_file, language='en', speed=1.0):
with open(path, 'r', encoding="utf-8") as f:
text = f.read()
sequences = split_body_into_sequences(text)
#print(sequences)
print(f'Converting {len(sequences)} sequences')
for i, sequence in enumerate(sequences):
print(f'Converting sequence {i+1} of {len(sequences)}')
print(f'Length of sequence: {len(sequence[0])} characters')
tts = gTTS(text=sequence[0], lang=language, tld="com")
tts.save(output_file[:-4] + f'_{i+1}.mp3')
return
def split_body_into_sequences(text):
"""
Max length 5000 characters, otherwise Google TTS will not work.
Split strings always at .,!,? or \n
"""
sequences = []
sequence = []
sentence = ''
for i, char in enumerate(text):
sentence += char
if sum(len(s) for s in sequence) + len(sentence) > 4500:
sequences.append([' '.join(sequence)])
sequence = []
else:
if char in ['.', '!', '?', '\n']:
sequence.append(sentence)
sentence = ''
sequences.append([' '.join(sequence)])
return sequences
I get the following output before it breaks:
Converting 17 sequences
Converting sequence 1 of 17
Length of sequence: 4503 characters
Converting sequence 2 of 17
Length of sequence: 4542 characters
and the first audio file is fine, length of approximately 6 min, whilst the second one is only generated until ~30 seconds. Then it breaks with the error:
Traceback (most recent call last):
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\site-packages\urllib3\connectionpool.py", line 703, in urlopen
httplib_response = self._make_request(
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\site-packages\urllib3\connectionpool.py", line 386, in _make_request
self._validate_conn(conn)
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\site-packages\urllib3\connectionpool.py", line 1042, in _validate_conn
conn.connect()
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\site-packages\urllib3\connection.py", line 414, in connect
self.sock = ssl_wrap_socket(
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\site-packages\urllib3\util\ssl_.py", line 449, in ssl_wrap_socket
ssl_sock = _ssl_wrap_socket_impl(
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\site-packages\urllib3\util\ssl_.py", line 493, in _ssl_wrap_socket_impl
return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\ssl.py", line 500, in wrap_socket
return self.sslsocket_class._create(
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\ssl.py", line 1040, in _create
self.do_handshake()
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\ssl.py", line 1309, in do_handshake
self._sslobj.do_handshake()
ssl.SSLZeroReturnError: TLS/SSL connection has been closed (EOF) (_ssl.c:1131)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\site-packages\requests\adapters.py", line 489, in send
resp = conn.urlopen(
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\site-packages\urllib3\connectionpool.py", line 787, in urlopen
retries = retries.increment(
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\site-packages\urllib3\util\retry.py", line 592, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='translate.google.com', port=443): Max retries exceeded with url: /_/TranslateWebserverUi/data/batchexecute (Caused by SSLError(SSLZeroReturnError(6, 'TLS/SSL connection has been closed (EOF) (_ssl.c:1131)')))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\site-packages\gtts\tts.py", line 265, in stream
r = s.send(
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\site-packages\requests\sessions.py", line 701, in send
r = adapter.send(request, **kwargs)
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\site-packages\requests\adapters.py", line 563, in send
raise SSLError(e, request=request)
requests.exceptions.SSLError: HTTPSConnectionPool(host='translate.google.com', port=443): Max retries exceeded with url: /_/TranslateWebserverUi/data/batchexecute (Caused by SSLError(SSLZeroReturnError(6, 'TLS/SSL connection has been closed (EOF) (_ssl.c:1131)')))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "main.py", line 102, in <module>
mode_selection(mode=args.mode, filename=args.file_name, output_file=args.output_path, flags=args.flags, language=args.language, speed=args.speed)
File "main.py", line 55, in mode_selection
convert_to_mp3(path, output_file, language, speed)
File "main.py", line 23, in convert_to_mp3
tts.save(output_file[:-4] + f'_{i+1}.mp3')
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\site-packages\gtts\tts.py", line 329, in save
self.write_to_fp(f)
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\site-packages\gtts\tts.py", line 310, in write_to_fp
for idx, decoded in enumerate(self.stream()):
File "C:\Users\yanni\anaconda3\envs\pdfreader\lib\site-packages\gtts\tts.py", line 281, in stream
raise gTTSError(tts=self)
gtts.tts.gTTSError: Failed to connect. Probable cause: Unknown
If needed I can provide the text file, the requirements and the rest of the codebase. My python version is 3.8.15 and my gtts version is 2.3.0.
I hope this post does not anger any stack overflow overlords, if yes please quickly tell me what I did wrong for future questions!