This program takes html file from input directory and translate it to hindi using googletrans.
import os
from bs4 import BeautifulSoup
from googletrans import Translator
# Set the input and output directories
input_dir = r"C:\My Web Sites\CC\www.classcentral.com\subject"
output_dir = r"C:\My Web Sites\CC\www.classcentral.com\translated\subject"
# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Create the translator object
translator = Translator(service_urls=['translate.google.com'])
# Iterate through all HTML files in the input directory
for filename in os.listdir(input_dir):
if filename.endswith('.html'):
# Read in the input file
with open(os.path.join(input_dir, filename), 'r', encoding='latin-1') as f:
# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(f, 'html.parser')
# Translate the text in the HTML
for element in soup.find_all(text=True):
if element.strip(): # Skip empty strings
try:
translated_text = translator.translate(element.string, dest='hi').text
element.string.replace_with(translated_text)
except:
print("Translation failed for element: ", element)
# Write out the translated HTML to a new file in the output directory
with open(os.path.join(output_dir, filename), 'w', encoding='latin-1') as f:
f.write(str(soup))
print(f"Translated file '{filename}' written to '{output_dir}'")
I am gettig an error:
File "e:\Webscraping\Translate1.py", line 36, in <module>
translation = translator.translate(element.string, dest='hi')
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Python311\Lib\site-packages\googletrans\client.py", line 219, in translate
parsed = json.loads(data[0][2])
^^^^^^^^^^^^^^^^^^^^^^
File "C:\Python311\Lib\json\__init__.py", line 339, in loads
raise TypeError(f'the JSON object must be str, bytes or bytearray, '
TypeError: the JSON object must be str, bytes or bytearray, not NoneType
During the handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "e:\Webscraping\Translate1.py", line 44, in <module>
print("Translation failed for element: ", element)
File "C:\Python311\Lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2192' in position 178878: character maps to <undefined>
I cannot pin point the reason behind the error. Does someone know the fix? For the 2nd error I have used utf-8 16, and 32 also latin-1 still it is giving the same error.