0

This program takes html file from input directory and translate it to hindi using googletrans.

import os
from bs4 import BeautifulSoup
from googletrans import Translator

# Set the input and output directories
input_dir = r"C:\My Web Sites\CC\www.classcentral.com\subject"
output_dir = r"C:\My Web Sites\CC\www.classcentral.com\translated\subject"

# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Create the translator object
translator = Translator(service_urls=['translate.google.com'])

# Iterate through all HTML files in the input directory
for filename in os.listdir(input_dir):
    if filename.endswith('.html'):
        # Read in the input file
        with open(os.path.join(input_dir, filename), 'r', encoding='latin-1') as f:
            # Parse the HTML using BeautifulSoup
            soup = BeautifulSoup(f, 'html.parser')

            # Translate the text in the HTML
            for element in soup.find_all(text=True):
                if element.strip():  # Skip empty strings
                    try:
                        translated_text = translator.translate(element.string, dest='hi').text
                        element.string.replace_with(translated_text)
                    except:
                        print("Translation failed for element: ", element)

        # Write out the translated HTML to a new file in the output directory
        with open(os.path.join(output_dir, filename), 'w', encoding='latin-1') as f:
            f.write(str(soup))
            print(f"Translated file '{filename}' written to '{output_dir}'")

I am gettig an error:

  File "e:\Webscraping\Translate1.py", line 36, in <module>
    translation = translator.translate(element.string, dest='hi')
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python311\Lib\site-packages\googletrans\client.py", line 219, in translate
    parsed = json.loads(data[0][2])
             ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python311\Lib\json\__init__.py", line 339, in loads
    raise TypeError(f'the JSON object must be str, bytes or bytearray, '
TypeError: the JSON object must be str, bytes or bytearray, not NoneType

During the handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "e:\Webscraping\Translate1.py", line 44, in <module>
    print("Translation failed for element: ", element)
  File "C:\Python311\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2192' in position 178878: character maps to <undefined>

I cannot pin point the reason behind the error. Does someone know the fix? For the 2nd error I have used utf-8 16, and 32 also latin-1 still it is giving the same error.

  • regarding the 2nd error, see: https://stackoverflow.com/questions/27092833/unicodeencodeerror-charmap-codec-cant-encode-characters – slothrop Feb 19 '23 at 11:33
  • Based on that, maybe try `element.encode("utf-8")` instead of `element` in your print statement? – slothrop Feb 19 '23 at 11:34
  • regarding the 1st error: https://github.com/ssut/py-googletrans/issues/301 suggests that this happens when the content to be translated is too long – slothrop Feb 19 '23 at 11:48
  • see also: https://stackoverflow.com/questions/70345510/googletrans-error-with-dictionary-translate – slothrop Feb 19 '23 at 11:49

1 Answers1

0

So I changed the code a bit. So to solve the encoding error I used cardet to detect the encoding of the file and then reopened the file with detected encoding. Here's the code:

import os
import chardet
from bs4 import BeautifulSoup
from googletrans import Translator
import logging

# Set up logging
logging.basicConfig(filename='translation.log', level=logging.DEBUG)

# Set the input and output directories
input_dir = r"C:\My Web Sites\CC\www.classcentral.com\institution"
output_dir = r"C:\My Web Sites\CC\www.classcentral.com\translated\institution"

# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Create the translator object
translator = Translator(service_urls=['translate.google.com'])

# Iterate through all HTML files in the input directory
for filename in os.listdir(input_dir):
    if filename.endswith('.html'):
        # Read in the input file
        with open(os.path.join(input_dir, filename), 'rb') as f:
            # Detect the encoding of the file
            encoding = chardet.detect(f.read())['encoding']
            
            # Re-open the file with the detected encoding
            f.seek(0)
            text = f.read().decode(encoding)
            soup = BeautifulSoup(text, 'html.parser')

            # Translate the text in the HTML
            for element in soup.find_all(text=True):
                if element.strip():  # Skip empty strings
                    try:
                        translated_text = translator.translate(element.string, dest='hi').text
                        element.string.replace_with(translated_text)
                    except Exception as e:
                        logging.error(f"Translation failed for element: {element} with error: {e}")

        # Write out the translated HTML to a new file in the output directory
        with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as f:
            f.write(str(soup))
            logging.info(f"Translated file '{filename}' written to '{output_dir}'")