the JSON object must be str, bytes or bytearray, not NoneType

Question

This program takes each HTML file from the input directory and translates it to Hindi using googletrans.

import os
from bs4 import BeautifulSoup
from googletrans import Translator

# Script: translate the text of every .html file in input_dir to Hindi ('hi')
# with googletrans and write the result to output_dir under the same filename.

# Set the input and output directories
input_dir = r"C:\My Web Sites\CC\www.classcentral.com\subject"
output_dir = r"C:\My Web Sites\CC\www.classcentral.com\translated\subject"

# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Create the translator object
translator = Translator(service_urls=['translate.google.com'])

# Iterate through all HTML files in the input directory
for filename in os.listdir(input_dir):
    if filename.endswith('.html'):
        # Read in the input file
        # NOTE(review): latin-1 maps every byte to a character, so this read
        # never fails even if the file is really in another encoding (e.g.
        # UTF-8); in that case non-ASCII text is silently mis-decoded.
        with open(os.path.join(input_dir, filename), 'r', encoding='latin-1') as f:
            # Parse the HTML using BeautifulSoup
            soup = BeautifulSoup(f, 'html.parser')

            # Translate the text in the HTML
            # find_all(text=True) returns the text nodes of the document; the
            # result is a list, so replacing nodes inside the loop is safe.
            # NOTE(review): this presumably also yields non-prose nodes such as
            # <script>/<style> contents and the doctype -- verify.
            for element in soup.find_all(text=True):
                if element.strip():  # Skip empty strings
                    try:
                        # Ask googletrans for the Hindi text and swap it into
                        # the tree in place of the original text node.
                        translated_text = translator.translate(element.string, dest='hi').text
                        element.string.replace_with(translated_text)
                    # NOTE(review): a bare except also catches
                    # KeyboardInterrupt/SystemExit and hides the real error.
                    except:
                        # print() encodes for the console; in the traceback
                        # quoted below that codec is cp1252, which cannot
                        # represent characters such as '\u2192', so this line
                        # itself raises UnicodeEncodeError.
                        print("Translation failed for element: ", element)

        # Write out the translated HTML to a new file in the output directory
        # NOTE(review): Devanagari characters are outside latin-1, so writing
        # translated Hindi text with this encoding would presumably raise
        # UnicodeEncodeError -- confirm; UTF-8 can represent it.
        with open(os.path.join(output_dir, filename), 'w', encoding='latin-1') as f:
            f.write(str(soup))
            print(f"Translated file '{filename}' written to '{output_dir}'")

I am getting an error:

  File "e:\Webscraping\Translate1.py", line 36, in <module>
    translation = translator.translate(element.string, dest='hi')
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python311\Lib\site-packages\googletrans\client.py", line 219, in translate
    parsed = json.loads(data[0][2])
             ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python311\Lib\json\__init__.py", line 339, in loads
    raise TypeError(f'the JSON object must be str, bytes or bytearray, '
TypeError: the JSON object must be str, bytes or bytearray, not NoneType

During the handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "e:\Webscraping\Translate1.py", line 44, in <module>
    print("Translation failed for element: ", element)
  File "C:\Python311\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode character '\u2192' in position 178878: character maps to <undefined>

I cannot pinpoint the reason behind the error. Does anyone know the fix? For the second error I have tried UTF-8, UTF-16 and UTF-32 as well as latin-1, but it still gives the same error.

regarding the 2nd error, see: https://stackoverflow.com/questions/27092833/unicodeencodeerror-charmap-codec-cant-encode-characters — slothrop, Feb 19 '23 at 11:33
Based on that, maybe try `element.encode("utf-8")` instead of `element` in your print statement? — slothrop, Feb 19 '23 at 11:34
regarding the 1st error: https://github.com/ssut/py-googletrans/issues/301 suggests that this happens when the content to be translated is too long — slothrop, Feb 19 '23 at 11:48
see also: https://stackoverflow.com/questions/70345510/googletrans-error-with-dictionary-translate — slothrop, Feb 19 '23 at 11:49

score 0 · Answer 1 · answered Feb 19 '23 at 14:45

I changed the code a bit. To solve the encoding error, I used chardet to detect the encoding of the file and then decoded the file's contents with the detected encoding. Here's the code:

import logging
import os

import chardet
from bs4 import BeautifulSoup, Comment, Declaration, Doctype, ProcessingInstruction
from googletrans import Translator

# Script: translate the visible text of every .html file in input_dir to
# Hindi ('hi') with googletrans and write the result, encoded as UTF-8, to
# output_dir under the same filename. Failures are logged to translation.log.

# Set up logging. The log file is opened as UTF-8 (basicConfig's `encoding`
# argument, Python 3.9+) so that page text outside the platform's default
# code page -- e.g. '\u2192' under cp1252 -- can be logged without errors.
logging.basicConfig(filename='translation.log', level=logging.DEBUG, encoding='utf-8')

# Set the input and output directories
input_dir = r"C:\My Web Sites\CC\www.classcentral.com\institution"
output_dir = r"C:\My Web Sites\CC\www.classcentral.com\translated\institution"

# Text inside these tags is code, not prose: translating it would break the
# page and sends very large strings to the translation service.
_SKIPPED_PARENTS = {'script', 'style'}

# Text-node types that are markup rather than page content.
_NON_CONTENT_NODES = (Comment, Declaration, Doctype, ProcessingInstruction)

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Create the translator object
translator = Translator(service_urls=['translate.google.com'])


def _read_html(path):
    """Return the text of the file at *path*, decoded with a detected encoding.

    The file is read once as bytes. chardet may report no encoding at all
    (``None``) or one Python has no codec for; both cases fall back to UTF-8.
    Undecodable bytes are replaced rather than aborting the whole run.
    """
    with open(path, 'rb') as f:
        raw = f.read()

    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    try:
        return raw.decode(encoding, errors='replace')
    except LookupError:
        logging.warning("Unknown encoding '%s' for '%s'; using utf-8", encoding, path)
        return raw.decode('utf-8', errors='replace')


def _is_translatable(element):
    """Return True if *element* is a non-blank text node holding page prose."""
    if isinstance(element, _NON_CONTENT_NODES):
        return False
    if element.parent is not None and element.parent.name in _SKIPPED_PARENTS:
        return False
    return bool(element.strip())


def _translate_document(soup):
    """Replace each translatable text node of *soup* with its Hindi translation.

    A node whose translation fails is logged and left untouched so that one
    bad request does not lose the rest of the document.
    """
    # find_all returns a list, so replacing nodes while looping is safe.
    for element in soup.find_all(string=True):
        if not _is_translatable(element):
            continue
        try:
            translated_text = translator.translate(str(element), dest='hi').text
        except Exception as e:
            # googletrans surfaces service problems as assorted exception
            # types (e.g. the TypeError from json.loads), so catch broadly
            # here and record the failure instead of hiding it.
            logging.error("Translation failed for element: %s with error: %s", element, e)
            continue
        if translated_text:
            element.replace_with(translated_text)


# Iterate through all HTML files in the input directory
for filename in os.listdir(input_dir):
    if not filename.endswith('.html'):
        continue

    soup = BeautifulSoup(_read_html(os.path.join(input_dir, filename)), 'html.parser')
    _translate_document(soup)

    # Write out the translated HTML to a new file in the output directory.
    # UTF-8 is required here: Hindi text cannot be encoded as latin-1/cp1252.
    with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as f:
        f.write(str(soup))
    logging.info("Translated file '%s' written to '%s'", filename, output_dir)

the JSON object must be str, bytes or bytearray, not NoneType

1 Answer