I am trying to OCR an image that contains multiple languages (English and Hindi). When I OCR it with pytesseract, the Hindi words are misread as English letters, resulting in meaningless strings. Is there any way to detect only the English text in an image and ignore the other languages?
I tried the langdetect library, regular expressions (re), etc., but nothing helped.
Here is the program:
import pytesseract
from PIL import Image
[enter image description here][1]
def preprocess_image(image_path):
    """Open an image file and return a grayscale copy of it.

    Args:
        image_path: Path of the image file, passed to ``Image.open``.

    Returns:
        A new image converted to mode ``"L"`` (grayscale).
    """
    # Image.open is lazy and keeps the underlying file open. The context
    # manager releases that handle as soon as convert() has produced the
    # new, independent grayscale image.
    with Image.open(image_path) as image:
        gray_image = image.convert("L")
    # Apply any additional preprocessing steps if needed (e.g., thresholding)
    return gray_image
def ocr_with_whitelist(image_path, whitelist):
    """Run English OCR on an image and keep only whitelisted characters.

    Args:
        image_path: Location of the image, forwarded to ``preprocess_image``.
        whitelist: Container of permitted characters; each recognised
            character is tested against it with ``in``.

    Returns:
        The recognised text with every character not in ``whitelist``
        removed, in the original order.
    """
    # Prepare the image before handing it to the OCR engine
    prepared = preprocess_image(image_path)

    # Only the English language model is requested
    raw_text = pytesseract.image_to_string(prepared, lang='eng')

    # Collect the permitted characters, preserving their order
    kept = []
    for symbol in raw_text:
        if symbol in whitelist:
            kept.append(symbol)
    return ''.join(kept)
# Characters allowed to survive the post-OCR filter, grouped by class:
# lowercase letters, uppercase letters, digits, then punctuation,
# space and newline.
whitelist = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
    '/:=, \n'
)

# Image to be read
image_path = 'aadhaar_front.png'

# OCR the image, drop non-whitelisted characters, and show the result
extracted_text = ocr_with_whitelist(image_path, whitelist)
print(extracted_text)