Tesseract is not able to detect Korean text properly

Question

I am learning how to detect Korean text. As a sample, I am using the Korean text printed on the back of a package, but pytesseract.image_to_string(img_pl,lang='kor') is not able to separate the individual words when I query with the level set to word. Here is my code:

import numpy as np
import pandas as pd
import cv2
import PIL
import pytesseract
import html
import io
import os
# Draw the bounding boxes Tesseract reports for Korean text at one layout
# level (page / block / para / line / word) on top of the source image.
# Flow: load image -> run OCR -> parse the TSV result into a DataFrame ->
# outline every row whose `level` matches the requested one -> show it.

# Single source of truth for the input image (was repeated three times).
IMAGE_PATH = '/Users/aniketdeshmukh/Desktop/Korean Text Images/0.png'

# Requested level name -> (value of the TSV `level` column, colour tuple
# handed to cv2.rectangle; OpenCV interprets it in BGR order).
LEVEL_STYLES = {
    'page': (1, (0, 0, 0)),
    'block': (2, (255, 0, 0)),
    'para': (3, (0, 255, 0)),
    'line': (4, (0, 0, 255)),
    'word': (5, (0, 255, 0)),
}

img_cv2 = cv2.imread(IMAGE_PATH)
if img_cv2 is None:
    # cv2.imread reports an unreadable/missing file by returning None rather
    # than raising, so fail here with a message that names the path.
    raise OSError('could not read image: ' + IMAGE_PATH)

# Run OCR once on the PIL image only. The earlier extra image_to_string call
# on the cv2 array was overwritten immediately, so it just cost a second pass.
# The context manager closes the file handle once OCR is done.
with PIL.Image.open(IMAGE_PATH) as img_pl:
    text_pl = pytesseract.image_to_string(img_pl, lang='kor')
    data = pytesseract.image_to_data(img_pl, lang='kor')

#print(text_pl)
# image_to_data returns tab-separated text: first line is the header, every
# following line is one detected layout element.
dataList = [row.split('\t') for row in data.split('\n')]
df = pd.DataFrame(dataList[1:], columns=dataList[0])
#df
# Rows that came out shorter than the header (e.g. the empty line after the
# final newline) are padded with missing values by pandas; drop them.
df.dropna(inplace=True)
col_int = ['level','page_num','block_num','par_num','line_num','word_num','left','top','width','height','conf']
# Go through float first: newer Tesseract releases may report `conf` as a
# fractional string such as "96.06", which a direct astype(int) rejects.
df[col_int] = df[col_int].astype(float).round().astype(int)

image = img_cv2.copy()
level = 'word'
style = LEVEL_STYLES.get(level)
# An unrecognised level name draws nothing, exactly as before.
if style is not None:
    level_id, colour = style
    boxes = df.loc[df['level'] == level_id, ['left', 'top', 'width', 'height']]
    for x, y, w, h in boxes.itertuples(index=False, name=None):
        # Plain Python ints keep cv2.rectangle happy regardless of dtype.
        cv2.rectangle(image, (int(x), int(y)), (int(x + w), int(y + h)), colour, 2)

cv2.imshow("bounding box",image)
cv2.waitKey(0)
cv2.destroyAllWindows()

The output I get is the following, even when level is set to 'word':

tesseract not able to detect korean language properly

0 Answers0