I am learning how to detect Korean text. As a sample, I am using the Korean text printed on the back of a package, but pytesseract.image_to_string(img_pl, lang='kor') is not able to separate the words individually when I query with the level set to 'word'. Here is my code:
import numpy as np
import pandas as pd
import cv2
import PIL
import pytesseract
import html
import io
import os
# Script: run Korean OCR on one image with Tesseract, load the per-element
# TSV report into a DataFrame, and draw a rectangle around every element of
# the selected layout level (page / block / para / line / word).

IMAGE_PATH = '/Users/aniketdeshmukh/Desktop/Korean Text Images/0.png'

# For each selectable granularity: the value of the TSV 'level' column that
# identifies it, and the colour passed to cv2.rectangle for its boxes.
LEVEL_STYLES = {
    'page': (1, (0, 0, 0)),
    'block': (2, (255, 0, 0)),
    'para': (3, (0, 255, 0)),
    'line': (4, (0, 0, 255)),
    'word': (5, (0, 255, 0)),
}

# cv2.imread signals failure by returning None instead of raising, so check
# explicitly; otherwise the error only surfaces later as an AttributeError.
img_cv2 = cv2.imread(IMAGE_PATH)
if img_cv2 is None:
    raise FileNotFoundError(f"Could not read image: {IMAGE_PATH}")
img_pl = PIL.Image.open(IMAGE_PATH)

# Plain-text OCR result. The earlier extra call on img_cv2 was dropped: its
# result was overwritten immediately, so it only cost a second OCR pass.
text_pl = pytesseract.image_to_string(img_pl, lang='kor')
#print(text_pl)

# image_to_data returns tab-separated text: a header row followed by one row
# per detected layout element.
data = pytesseract.image_to_data(img_pl, lang='kor')
dataList = [row.split('\t') for row in data.split('\n') if row]
df = pd.DataFrame(dataList[1:], columns=dataList[0])

# Rows with fewer fields than the header are padded with None; discard them.
df.dropna(inplace=True)

col_int = ['level', 'page_num', 'block_num', 'par_num', 'line_num',
           'word_num', 'left', 'top', 'width', 'height', 'conf']
# Parse through float first: 'conf' may arrive as a decimal string
# (e.g. "96.54"), which a direct astype(int) on strings rejects.
df[col_int] = df[col_int].astype(float).astype(int)
df.info()

# Draw on a copy so the image loaded above stays untouched.
image = img_cv2.copy()
level = 'word'

# An unrecognised level draws nothing, matching the original if/elif chain.
style = LEVEL_STYLES.get(level)
if style is not None:
    level_num, colour = style
    selected = df.loc[df['level'] == level_num,
                      ['left', 'top', 'width', 'height']]
    # tolist() yields plain Python ints for the rectangle corner coordinates.
    for x, y, w, h in selected.to_numpy().tolist():
        cv2.rectangle(image, (x, y), (x + w, y + h), colour, 2)

cv2.imshow("bounding box", image)
cv2.waitKey(0)
cv2.destroyAllWindows()
The output I get is the following, even when level is set to 'word':