I'm testing out paddle OCR for tables.
First I'm downloading all the necessery weights:
# Download the PP-OCRv3 text detection model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar
# Download the PP-OCRv3 text recognition model and unzip it
wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_rec_infer.tar && tar xf en_PP-OCRv3_rec_infer.tar
# Download the PP-StructureV2 form recognition model and unzip it
wget https://paddleocr.bj.bcebos.com/ppstructure/models/slanet/en_ppstructure_mobile_v2.0_SLANet_infer.tar && tar xf en_ppstructure_mobile_v2.0_SLANet_infer.tar
And also using the recommended version:
pip install "paddleocr>=2.0.1"
I've modified the predict_table.py
file based upon my need (rest is available @ https://github.com/PaddlePaddle/PaddleOCR/blob/b318d204fa4b984a76edee167a5cb867e5b85838/ppstructure/table/predict_table.py):
def process_image_and_extract_table(det_model_dir, rec_model_dir, table_model_dir,rec_char_dict_path, table_char_dict_path, image_path, output_dir):
args = parse_args()
args.det_model_dir = det_model_dir
args.rec_model_dir = rec_model_dir
args.table_model_dir = table_model_dir
args.rec_char_dict_path = rec_char_dict_path
args.table_char_dict_path = table_char_dict_path
args.image_dir = image_path
args.output = output_dir
os.makedirs(args.output, exist_ok=True)
table_sys = TableSystem(args)
output_path = table_sys.extract_tables(image_path)
return output_path
And calling this function like this:
import os
from PIL import Image
from paddleocr import PaddleOCR,draw_ocr
# import table.meaw as meaw
# from table import meaw
# from table.predict_table import process_image_and_extract_table
from predict_table import process_image_and_extract_table
c_dir = "files"
# print(os.listdir(c_dir))
files = [os.path.join(c_dir,x) for x in os.listdir(c_dir) if x.endswith(".PNG")]
print(files)
ocr = PaddleOCR(use_angle_cls=True, lang='en')
for file in files:
img_file = file + ".tem.png"
print(file)
result = ocr.ocr(file, cls=True)
# result = ocr.ocr(file, cls=True, type='structure')
print("---------------------------------------")
print(result)
print(len(result))
print(len(result[0]))
print("***************************************")
result = result[0]
image = Image.open(file).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
scores = [line[1][1] for line in result]
im_show = draw_ocr(image, boxes, txts, scores, font_path='./Roboto-Thin.ttf')
im_show = Image.fromarray(im_show)
im_show.save(img_file)
output_path = process_image_and_extract_table(
det_model_dir='./en_PP-OCRv3_det_infer',
rec_model_dir='./en_PP-OCRv3_rec_infer',
table_model_dir='./en_ppstructure_mobile_v2.0_SLANet_infer',
rec_char_dict_path='./ppocr_keys_v1.txt',
table_char_dict_path='./table_structure_dict.txt',
image_path=file,
output_dir='./out'
)
print("Table extracted and saved to:", output_path)
# im_show.show("demo")
# break
I've downloaded other necessery files from here:
- https://github.com/PaddlePaddle/PaddleOCR/blob/b318d204fa4b984a76edee167a5cb867e5b85838/ppocr/utils/ppocr_keys_v1.txt (don't know what is this, their codebase isn't that much documented;)
- https://github.com/PaddlePaddle/PaddleOCR/blob/b318d204fa4b984a76edee167a5cb867e5b85838/ppocr/utils/dict/table_structure_dict.txt
Now as a result, I'm getting chinese outputs. How can I resolve this issue??