I'm trying to convert all type of images in a folder to text using python tesseract. Below is the that I'm using, with this only .png files are being converted to .txt, and other types are not being converted to text.
import os
import pytesseract
import cv2
import re
import glob
import concurrent.futures
import time
def ocr(img_path):
out_dir = "Output//"
img = cv2.imread(img_path)
text = pytesseract.image_to_string(img,lang='eng',config='--psm 6')
out_file = re.sub(".png",".txt",img_path.split("\\")[-1])
out_path = out_dir + out_file
fd = open(out_path,"w")
fd.write("%s" %text)
return out_file
os.environ['OMP_THREAD_LIMIT'] = '1'
def main():
path = input("Enter the path : ")
if os.path.isdir(path) == 1:
out_dir = "ocr_results//"
if not os.path.exists(out_dir):
os.makedirs(out_dir)
with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
image_list = glob.glob(path+"\\*.*")
for img_path,out_file in zip(image_list,executor.map(ocr,image_list)):
print(img_path.split("\\")[-1],',',out_file,', processed')
if __name__ == '__main__':
start = time.time()
main()
end = time.time()
print(end-start)
How to convert all type of image files to text. Please help me with the above code.