I am working on a small project that reads text from images. It currently takes almost 10 hours to process 20k images, running on Ubuntu 14.0. Below is the part of the code that takes most of the time. Please advise on how to increase its speed, or on how to use multiprocessing for the code below.
Regards, Shri
Code:
# OCR every image listed in files3 (located under files_dir3) and record the
# outcome in worksheet4, one row per image:
#   column 0 = image path, column 1 = '0' (text found) or '1' (no text),
#   column 2 = the recognised text or 'No Text On Image'.
# When the first OCR pass finds nothing, the image is run through GrabCut,
# the result is saved under files_dir1, and that saved file is OCR'd again;
# the second result is written to the same row as the first.
#
# NOTE(review): the OCR and GrabCut helpers below do not touch worksheet4,
# so they are presumably the part worth handing to a worker pool (with all
# worksheet writes kept in this process). Left sequential here so that the
# order of prints and writes is unchanged.

_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')
# Rectangle and iteration count passed to cv2.grabCut for every image.
_GRABCUT_RECT = (50, 50, 450, 290)
_GRABCUT_ITERATIONS = 5


def _ocr(image_path):
    """Return the text pytesseract recognises in the image at image_path."""
    return pytesseract.image_to_string(Image.open(image_path))


def _write_result(worksheet, row, image_path, txt):
    """Write one OCR outcome (flag, text, path) into the given worksheet row."""
    if txt != '':
        worksheet.write(row, 1, '0')
        worksheet.write(row, 2, txt)
    else:
        worksheet.write(row, 1, '1')
        worksheet.write(row, 2, 'No Text On Image')
    worksheet.write(row, 0, image_path)


def _strip_background(src_path, dst_path):
    """Run GrabCut on the image at src_path and save the result to dst_path.

    Pixels whose GrabCut mask value is 0 or 2 are zeroed; all others are
    kept unchanged.

    Raises:
        IOError: if cv2.imread cannot read src_path.
    """
    img = cv2.imread(src_path)
    if img is None:
        # cv2.imread signals failure by returning None; fail with a clear
        # message instead of an AttributeError on img.shape below.
        raise IOError("cv2.imread could not read " + src_path)
    mask = np.zeros(img.shape[:2], np.uint8)
    # Temporary model arrays required by cv2.grabCut; allocated per image.
    bgd_model = np.zeros((1, 65), np.float64)
    fgd_model = np.zeros((1, 65), np.float64)
    cv2.grabCut(img, mask, _GRABCUT_RECT, bgd_model, fgd_model,
                _GRABCUT_ITERATIONS, cv2.GC_INIT_WITH_RECT)
    keep = np.where((mask == 2) | (mask == 0), 0, 1).astype('uint8')
    cv2.imwrite(dst_path, img * keep[:, :, np.newaxis])


row = 1
for f1 in files3:
    if not f1.lower().endswith(_IMAGE_EXTENSIONS):
        continue
    try:
        image_path1 = files_dir3 + '/' + f1
        txt = _ocr(image_path1)
        print(txt)
        if txt != '':
            print('0')
            _write_result(worksheet4, row, image_path1, txt)
        else:
            _write_result(worksheet4, row, image_path1, txt)
            _strip_background(image_path1, os.path.join(files_dir1, f1))
            print("Image copied: " + f1)
            # Second pass on the file that was just written. The original
            # referred to `f2` here, which is never assigned in this loop;
            # the file saved above is named after f1.
            image_path2 = files_dir1 + '/' + f1
            print(f1)
            txt = _ocr(image_path2)
            print(txt)
            print('0' if txt != '' else '1')
            # Same row as the first pass: these values replace the
            # 'No Text On Image' entry written above.
            # NOTE(review): confirm worksheet4 permits rewriting a cell.
            _write_result(worksheet4, row, image_path2, txt)
    except Exception as exc:
        # Best-effort: skip images that fail, but report why, and let
        # KeyboardInterrupt / SystemExit stop the run.
        print("Moving On")
        print("  %s: %r" % (f1, exc))
    row += 1