OCR - pytesseract issue

Question

I'm trying to run the code below:

import cv2
import numpy as np
import pytesseract
from PIL import Image

# Path of working folder on Disk
src_path = "C:/TEST/"

def get_string(img_path):
    """Run Tesseract OCR on the image at ``img_path`` and return its text.

    The image is read with OpenCV, converted to grayscale, passed through a
    dilate/erode pair, and written to ``removed_noise.png`` and ``thres.png``
    under ``src_path``. ``thres.png`` is then handed to pytesseract.

    Args:
        img_path: Path of the image file to recognize.

    Returns:
        The text returned by ``pytesseract.image_to_string``.

    Raises:
        IOError: If OpenCV cannot read ``img_path``.
    """
    import os  # local import: only needed for joining output paths

    # cv2.imread does not raise for a missing or unreadable file; it returns
    # None, which would otherwise surface as an obscure error in cvtColor.
    img = cv2.imread(img_path)
    if img is None:
        raise IOError("Could not read image: {0}".format(img_path))

    # Convert to gray
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Apply dilation and erosion to remove some noise.
    # NOTE(review): a 1x1 kernel looks like a no-op for dilate/erode; a larger
    # kernel is probably needed for real noise removal -- confirm.
    kernel = np.ones((1, 1), np.uint8)
    img = cv2.dilate(img, kernel, iterations=1)
    img = cv2.erode(img, kernel, iterations=1)

    # Write image after removed noise
    cv2.imwrite(os.path.join(src_path, "removed_noise.png"), img)

    # Thresholding is currently disabled, so thres.png holds the same pixels
    # as removed_noise.png. Re-enable the line below to binarize the image.
    #img = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 2)
    thres_path = os.path.join(src_path, "thres.png")
    cv2.imwrite(thres_path, img)

    # Recognize text with tesseract; the with-block closes the file handle
    # that Image.open keeps on thres.png.
    with Image.open(thres_path) as thres_img:
        return pytesseract.image_to_string(thres_img)


# A parenthesized single-argument print is valid both as a Python 2 print
# statement and as a Python 3 function call, so the script runs under either.
print('--- Start recognize text from image ---')
print(get_string(src_path + "textArea01.png"))

But I get the following error:

Traceback (most recent call last):
  File "C:/Python27/erw.py", line 40, in <module>
    print get_string(src_path + "textArea01.png")
  File "C:/Python27/erw.py", line 31, in get_string
    result = pytesseract.image_to_string(Image.open(src_path + "thres.png"))
  File "C:\Python27\lib\site-packages\pytesseract\pytesseract.py", line 122, in image_to_string
    config=config)
  File "C:\Python27\lib\site-packages\pytesseract\pytesseract.py", line 46, in run_tesseract
    proc = subprocess.Popen(command, stderr=subprocess.PIPE)
  File "C:\Python27\lib\subprocess.py", line 390, in __init__
    errread, errwrite)
  File "C:\Python27\lib\subprocess.py", line 640, in _execute_child
    startupinfo)

* I have tried to install tesseract-ocr with pip, but the installation fails with:

Command "c:\python27\python.exe -u -c "import setuptools, tokenize;__file__='c:\
\users\\xyz~1\\appdata\\local\\temp\\pip-build-bvk9mm\\tesseract-ocr\\setup.p
y';f=getattr(tokenize, 'open', open)(__file__);code=f.read().replace('\r\n', '\n
');f.close();exec(compile(code, __file__, 'exec'))" install --record c:\users\df
asto~1\appdata\local\temp\pip-vcr3xw-record\install-record.txt --single-version-
externally-managed --compile" failed with error code 1 in c:\users\xyz~1\appd
ata\local\temp\pip-build-bvk9mm\tesseract-ocr\

#

When I try different code:

from PIL import Image
from pytesseract import image_to_string

# Open the image in a with-block so Pillow releases the file handle as soon
# as the OCR call has finished.
with Image.open("c:/Python36/Projekty/textArea01.png") as image:
    im = image_to_string(image)
print(im)

I get the same error as above:

Traceback (most recent call last):
  File "C:\Python36\Projekty\OCR_v1.py", line 6, in <module>
    im = image_to_string(Image.open("c:/Python36/Projekty/textArea01.png"))
  File "C:\Python36\lib\site-packages\pytesseract\pytesseract.py", line 122, in image_to_string
    config=config)
  File "C:\Python36\lib\site-packages\pytesseract\pytesseract.py", line 46, in run_tesseract
    proc = subprocess.Popen(command, stderr=subprocess.PIPE)
  File "C:\Python36\lib\subprocess.py", line 707, in __init__
    restore_signals, start_new_session)
  File "C:\Python36\lib\subprocess.py", line 990, in _execute_child
    startupinfo)
FileNotFoundError: [WinError 2]

My question is: how can I resolve the issues above, and why do they occur in the first place?

score 0 · Answer 1 · answered Jun 23 '17 at 04:11

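pytesseract is only a wrapper that launches the tesseract executable (your traceback fails inside subprocess.Popen with FileNotFoundError), so Tesseract itself must be installed and findable on your PATH. I didn't have any issues installing Tesseract, but I used the installer from Tesseract at UB Mannheim: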

https://github.com/UB-Mannheim/tesseract/wiki

You will also need to install pytesseract:

pip3.6 install pytesseract

I use the following code with no issues:

import numpy as np
from numpy import *
from PIL import Image
from PIL import *
import pytesseract
import cv2


src_path = "C:\\Users\\USERNAME\\Documents\\OCR\\"


def get_string(img_path):
    """Return the text Tesseract recognizes in the image at ``img_path``.

    The image is loaded with OpenCV, converted to grayscale, run through a
    dilate/erode pair, and saved as ``removed_noise.png`` and ``thres.png``
    in ``src_path``. pytesseract then reads ``thres.png``.

    Args:
        img_path: Path of the image file to recognize.

    Returns:
        The string produced by ``pytesseract.image_to_string``.

    Raises:
        IOError: If OpenCV cannot read ``img_path``.
    """
    import os  # local import: only needed for joining output paths

    # Read image with opencv. imread signals failure by returning None rather
    # than raising, so fail here with a clear message instead of in cvtColor.
    img = cv2.imread(img_path)
    if img is None:
        raise IOError("Could not read image: {0}".format(img_path))
    # Convert to gray
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Apply dilation and erosion to remove some noise.
    # NOTE(review): a 1x1 kernel looks like a no-op for dilate/erode; a larger
    # kernel is probably needed for real noise removal -- confirm.
    kernel = np.ones((1, 1), np.uint8)
    img = cv2.dilate(img, kernel, iterations=1)
    img = cv2.erode(img, kernel, iterations=1)
    # Write image after removed noise
    cv2.imwrite(os.path.join(src_path, "removed_noise.png"), img)
    # Thresholding is disabled, so thres.png holds the same pixels as
    # removed_noise.png. Re-enable the line below to binarize the image.
    #img = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 2)
    thres_path = os.path.join(src_path, "thres.png")
    cv2.imwrite(thres_path, img)
    # Recognize text with tesseract; the with-block closes the file handle
    # that Image.open keeps on thres.png.
    with Image.open(thres_path) as thres_img:
        result = pytesseract.image_to_string(thres_img)

    return result

def main():
    """Print the OCR result for ``test.png`` located under ``src_path``."""
    # Output results
    print("OCR Output: ")
    print(get_string(src_path + "test.png"))


# main() was defined but never invoked, so running the file printed nothing.
# Call it only when the file is executed as a script.
if __name__ == "__main__":
    main()

Create an image file with the extension of .png and make sure you modify the code to have the correct path and image name.

To ensure that the OCR works properly, I recommend making the text (or the text in the image) equivalent to a size 16 font. With smaller font sizes, I found the text was read inaccurately or not at all.

OCR - pytesseract issue

1 Answer