I am looking to extract text from a license plate. So far I have been using pytesseract with OpenCV to zero in on the relevant contours and pull out the text. This works decently for non-American plates, but I am curious about applying it to American plates, which have a lot of small lettering surrounding the large plate-ID characters. My idea is to use font size to filter out letters below a certain threshold. Is that the best approach?
Below is my code so far:
import cv2
import pytesseract
import imutils
def read_plate_text(image_path='plateTest2.jpeg',
                    cropped_path='/cropped_plateTest2.jpg'):
    """Locate a four-cornered contour in an image and OCR the cropped region.

    The image is converted to grayscale, smoothed, edge-detected, and its
    contours are ranked by area. The first of the 30 largest contours that
    approximates to a quadrilateral is treated as the plate: its bounding
    box is cropped from the grayscale image, written to ``cropped_path``,
    and that file is passed to Tesseract.

    Args:
        image_path: Path of the image to read.
        cropped_path: Path the cropped grayscale region is written to
            before OCR.

    Returns:
        The recognised text with every '.' and ' ' removed, or an empty
        string when no four-cornered contour is found.

    Raises:
        OSError: If the input image cannot be read or the cropped image
            cannot be written.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals a missing/undecodable file by returning None.
        raise OSError('could not read image: {!r}'.format(image_path))

    # BGR to grayscale conversion
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Noise removal
    gray = cv2.bilateralFilter(gray, 11, 17, 17)
    # Find edges of the grayscale image
    edged = cv2.Canny(gray, 170, 200)

    # findContours returns a 3-tuple on OpenCV 3.x and a 2-tuple on 4.x;
    # grab_contours picks the contour list out of either shape.
    found = cv2.findContours(edged.copy(), cv2.RETR_LIST,
                             cv2.CHAIN_APPROX_SIMPLE)
    cnts = imutils.grab_contours(found)

    # Keep only the 30 largest contours (by area), largest first.
    cnts = sorted(cnts, key=cv2.contourArea, reverse=True)[:30]

    plate_crop = None
    for c in cnts:
        peri = cv2.arcLength(c, True)
        approx = cv2.approxPolyDP(c, 0.02 * peri, True)
        if len(approx) == 4:  # Select the first contour with 4 corners
            x, y, w, h = cv2.boundingRect(c)
            plate_crop = gray[y:y + h, x:x + w]
            break

    if plate_crop is None:
        # No quadrilateral among the candidates: nothing to OCR.
        return ''

    # imwrite reports failure through its return value, not an exception.
    if not cv2.imwrite(cropped_path, plate_crop):
        raise OSError('could not write cropped image: {!r}'.format(cropped_path))

    # Use tesseract to convert the cropped image into a string
    text = pytesseract.image_to_string(cropped_path, lang='eng')
    text = text.replace('.', '')
    text = text.replace(' ', '')
    return text


if __name__ == '__main__':
    print(read_plate_text())
Here is a picture of a plate that returns too much text where things like 'SUNSHINESTATE' show up:
Should I rely on pytesseract to identify font size and filter out the smaller characters? Or should I filter beforehand, using contour size, before running OCR? I appreciate the help.