We are trying to extract invoice data (PDF/image) using deep learning libraries, e.g. OpenCV or any other suitable one.
We receive multiple invoices in the form of PDFs or images on a daily basis, from which we have to capture certain fields such as Bill No, Vendor Name, Date of Billing, Total Amount Due, Taxes applicable, etc. Hence we are trying to create a deep learning model which can accurately extract these values from the documents.
We have tried PyTesseract, PyPDF2 and PDFMiner, but we are not getting the exact output in the form of JSON from them.
INPUT: It can be any invoice document, as we have to work with random invoices (PDF/image).
Below is what we are trying to integrate, but we don't have an idea of how a deep learning model can be integrated with it.
from pdf2image import convert_from_path
# Convert every page of the PDF into its own JPEG file (Page_1.jpg, Page_2.jpg, ...).
pdfs = r"provide path to pdf file"

# The second positional argument of convert_from_path is the render DPI.
pages = convert_from_path(pdfs, 350)

# enumerate(start=1) keeps the file names 1-based without a hand-maintained counter.
for page_number, page in enumerate(pages, start=1):
    image_name = f"Page_{page_number}.jpg"
    page.save(image_name, "JPEG")
Code for marking the regions of interest (ROI) in the image:
import cv2
from PIL import Image
def mark_region(imagE_path):
    """Outline candidate text regions on a page image and collect their boxes.

    The image is converted to grayscale, blurred, adaptively thresholded
    (inverted) and dilated so that adjacent text merges into larger blobs.
    The external contours of those blobs are then filtered by position and
    area; every contour that passes gets a magenta rectangle drawn on the
    image and its box recorded.

    Args:
        imagE_path: Path of the image file passed to ``cv2.imread``. The
            unusual capitalisation is the original parameter name and is
            kept so that existing callers are not affected.

    Returns:
        A tuple ``(image, line_items_coordinates)`` where ``image`` is the
        loaded image with the rectangles drawn on it, and
        ``line_items_coordinates`` is a list of ``[(x, y), (2200, y + h)]``
        corner pairs, one entry per rectangle drawn (empty when no contour
        qualifies).
    """
    # The body used to read an undefined name ``image_path`` (NameError);
    # it now reads the actual parameter.
    im = cv2.imread(imagE_path)

    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.adaptiveThreshold(
        blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 30
    )

    # Dilate to combine adjacent text contours
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
    dilate = cv2.dilate(thresh, kernel, iterations=4)

    # Find contours, highlight text areas, and extract ROIs.
    # findContours returns either a 2-tuple or a 3-tuple depending on the
    # OpenCV version; the contour list is picked out of whichever came back.
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    line_items_coordinates = []
    for c in cnts:
        area = cv2.contourArea(c)
        x, y, _w, h = cv2.boundingRect(c)

        # NOTE(review): the pixel thresholds below (600, 1000, 2400, 2000,
        # 10000) and the fixed right edge 2200 are hard-coded; presumably
        # they were tuned for one page size/resolution - confirm.
        if y >= 600 and x <= 1000:
            if area > 10000:
                # cv2.rectangle draws directly on ``im``.
                cv2.rectangle(im, (x, y), (2200, y + h), color=(255, 0, 255), thickness=3)
                line_items_coordinates.append([(x, y), (2200, y + h)])

            # NOTE(review): the pasted source lost its indentation; this check
            # is assumed to be nested inside the position check above - confirm.
            if y >= 2400 and x <= 2000:
                cv2.rectangle(im, (x, y), (2200, y + h), color=(255, 0, 255), thickness=3)
                line_items_coordinates.append([(x, y), (2200, y + h)])

    # Return ``im`` itself: the previous ``image`` variable was only bound
    # when at least one rectangle was drawn, so a page with no qualifying
    # contour raised UnboundLocalError here.
    return im, line_items_coordinates
Crop an image and perform OCR
import pytesseract
# Crop one detected region out of the page image and run Tesseract OCR on it.

# ``plt`` was used below without ever being imported.
import matplotlib.pyplot as plt

# Point pytesseract at the local Tesseract executable.
pytesseract.pytesseract.tesseract_cmd = r'C:\Users\NK\AppData\Local\Tesseract-OCR\tesseract.exe'

# ``line_items_coordinates`` only exists as a local inside mark_region(), so it
# has to be obtained by calling that function before it can be indexed.
# NOTE(review): assumes the regions are detected on the same file that is
# cropped below - confirm.
_, line_items_coordinates = mark_region('Original_Image.jpg')

# load the original image (a clean copy, without the rectangles drawn on it)
image = cv2.imread('Original_Image.jpg')

# get co-ordinates to crop the image; index 1 selects the second recorded box
(x0, y0), (x1, y1) = line_items_coordinates[1]

# cropping image img = image[y0:y1, x0:x1]
img = image[y0:y1, x0:x1]

plt.figure(figsize=(10, 10))
plt.imshow(img)

# convert the image to black and white for better OCR
ret, thresh1 = cv2.threshold(img, 120, 255, cv2.THRESH_BINARY)

# pytesseract image to string to get results
# (image_to_string already returns a str, so no extra str() wrapper is needed)
text = pytesseract.image_to_string(thresh1, config='--psm 6')
print(text)
But how can we capture the specific field values from the PDF?
Output:
For each invoice like the one above, we need this output for each document. An example would be something like this:
{
"Bill No":"INXXXXXXXX",
"Vendor Name":"260",
"Taxes applicable":"29.XX",
"Date of Billing":"3-12-1995",
"Total Amount Due":"258.93"
}
We need suggestions on this topic.