I am trying to extract information from a PDF using PDFMiner in a consistent manner so that I can do further analysis, but I can't figure out how to correctly extract tabular data. PDFMiner seems to extract columns before rows. Has anyone solved this problem, or does anyone know a way to extract rows first? I tried extracting to HTML instead, but I ran into the same problem. Any help is greatly appreciated.
Image from the actual PDF:
Image from the extracted version:
The code I used for the extraction is below:
import nltk
import numpy
import pip
import pdfminer
import dateutil
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
    """Return the text extracted from the PDF file at *path* as one string.

    Every page yielded by ``PDFPage.get_pages`` is fed through a
    ``PDFPageInterpreter`` whose output device is a ``TextConverter``
    writing into an in-memory buffer; the buffer's contents are returned.

    Args:
        path: Filesystem path of the PDF to read (opened in binary mode).

    Returns:
        The accumulated text that the converter wrote for all pages.

    Raises:
        IOError/OSError: if *path* cannot be opened.
        Any exception raised by pdfminer while parsing is propagated
        after the buffer and device have been closed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # NOTE(review): an empty page set and maxpages=0 presumably mean
        # "no page filter / no page limit" -- confirm against pdfminer docs.
        pagenos = set()
        maxpages = 0
        password = ""
        caching = True
        # open() instead of the Python-2-only file() builtin; the context
        # manager closes the handle even if a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Read the buffer exactly once, after all pages are processed.
        # getvalue() returns the whole buffer, so appending it after each
        # page would repeat the text of every earlier page.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
# Raw string literal: the Windows path contains backslashes (notably "\U"),
# which must not be interpreted as escape sequences. The resulting path
# value is unchanged.
test1 = convert_pdf_to_txt(r"C:\Users\User\Documents\Contract\Dental\Certificate - Dental - Assurant - 2010.pdf")