I'm trying to convert a PDF file into HTML format using HTML Converter. Provided below is the code that I'm using.
from django.conf import settings
settings.configure(PDF_MINER_IS_STRICT = True)
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import re
import csv
def convert_pdf_to_html(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = HTMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = file(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0 #is for all
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
fp.close()
device.close()
str = retstr.getvalue().encode('utf-8')
retstr.close()
return str.encode('utf-8')
On running though, I get the following error:
Traceback (most recent call last):
File "convertPDFToHTML.py", line 32, in <module>
print convert_pdf_to_html(path)
File "convertPDFToHTML.py", line 18, in convert_pdf_to_html
device = HTMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
File "C:\Python27\lib\site-packages\pdfminer\converter.py", line 247, in __init__
PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
File "C:\Python27\lib\site-packages\pdfminer\converter.py", line 167, in __init__
self.outfp.write(u"é")
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 0: ordinal not in range(128)
I'm not sure how to make the 'converter.py' of HTMLConverter to follow unicode encoding.
Please help!