1

I used the code below to convert PDF data to XML data and write the conversion to a XML file. It is quite well known (it uses the PDFminer module) and works very well for PDF to text and HTML conversions but I have a problem when I do PDF to XML conversion. I'm quite a novice and some help from you would be super nice :)

Voici le code :

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, XMLConverter, HTMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO

def convert_pdf(path, format='text', codec='utf-8', password=''):
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    laparams = LAParams()
    if format == 'text':
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    elif format == 'html':
        device = HTMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    elif format == 'xml':
        device = XMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    else:
        raise ValueError('provide format, either text, html or xml!')
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    maxpages = 0
    caching = True
    pagenos=set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue().decode()
    fp.close()
    device.close()
    retstr.close()
    return text


path_pdf = ...
path_xml = ...
open(path_xml, "w").close()
text_output = convert_pdf(path_pdf)
open(path_xml, "a", encoding="utf-8").write(text_output)

And here's the error I get: My error

Thank you in advance!

Hasitha Jayawardana
  • 2,326
  • 4
  • 18
  • 36
timoutcha
  • 11
  • 1
  • 2

1 Answers1

0

If you are using python2.7, the following works

from io import BytesIO
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage


def get_xml_py2(file_path):
    in_fp = BytesIO()
    with open(file_path, 'rb') as x:
        in_fp.write(x.read())

    laparams = LAParams(all_texts=True)
    rsrcmgr = PDFResourceManager()
    for page in PDFPage.get_pages(in_fp):
        outfp = BytesIO()
        device = XMLConverter(rsrcmgr, outfp, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        interpreter.process_page(page)
        yield outfp.getvalue()
        device.close()
        outfp.close()

    in_fp.close()