I am trying to code code that downloads a PDF from a URL. I found a method of doing this, but it was not written in Python 3 and used the file()
function.
I tried replacing this with open()
in the line fp = open(path, 'rb')
.
However I get this error:
TypeError: expected str, bytes or os.PathLike object, not HTTPResponse.
I cant find a solution online. Any help would be appreciated. Here is the code:
import bs4 as bs
import urllib.request
from urllib.request import urlopen
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
from io import StringIO
from io import open
def convert_pdf_to_txt(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = open(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
fp.close()
device.close()
stri = retstr.getvalue()
retstr.close()
return stri
pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf");
outputString = convert_pdf_to_txt(pdfFile)
print(outputString)
pdfFile.close()
Resources used
http://zempirians.com/ebooks/Ryan%20Mitchell-Web%20Scraping%20with%20Python_%20Collecting%20Data%20from%20the%20Modern%20Web-O'Reilly%20Media%20(2015).pdf (page 101)
Extracting text from a PDF file using PDFMiner in python? (the top answer)