I have a few research papers in PDF format, and I want to extract just the introduction/background sections from each paper. Also, I can only use Python. Can someone please help?
Asked
Active
Viewed 1,435 times
1
-
what did you try? – Mahir Islam Aug 12 '18 at 10:43
-
@mahir I tried converting the PDF into text format and then used regex to extract paragraphs, but it isn't very accurate. – Cheryl Aug 12 '18 at 13:08
-
it would be good to share what you have tried, because then the error or bug could be fixed – Mahir Islam Aug 12 '18 at 13:39
-
Please post code of what you have tried so far. – Oct 30 '19 at 16:08
1 Answer
0
I got help, right here, with something similar a couple weeks back. It can be easy, or VERY HARD, to work with PDF files, and there are all different kinds of PDF files. Having said that, you should consider converting all PDF files to text files. Try the code sample below.
First, convert PDFs to text.
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
def convert(fname, pages=None):
    """Extract the text of a PDF file and return it as one string.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to extract. When omitted
            or empty, an empty set is handed to ``PDFPage.get_pages``
            (presumably meaning "all pages" -- confirm against the
            pdfminer documentation).

    Returns:
        The text that the pdfminer ``TextConverter`` wrote for the
        processed pages.
    """
    pagenums = set(pages) if pages else set()

    # StringIO is imported directly (``from io import StringIO``), so it must
    # be referenced without the ``io.`` prefix.  Using it as a context manager
    # guarantees the buffer is closed even when parsing fails.
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # PDFs are binary; the ``with`` block closes the file on any exit.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the enclosing ``with`` closes it.
        return output.getvalue()
def convertMultiple(pdfDir, txtDir):
    """Convert every PDF in ``pdfDir`` to a text file in ``txtDir``.

    Each ``name.pdf`` found directly inside ``pdfDir`` is passed to
    ``convert`` and the returned text is written to ``name.pdf.txt`` in
    ``txtDir``.

    Args:
        pdfDir: Directory to scan for PDFs. An empty string means the
            current working directory.
        txtDir: Directory that receives the generated ``.txt`` files.
    """
    if pdfDir == "":
        pdfDir = os.getcwd()  # no directory given: fall back to the cwd

    for pdf in os.listdir(pdfDir):
        # splitext only reports a real extension, so a file literally named
        # "pdf" is skipped, and lower() lets ".PDF" through as well.
        if os.path.splitext(pdf)[1].lower() != ".pdf":
            continue

        # os.path.join works whether or not the directories end in a separator.
        pdfFilename = os.path.join(pdfDir, pdf)
        text = convert(pdfFilename)

        textFilename = os.path.join(txtDir, pdf + ".txt")
        # ``with`` flushes and closes the output file after each document.
        with open(textFilename, "w") as textFile:
            textFile.write(text)
# Adjust these two folders for your machine: the first holds the source PDFs,
# the second receives the generated text files.
pdfDir = "C:/your_path_here/PDF_in/"
txtDir = "C:/your_path_here/TEXT_out/"
convertMultiple(pdfDir=pdfDir, txtDir=txtDir)
Second, look for all text between a beginning tag ("New York State Real Property Law") and an ending tag ("common elements of the property.").
# Loop through all TEXT files in a folder
# Pull out all text between two anchors: "New York State Real Property Law" & "common elements of the property."
import re
import os
# Non-greedy match of everything from the opening anchor through the closing
# anchor; DOTALL lets ".*?" span line breaks.  The raw string keeps "\." a
# literal-dot regex escape instead of an invalid Python string escape.
myRegex = re.compile(
    r"New York State Real Property Law.*?common elements of the property\.",
    re.DOTALL,
)

# Visit every file under the text folder and print each anchored passage.
for foldername, subfolders, files in os.walk(r"C:/your_path_here/text_files/"):
    for file in files:
        print(file)
        # ``with`` closes the file even if reading fails, and the handle name
        # no longer shadows the ``object`` builtin.
        with open(os.path.join(foldername, file)) as textFile:
            Text = textFile.read()
        for subText in myRegex.findall(Text):
            print(subText)
Perhaps you can do all the work without converting PDFs to text files, but I haven't found any way to do it.

ASH
- 20,759
- 19
- 87
- 200