This may just be down to PyPDF2's extractText() function, but when I run the code below to rename my PDF files, a lot of the most common words come out as "Nthe", "Nfrom" and "Ncommunications". I'm not sure what I can do to stop this happening, or alternatively how to work around it.
What causes a problem like this?
Where are the N's coming from?
Other PDFs work exactly as I want, so I'm not sure where to go from here.
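To show what the extraction itself returns, here's a minimal check (problem.pdf is just a placeholder for one of the affected files) that prints the raw output of extractText() for the first page, before any regex touches it:

import PyPDF2

pdfFileObj = open('problem.pdf', 'rb')  # placeholder name for one of the affected PDFs
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
print repr(pdfReader.getPage(0).extractText()[:500])  # raw text of the first page
pdfFileObj.close()

The full renaming script is below: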
import PyPDF2
import re
from collections import Counter
import os.path
files = [f for f in os.listdir('.') if os.path.isfile(f)]
files = filter(lambda f: f.endswith(('.pdf','.PDF')), files)
for file in files:
    pdfFileObj = open('{0}'.format(file), 'rb')  # Open the file
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)  # Read the file
    frequency = {}  # Word -> count
    ignore = {'the','a','if','in','it','of','or','and','for','can','that','are','this','your','you','will','not','have','its','with','need','has','from','more'}  # Ignore these common words
    print "Number of Pages %s" % pdfReader.numPages  # Print number of pages
    word_list = []
    for i in range(pdfReader.numPages):
        pageObj = pdfReader.getPage(i)  # Get each page
        word_list.append(pageObj.extractText())  # Collect the page text
    match_pattern = re.findall(r'\b[a-z]{3,15}\b', str(word_list))  # Find lowercase words of 3-15 letters
    cnt = Counter()
    for words in match_pattern:  # Count the word frequencies
        words = words.lower()  # Lower-case the word
        if words not in ignore:  # Skip common words
            count = frequency.get(words, 0)  # Current count (default 0)
            frequency[words] = count + 1  # Add one
    fl = sorted(frequency, key=frequency.__getitem__, reverse=True)[:3]  # Three most frequent words
    pdfFileObj.close()  # Close the PDF
    newtitle = ' '.join(map(str, fl)).title()  # Join the words into a title
    try:
        print newtitle  # Print the new title
        os.rename('{0}'.format(file), '{0}.pdf'.format(newtitle))  # Rename the file
    except OSError:
        print "Unable to Rename File"