I need to automate cleaning procedure in text loaded from .pdf.
this is currently what the issue is being about and heres the code i'm using to clean pdf
def clean_text(text):
# Remove additional whitespaces and newlines using regex
cleaned_text = re.sub(r'\s+', ' ', text.strip())
# Remove Unicode characters from the text
pattern = re.compile(r'[^\x00-\x7F]+')
cleaned_text = re.sub(pattern, '', cleaned_text)
# Remove spaces between characters after closing parenthesis, period, or comma
cleaned_text = re.sub(r'((?<=[\)\.,])\s{2})|((?<=\()\s{2})', ' ', cleaned_text)
cleaned_text =
return cleaned_text
# Directory containing the PDF files
pdf_directory = r"C:\Users\MartinJunakovic\Downloads\CVs\CVs"
# Initialize the overall text variable
all_pdf_text = ""
# Iterate over each PDF file in the directory
for file_name in os.listdir(pdf_directory):
if file_name.endswith(".pdf"):
# Get the full file path
file_path = os.path.join(pdf_directory, file_name)
# Load the PDF file
doc = fitz.open(file_path)
# Iterate over each page in the PDF
for page in doc:
# Get the page's text content
text = page.get_text("text")
# Clean the text by removing additional whitespaces and newlines
cleaned_text = clean_text(text)
# Append the preprocessed text to the overall text variable
all_pdf_text += cleaned_text
# Close the PDF file
doc.close()
# Print the combined preprocessed text
print(all_pdf_text)
So for instance one particular .pdf is giving me output like this:
s,(specialising(in( projects(and(programmes(management((MSP(and(PRINCE2).(Demonstrated(ability(to(successfully(deliver( complex(projects(and(drive(sustainable(continuous(improvement(through(change(management.( Led(transformational(change(initiatives,(conducting(business(process(and(efficiency(reviews(in(diverse( institutions,(including(local(government.(Expertise(in(organisational(design,(service(redesign,(and( procurement(strategies(to(secure(contracts(that(aligns(with(objectives.
or there ought to be anomalies like "O B J E C T I V E", "E X P E R I E N C E" , "S U M M A R Y"...
i've trying locating anomalies by whitespaces and building a dictionary for them, unfortunately it didn't work well. tried different re.sub methods.