so i need to build a model that extract data from pdf files(Resumes) (OCR) so i collected bunch of pdf files and i need to convert them to suitable form for OCR and i am lost. i tried converting them to a csv file with Regex but no use the csv file is empty if there's anything you guys can help me with that would be great. Thank you in advance! this is my code :
import pdfplumber
import os
import re
import pandas as pd
def clean_text(text):
cleaned_text = re.sub(r"[^a-zA-Z0-9\s@.-]", "", text) # Remove special characters except alphanumeric, @, and -
cleaned_text = re.sub(r"\s+", " ", cleaned_text) # Remove extra spaces
return cleaned_text.strip()
def extract_data_from_resume(pdf_path):
with pdfplumber.open(pdf_path) as pdf:
text = ""
for page in pdf.pages:
text += page.extract_text()
cleaned_text = clean_text(text)
name_pattern = r"Name:\s*(.*)"
name_match = re.search(name_pattern, cleaned_text, re.IGNORECASE)
name = name_match.group(1) if name_match else ""
email_pattern = r"Email:\s*([^\s@]+@[^\s@]+\.[^\s@]+)"
email_match = re.search(email_pattern, cleaned_text, re.IGNORECASE)
email = email_match.group(1) if email_match else ""
phone_pattern = r"Phone:\s*(.*)"
phone_match = re.search(phone_pattern, cleaned_text, re.IGNORECASE)
phone = phone_match.group(1) if phone_match else ""
return name, email, phone
def convert_pdf_to_csv(directory, output_csv_path):
data = []
for filename in os.listdir(directory):
if filename.endswith(".pdf"):
pdf_path = os.path.join(directory, filename)
name, email, phone = extract_data_from_resume(pdf_path)
data.append({"Name": name, "Email": email, "Phone": phone})
df = pd.DataFrame(data)
df.to_csv(output_csv_path, index=False)
# Example usage
directory = 'D:/data/test'
output_csv_path = 'D:/data/test/test.csv'
convert_pdf_to_csv(directory, output_csv_path)