I am trying to extract grant-payable organisation details from PDFs that have a fixed format but whose page numbers vary. I have spent a lot of time with libraries such as PyPDF2, PyMuPDF, Tabula, spaCy, NLTK, etc., but have still had no luck. It would be a great help if someone could suggest or provide something. I am attaching two sample screenshots and the code that I have tried.
import os
import fitz # PyMuPDF
import re
import spacy
from nltk import sent_tokenize, word_tokenize, pos_tag
# Function to find the content page number and financial report starting page number
def _leading_page_number(text):
    """Return the integer formed by the run of digits at the start of *text*.

    Returns None when *text* is empty or does not begin with a digit.
    """
    match = re.match(r"\d+", text)
    return int(match.group()) if match else None


def _find_report_start_page(content_lines, report_keywords):
    """Search contents-page lines for the financial report's starting page.

    A candidate line contains "notes" plus at least one of *report_keywords*.
    The page number is read from the start of the line that follows the
    candidate, falling back to the line that precedes it.

    Returns the page number as an int, the string "Not Found" when at least
    one candidate line was seen but none had a number next to it, or None
    when no candidate line exists at all.
    """
    result = None
    for index, line in enumerate(content_lines):
        if "notes" not in line:
            continue
        if not any(keyword in line for keyword in report_keywords):
            continue

        # Neighbouring lines in priority order: following line, then preceding.
        neighbours = []
        if index + 1 < len(content_lines):
            neighbours.append(content_lines[index + 1].strip())
        if index > 0:
            neighbours.append(content_lines[index - 1].strip())

        for neighbour in neighbours:
            page = _leading_page_number(neighbour)
            if page is not None:
                return page

        # Remember that a candidate was seen, but keep scanning later lines.
        result = "Not Found"
    return result


def find_starting_pages(pdf_path, content_keywords, report_keywords):
    """Locate the contents page of a PDF and the financial report's start page.

    The first page whose lower-cased text contains any of *content_keywords*
    is treated as the contents page. Its lines are then searched for an entry
    mentioning "notes" together with one of *report_keywords*, and the page
    number is read from the adjacent line.

    Args:
        pdf_path: Path of the PDF file to open.
        content_keywords: Substrings identifying the contents page. They are
            compared against lower-cased page text, so they must be
            lower-case themselves.
        report_keywords: Substrings identifying the financial-report entry on
            the contents page; also expected in lower-case.

    Returns:
        A tuple ``(content_page_number, total_pages, report_start_page,
        content_lines)`` where

        * ``content_page_number`` is the 1-based PDF page of the contents
          page, or None if no page matched;
        * ``total_pages`` is the number of pages in the PDF;
        * ``report_start_page`` is the number parsed from the contents page
          (an int), the string "Not Found" if a matching entry had no number
          beside it, or None if there was no matching entry;
        * ``content_lines`` is the lower-cased contents page split into lines
          (empty if no contents page was found).
    """
    pdf_document = fitz.open(pdf_path)
    # try/finally so the document is released even if text extraction fails.
    try:
        content_page_number = None
        content_lines = []
        for page_num, page in enumerate(pdf_document, start=1):
            page_text = page.get_text("text").lower()
            if any(keyword in page_text for keyword in content_keywords):
                content_page_number = page_num
                content_lines = page_text.split('\n')
                break
        total_pages = pdf_document.page_count
    finally:
        pdf_document.close()

    report_start_page = _find_report_start_page(content_lines, report_keywords)
    return content_page_number, total_pages, report_start_page, content_lines
# Folder scanned for PDF files, and the keyword lists handed to
# find_starting_pages (kept lower-case because it matches against
# lower-cased page text).
pdf_folder = "pdfs"
content_keywords = ["contents", "content"]
report_keywords = ["financial", "accounts", "account"]

# Report on every PDF in the folder. os.listdir returns entries in arbitrary
# order, so sort them to make the output reproducible between runs.
for pdf_file in sorted(os.listdir(pdf_folder)):
    # Compare the extension case-insensitively so "REPORT.PDF" is not skipped.
    if not pdf_file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_folder, pdf_file)
    try:
        content_page_num, total_pages, report_start_page, content_lines = find_starting_pages(
            pdf_path,
            content_keywords,
            report_keywords,
        )
    except (ValueError, IndexError):
        # Parsing failed for this file; report it and move on to the next one.
        print(f"Cannot Copy Text: {pdf_file}")
        continue

    print(f"PDF File: {pdf_file}")
    if content_page_num is not None:
        print(f"Content Page #: {content_page_num}")
    else:
        print("Content Page #: Not Found")
    print(f"Total PDF Pages: {total_pages}")
    if report_start_page is not None:
        print(f"Financial Report Starting Page #: {report_start_page}")
    else:
        print("Financial Report Starting Page #: Not Found")
    print("=" * 50)
List of keywords: if only an organisation name and an amount follow one of these keywords, that marks the beginning of the data to extract.
Donations payable Charitable activities Grants payable Analysis of Grants Grants payable grants made payable Beneficiary Charitable Expenditure Donations to institutions Grant funding Grants made Grants paid Grant-making Grant making Grants making Grants accrued Name of organisation grants made to institutions