I'm currently writing a python script to convert pdfs to audiobooks and im trying to use a border to remove page numbers and other unwanted titles. Here is my current code for this (gTTS will be changed to a better library eventually):
import fitz
from gtts import gTTS
import os
import re
def extract_text_by_area(page, x0, y0, x1, y1):
return page.get_text("text", clip=(x0, y0, x1, y1))
def get_text_with_area_extraction(filepath: str, start_page: int, end_page: int, x0, y0, x1, y1) -> str:
with fitz.open(filepath) as doc:
extracted_text = ""
for page_num in range(start_page, end_page + 1):
page = doc[page_num]
extracted_text += extract_text_by_area(page, x0, y0, x1, y1).replace('\n', ' ').strip()
standardized_text = re.sub(r'\s+', ' ', extracted_text)
return standardized_text
def get_pdf_dimensions(pdf_path, page_num):
doc = fitz.open(pdf_path)
page = doc[page_num]
page_width = page.rect.width
page_height = page.rect.height
doc.close()
return page_width, page_height
def calculate_text_area(page_width, page_height, percentage):
border_x = page_width * percentage
border_y = page_height * percentage
x0 = border_x
y0 = page_height - border_y
x1 = page_width - border_x
y1 = border_y
return x0, y0, x1, y1
filepath = 'Fooled-by-Randomness-Role-of-Chance-in-Markets-and-Life-PROPER1.pdf'
start_page = int(input("Enter the starting page: ")) - 1
end_page = int(input("Enter the ending page: ")) - 1
page_num = start_page
page_width, page_height = get_pdf_dimensions(filepath, page_num)
percentage = float(input("Enter the percentage of the border to remove (e.g., 0.1 for 10%): "))
x0, y0, x1, y1 = calculate_text_area(page_width, page_height, percentage)
extracted_text = get_text_with_area_extraction(filepath, start_page, end_page, x0, y0, x1, y1)
print(extracted_text)
tts = gTTS(text=extracted_text, lang='en')
output_audio_path = 'extracted_text_audio.mp3'
tts.save(output_audio_path)
print(f"Text-to-speech audio saved as: {output_audio_path}")
Python 3.10.4. I was able to extract the text using get_text with no arguments but am struggling here. The inputs I was using are 27, 40, 0.05. The text returned from the get_text call is empty when the clip is passed. Any other suggestions are greatly welcomed as well :)