How Can I extract the text for each bookmarked section of a pdf book in Python using PyPDF2 and split it into html files based on the list of bookmarks?
I've been working with this code but I can't quite make it work:
import PyPDF2
from bs4 import BeautifulSoup
# Open the PDF file
pdf_file = PyPDF2.PdfReader(open("book.pdf", "rb"))
# Get the bookmarks from the PDF file
bookmarks = pdf_file.outline
# Create a folder to store the HTML files
import os
folder = "html_pages"
if not os.path.exists(folder):
os.makedirs(folder)
# Iterate over the bookmarks
for i, bookmark in enumerate(bookmarks):
# Extract the text from the page associated with the bookmark
page_number = bookmark[0]
page = pdf_file.pages[page_number - 1]
content = page.extract_text()
# Convert the text to HTML
soup = BeautifulSoup(content, "html.parser")
html_content = str(soup)
# Write the HTML content to a file
file_path = os.path.join(folder, f"{i}.html")
with open(file_path, "w") as f:
f.write(html_content)