I am struggling to remove text from a pdf file. I know this can be performed manually with PDF editors but I have a few PDF files to modify. The code I have so far is able to recognise all the text in a pdf file but dpes not remove th text when it is re-written as the output file.
here is the script I tried EDIT: I do not want to redact a pdf I want to remove the text from the PDF
import PyPDF2
# Open the PDF file in read-binary mode
with open('C:/inputput.pdf', 'rb') as pdf_file:
# Create a PdfFileReader object to read the PDF file
pdf_reader = PyPDF2.PdfFileReader(pdf_file)
# Get the first page of the PDF file
page = pdf_reader.getPage(0)
# Get the page's content as a string
page_content = page.extractText()
# Replace the text to be removed with an empty string
modified_content = page_content.replace('words to replace', '')
# If the entire text box was removed, remove the text box itself
if not modified_content.strip():
page.getContents().getObject().update({PyPDF2.utils.b_("Filter"): PyPDF2.utils.b_("FlateDecode"), PyPDF2.utils.b_("Length"): 0})
# Replace the page's content with the modified content
# Create a PdfFileWriter object to write the modified PDF to a new file
pdf_writer = PyPDF2.PdfFileWriter()
pdf_writer.addPage(page)
# Save the modified PDF to a new file
with open('output_file22.pdf', 'wb') as output_file:
pdf_writer.write(output_file)