I am trying to replace some text in python and have the following code which is updating the content of the PDF correctly till the time it is in memory, but overwrites it with original content on writing to a file :
def replace_text(content, replacements = dict()):
content=content.replace("<NAME>","Test")
return content
def process_data(content,text, replacements):
data = content.get_data()
decoded_data = data.decode('utf-8')
replaced_data = replace_text(text, replacements)
encoded_data = replaced_data.encode('utf-8')
# print(encoded_data)
if content.decoded_self is not None:
content.decoded_self.set_data(encoded_data)
else:
content.set_data(replaced_data)
def get_pdf_encoding(pdf_reader):
for page_num in range(len(pdf_reader.pages)):
page = pdf_reader.pages[page_num]
font_objects = page['/Resources']['/Font']
for font_key in font_objects.keys():
font_obj = font_objects[font_key]
encoding = font_obj.get('/Encoding')
if encoding:
return encoding
return None
if __name__ == "__main__":
in_file ="Certificate.pdf"
filename_base = in_file.replace(os.path.splitext(in_file)[1], "")
# Provide replacements list that you need here
replacements = {
"<NAME>": "John Doe",
# Add other placeholders and their replacements here
}
pdf = PdfReader(in_file)
encoding=get_pdf_encoding(pdf)
print(encoding)
for page_number in range(0, len(pdf.pages)):
page = pdf.pages[page_number]
text = page.extract_text()
contents=page.get_contents()
# process_data(contents,text, replacements,encoding)
if isinstance(contents, DecodedStreamObject) or isinstance(contents, EncodedStreamObject):
process_data(contents,text, replacements)
elif len(contents) > 0:
for obj in contents:
if isinstance(obj, DecodedStreamObject) or isinstance(obj, EncodedStreamObject):
streamObj = obj.getObject()
process_data(streamObj,text, replacements)
page[NameObject("/Contents")] = contents
print(page.get_contents().get_data())
writer = PdfWriter()
writer.add_page(page)
with open("updatedcertificate5.pdf", 'wb') as out_file:
writer.write(out_file)
When I print the data in page at the end of this code, I am getting the updated value, however, after writing the data to PDF, it reverts back to the original pdf content. Can someone highlight what might be the issue or what I can do differently to get this working? Here is a screenshot of my file directory: