How to remove an illegal character, (®) or editing text, in a PDF using Python, specifically PyMuPDF?
I've been trying for hours to remove a trademark symbol ® from about a thousand multi-page PDFs so that I can scrape the tables data into a csv. Camelot considers trademark symbols ® an illegal character apparently. So I need to remove them somehow.
I have tried several variations of code from Stackoverflow, Bard, and Chatgpt. But to no avail. I am now questioning if it is even possible to edit the text of a pdf, or with one which can handle a ®, as I have tried PyPDF2, and it too considers the ® an illegal character.
Is this possible? Yes? No? Perhaps?
Here is one of the versions I have been using which is largely a mix of stackoverflow and chatGPT:
import os
import fitz
def replace_text(content, replacements=dict()):
lines = content.splitlines()
result = ""
in_text = False
for line in lines:
if line == "BT":
in_text = True
elif line == "ET":
in_text = False
elif in_text:
cmd = line.strip()
if cmd.lower() in \['tj', 'tj\\n'\]:
replaced_line = line
for k, v in replacements.items():
replaced_line = replaced_line.replace(k, v)
result += replaced_line + "\\n"
else:
result += line + "\\n"
else:
result += line + "\\n"
return result
def remove_illegal_character(input_file, output_folder):
filename_base = os.path.splitext(os.path.basename(input_file))\[0\]
output_file = os.path.join(output_folder, filename_base + ".cleaned.pdf")
replacements = {"\\xae": ""}
doc = fitz.open(input_file)
for page_number in range(len(doc)):
page = doc\[page_number\]
blocks = page.getTextBlocks()
for b in blocks:
if "\\xae" in b\[4\]: # Text contains the illegal character
new_text = b\[4\].replace("\\xae", "")
page.updateText(fitz.Point(b\[0\], b\[1\]), new_text)
doc.save(output_file)
doc.close()
def main():
input_folder = r'C:\\path'
output_folder = r'C:\\path'
\# Iterate over all files in the input folder
for filename in os.listdir(input_folder):
if filename.endswith('.pdf'):
input_file = os.path.join(input_folder, filename)
\# Remove the illegal character from the PDF file
remove_illegal_character(input_file, output_folder)
if \__name_\_ == "\__main_\_":
main()
EDIT: Wow, yeah I am now seeing I may have perhaps jumped the gun a step.
Originally for Camelot I was using variations of:
def decode_name_object(name):
try:
return NameObject(name.decode('utf-8'))
except (UnicodeEncodeError, UnicodeDecodeError) as e:
# Name objects should represent irregular characters
# with a '#' followed by the symbol's hex number
if not pdf.strict:
warnings.warn("Illegal character in Name Object", utils.PdfReadWarning)
return NameObject(name)
else:
# raise utils.PdfReadError("Illegal character in Name Object")
return NameObject(name)
tables = camelot.read_pdf(r'file.pdf')
And I kept getting the same error: Illegal character in Name Object (b'/DocuSign\xae')