I am trying to extract annotations from a PDF and then use that data to 'Cherry Pick' the annotations we require to display them in a clean version of the PDF using the Adobe Embed API. We are getting data fine from the PDF using PyMuPDF however when we apply the annotation to the clean PDF we are getting weird results.
First parse the Annotated PDF - Sandborn2003Annotated.pdf
Then using the Adobe Document Cloud Embed API - https://developer.adobe.com/document-services/docs/overview/pdf-embed-api/ apply the first Annotation in the stored json to a clean version of the file Sandborn2003.pdf
*** Edit *** This is the outputted file Sandborn 2003 (1).pdf
I am expecting to see the clean PDF take on the same annotation as the source
Screenshots of Before (Source) and After (Target)
Source Annotated Document
Target PDF after applying the annotation
My Environment
- macOs Ventura 13.2.1
- Python 3.10.9
- PyMuPDF version - 1.21.1
This my code
import fitz
import json
import sys
from datetime import datetime
if len(sys.argv) != 2:
print('Usage: python extractPDFAnnotations.py <filename>')
sys.exit(1)
filename = sys.argv[1]
doc = fitz.open('Sandborn2003Annotated.pdf')
annotations = []
for page_num, page in enumerate(doc):
for annot in page.annots():
annotation_data = {}
target_data = {}
selector_data = {}
# Common properties for all annotation types
annotation_data["@context"] = [
"https://www.w3.org/ns/anno.jsonld",
"https://comments.acrobat.com/ns/anno.jsonld",
]
annotation_data["id"] = annot.info.get("id", "")
annotation_data["type"] = "Annotation"
annotation_data["motivation"] = "commenting"
annotation_data["bodyValue"] = annot.info.get("content", "")
# Target properties
# Replace this with the actual source identifier
target_data["selector"] = selector_data
annotation_data["target"] = target_data
# Selector properties
selector_data["node"] = {"index": page_num}
selector_data["type"] = "AdobeAnnoSelector"
if annot.type[0] == 8: # Highlight annotation
all_coordinates = annot.vertices
highlights = []
for i in range(0, len(all_coordinates), 4):
quad = all_coordinates[i:i + 4]
highlight_coord = fitz.Quad(quad).rect
highlights.extend(
[highlight_coord.x0, highlight_coord.y0, highlight_coord.x1, highlight_coord.y1])
selector_data["quadPoints"] = highlights
# Adjust the opacity value as needed
selector_data["opacity"] = 0.4
selector_data["subtype"] = "highlight"
selector_data["boundingBox"] = [
annot.rect.x0,
annot.rect.y0,
annot.rect.x1,
annot.rect.y1,
]
# Adjust the stroke color as needed
selector_data["strokeColor"] = "#fccb00"
# Adjust the stroke width as needed
selector_data["strokeWidth"] = 3
# Creator properties
annotation_data["creator"] = {
# Replace this with the actual creator's name
"name": annot.info.get("title", ""),
"type": "Person",
}
annotations.append(annotation_data)
# Save the annotations to a JSON file
with open(filename, 'w') as f:
json.dump(annotations, f, indent=4)
print(f'Annotations saved to {filename}')