0

I am developing a program that will add annotations over specific words that are found in a document. The annotations appear to be the right size, the X coordinates are pretty close (not perfect) but the Y coordinate is shifted considerably. My assumption here is that the coordinate systems are different but it's not clear how to adjust the Y coordinates to place it correctly.

Here is the code, the most important method is placeAnnotations()

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

public class Sample {

    private static final float FROM_72_TO_300_DPI = 1;

    public static void main(String[] args) throws FileNotFoundException, IOException {
        PDDocument doc = PDDocument.load(new File("test.pdf"));

        try {
            //insert new page
            PDPage page = (PDPage) doc.getDocumentCatalog().getPages().get(0);

            List<WordBBox> words = getWords(doc, 0);

            placeAnnotations(words, page);

            doc.save("test-out.pdf");

        } finally {
            doc.close();
        }
    }


    public static List<WordBBox> getWords(PDDocument document, int page) throws IOException {

        CustomPDFTextStripper customPDFTextStripper = new CustomPDFTextStripper();
        customPDFTextStripper.setSortByPosition(true);
        customPDFTextStripper.setStartPage(page);
        customPDFTextStripper.setEndPage(page + 1);

        Writer writer = new OutputStreamWriter(new ByteArrayOutputStream());
        customPDFTextStripper.writeText(document, writer);

        List<WordBBox> words = customPDFTextStripper.getWords();

        return words;
    }

    private static void placeAnnotations(List<WordBBox> words, PDPage page) throws IOException {
        
        List<PDAnnotation> annotations = page.getAnnotations();
        for (WordBBox word : words) {


            if (word.word.toLowerCase().equals("simple") || word.word.toLowerCase().equals("pdf")) {
                //generate instanse for annotation
                PDAnnotationTextMarkup txtMark = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);

                //set the rectangle
                PDRectangle position = new PDRectangle();
                position.setLowerLeftX(word.x);
                position.setLowerLeftY(word.y + word.height);
                position.setUpperRightX(word.x + word.width);
                position.setUpperRightY(word.y);
                txtMark.setRectangle(position);


                //set the quadpoint
                float[] quads = new float[8];
                //x1,y1
                quads[0] = position.getLowerLeftX();
                quads[1] = position.getUpperRightY();
                //x2,y2
                quads[2] = position.getUpperRightX();
                quads[3] = quads[1];
                //x3,y3
                quads[4] = quads[0];
                quads[5] = position.getLowerLeftY();
                //x4,y4
                quads[6] = quads[2];
                quads[7] = quads[5];

                txtMark.setRectangle(position);
                txtMark.setQuadPoints(quads);
                txtMark.setAnnotationName("My annotation - " + word.word);
                txtMark.setTitlePopup("For '" + word.word + "'");
                txtMark.setContents("Highlighted since it's important for word '" + word.word + "'");
                txtMark.setRichContents("Here is some rich content for the word '" + word.word + "'");
                PDColor blue = new PDColor(new float[]{0, 0, 1}, PDDeviceRGB.INSTANCE);
                txtMark.setColor(blue);
                txtMark.setInvisible(false);
                txtMark.setNoView(false);
                txtMark.setNoZoom(false);
                txtMark.setLocked(false);
                txtMark.setHidden(false);
                txtMark.constructAppearances();

                annotations.add(txtMark);

            }

        }

        page.setAnnotations(annotations);

    }

    private static class CustomPDFTextStripper extends PDFTextStripper {

        private final List<WordBBox> words;

        public CustomPDFTextStripper() throws IOException {
            this.words = new ArrayList<>();
        }

        public List<WordBBox> getWords() {
            return new ArrayList<>(words);
        }

        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException {

            String wordSeparator = getWordSeparator();
            List<TextPosition> wordTextPositions = new ArrayList<>();

            for (TextPosition textPosition : textPositions) {
                String str = textPosition.getUnicode();
                if (wordSeparator.equals(str)) {
                    if (!wordTextPositions.isEmpty()) {
                        this.words.add(createWord(wordTextPositions));
                        wordTextPositions.clear();
                    }
                } else {
                    wordTextPositions.add(textPosition);
                }
            }

            super.writeString(text, textPositions);
        }

        private WordBBox createWord(List<TextPosition> wordTextPositions) {

            String word = wordTextPositions.stream()
                    .map(TextPosition::getUnicode)
                    .collect(Collectors.joining());

            int minX = Integer.MAX_VALUE;
            int minY = Integer.MAX_VALUE;
            int maxX = Integer.MIN_VALUE;
            int maxY = Integer.MIN_VALUE;

            for (TextPosition wordTextPosition : wordTextPositions) {

                minX = Math.min(minX, from72To300Dpi(wordTextPosition.getXDirAdj()));
                minY = Math.min(minY, from72To300Dpi(wordTextPosition.getYDirAdj() - wordTextPosition.getHeightDir()));
                maxX = Math.max(maxX, from72To300Dpi(wordTextPosition.getXDirAdj() + wordTextPosition.getWidthDirAdj()));
                maxY = Math.max(maxY, from72To300Dpi(wordTextPosition.getYDirAdj()));
            }

            return new WordBBox(word, minX, minY, maxX - minX, maxY - minY);
        }
    }

    private static int from72To300Dpi(float f) {
        return Math.round(f * FROM_72_TO_300_DPI);
    }

    private static class WordBBox {
        private String word = null;
        private int x = 0;
        private int y = 0;
        private int width = 0;
        private int height = 0;

        public WordBBox(String word, int x, int y, int width, int height) {
            this.word = word;
            this.x = x;
            this.y = y;
            this.width = width;
            this.height = height;
        }

    }
}

The PDF I am using to test is here: http://www.africau.edu/images/default/sample.pdf

As you can see in this screenshot the annotations are at the bottom of the page, when they should be placed over the words 'Simple' and 'PDF'

enter image description here

Casey Jordan
  • 1,204
  • 1
  • 20
  • 39
  • Instead of `getXDirAdj()` and `getYDirAdj()` use `getTextMatrix().getTranslateX()` and `getTextMatrix().getTranslateY()`, and even then there sometimes is a need for correction, read [this Q&A](https://stackoverflow.com/q/46080131/1729265). – mkl Feb 15 '21 at 18:41
  • Cool thank you that looks promising – Casey Jordan Feb 19 '21 at 23:41

0 Answers0