I am developing a program that will add annotations over specific words that are found in a document. The annotations appear to be the right size, the X coordinates are pretty close (not perfect) but the Y coordinate is shifted considerably. My assumption here is that the coordinate systems are different but it's not clear how to adjust the Y coordinates to place it correctly.
Here is the code, the most important method is placeAnnotations()
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
public class Sample {
private static final float FROM_72_TO_300_DPI = 1;
public static void main(String[] args) throws FileNotFoundException, IOException {
PDDocument doc = PDDocument.load(new File("test.pdf"));
try {
//insert new page
PDPage page = (PDPage) doc.getDocumentCatalog().getPages().get(0);
List<WordBBox> words = getWords(doc, 0);
placeAnnotations(words, page);
doc.save("test-out.pdf");
} finally {
doc.close();
}
}
public static List<WordBBox> getWords(PDDocument document, int page) throws IOException {
CustomPDFTextStripper customPDFTextStripper = new CustomPDFTextStripper();
customPDFTextStripper.setSortByPosition(true);
customPDFTextStripper.setStartPage(page);
customPDFTextStripper.setEndPage(page + 1);
Writer writer = new OutputStreamWriter(new ByteArrayOutputStream());
customPDFTextStripper.writeText(document, writer);
List<WordBBox> words = customPDFTextStripper.getWords();
return words;
}
private static void placeAnnotations(List<WordBBox> words, PDPage page) throws IOException {
List<PDAnnotation> annotations = page.getAnnotations();
for (WordBBox word : words) {
if (word.word.toLowerCase().equals("simple") || word.word.toLowerCase().equals("pdf")) {
//generate instanse for annotation
PDAnnotationTextMarkup txtMark = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
//set the rectangle
PDRectangle position = new PDRectangle();
position.setLowerLeftX(word.x);
position.setLowerLeftY(word.y + word.height);
position.setUpperRightX(word.x + word.width);
position.setUpperRightY(word.y);
txtMark.setRectangle(position);
//set the quadpoint
float[] quads = new float[8];
//x1,y1
quads[0] = position.getLowerLeftX();
quads[1] = position.getUpperRightY();
//x2,y2
quads[2] = position.getUpperRightX();
quads[3] = quads[1];
//x3,y3
quads[4] = quads[0];
quads[5] = position.getLowerLeftY();
//x4,y4
quads[6] = quads[2];
quads[7] = quads[5];
txtMark.setRectangle(position);
txtMark.setQuadPoints(quads);
txtMark.setAnnotationName("My annotation - " + word.word);
txtMark.setTitlePopup("For '" + word.word + "'");
txtMark.setContents("Highlighted since it's important for word '" + word.word + "'");
txtMark.setRichContents("Here is some rich content for the word '" + word.word + "'");
PDColor blue = new PDColor(new float[]{0, 0, 1}, PDDeviceRGB.INSTANCE);
txtMark.setColor(blue);
txtMark.setInvisible(false);
txtMark.setNoView(false);
txtMark.setNoZoom(false);
txtMark.setLocked(false);
txtMark.setHidden(false);
txtMark.constructAppearances();
annotations.add(txtMark);
}
}
page.setAnnotations(annotations);
}
private static class CustomPDFTextStripper extends PDFTextStripper {
private final List<WordBBox> words;
public CustomPDFTextStripper() throws IOException {
this.words = new ArrayList<>();
}
public List<WordBBox> getWords() {
return new ArrayList<>(words);
}
@Override
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
String wordSeparator = getWordSeparator();
List<TextPosition> wordTextPositions = new ArrayList<>();
for (TextPosition textPosition : textPositions) {
String str = textPosition.getUnicode();
if (wordSeparator.equals(str)) {
if (!wordTextPositions.isEmpty()) {
this.words.add(createWord(wordTextPositions));
wordTextPositions.clear();
}
} else {
wordTextPositions.add(textPosition);
}
}
super.writeString(text, textPositions);
}
private WordBBox createWord(List<TextPosition> wordTextPositions) {
String word = wordTextPositions.stream()
.map(TextPosition::getUnicode)
.collect(Collectors.joining());
int minX = Integer.MAX_VALUE;
int minY = Integer.MAX_VALUE;
int maxX = Integer.MIN_VALUE;
int maxY = Integer.MIN_VALUE;
for (TextPosition wordTextPosition : wordTextPositions) {
minX = Math.min(minX, from72To300Dpi(wordTextPosition.getXDirAdj()));
minY = Math.min(minY, from72To300Dpi(wordTextPosition.getYDirAdj() - wordTextPosition.getHeightDir()));
maxX = Math.max(maxX, from72To300Dpi(wordTextPosition.getXDirAdj() + wordTextPosition.getWidthDirAdj()));
maxY = Math.max(maxY, from72To300Dpi(wordTextPosition.getYDirAdj()));
}
return new WordBBox(word, minX, minY, maxX - minX, maxY - minY);
}
}
private static int from72To300Dpi(float f) {
return Math.round(f * FROM_72_TO_300_DPI);
}
private static class WordBBox {
private String word = null;
private int x = 0;
private int y = 0;
private int width = 0;
private int height = 0;
public WordBBox(String word, int x, int y, int width, int height) {
this.word = word;
this.x = x;
this.y = y;
this.width = width;
this.height = height;
}
}
}
The PDF I am using to test is here: http://www.africau.edu/images/default/sample.pdf
As you can see in this screenshot the annotations are at the bottom of the page, when they should be placed over the words 'Simple' and 'PDF'