You can create CustomPDFTextStripper
which extends PDFTextStripper
and override protected void writeString(String text, List<TextPosition> textPositions)
. In this overriden method you need to split textPositions
by the word separator to get List<TextPosition>
for each word. After that you can join each character and compute bounding box.

Full example below which contains also drawing of the resulting bounding boxes.
package com.example;
import lombok.Value;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.junit.Ignore;
import org.junit.Test;
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
public class PdfBoxTest {
private static final String BASE_DIR_PATH = "C:\\Users\\Milan\\50330484";
private static final String INPUT_FILE_PATH = "input.pdf";
private static final String OUTPUT_IMAGE_PATH = "output.jpg";
private static final String OUTPUT_BBOX_IMAGE_PATH = "output-bbox.jpg";
private static final float FROM_72_TO_300_DPI = 300.0f / 72.0f;
@Test
public void run() throws Exception {
pdfToImage();
drawBoundingBoxes();
}
@Ignore
@Test
public void pdfToImage() throws IOException {
PDDocument document = PDDocument.load(new File(BASE_DIR_PATH, INPUT_FILE_PATH));
PDFRenderer renderer = new PDFRenderer(document);
BufferedImage image = renderer.renderImageWithDPI(0, 300);
ImageIO.write(image, "JPEG", new File(BASE_DIR_PATH, OUTPUT_IMAGE_PATH));
}
@Ignore
@Test
public void drawBoundingBoxes() throws IOException {
PDDocument document = PDDocument.load(new File(BASE_DIR_PATH, INPUT_FILE_PATH));
List<WordWithBBox> words = getWords(document);
draw(words);
}
private List<WordWithBBox> getWords(PDDocument document) throws IOException {
CustomPDFTextStripper customPDFTextStripper = new CustomPDFTextStripper();
customPDFTextStripper.setSortByPosition(true);
customPDFTextStripper.setStartPage(0);
customPDFTextStripper.setEndPage(1);
Writer writer = new OutputStreamWriter(new ByteArrayOutputStream());
customPDFTextStripper.writeText(document, writer);
List<WordWithBBox> words = customPDFTextStripper.getWords();
return words;
}
private void draw(List<WordWithBBox> words) throws IOException {
BufferedImage bufferedImage = ImageIO.read(new File(BASE_DIR_PATH, OUTPUT_IMAGE_PATH));
Graphics2D graphics = bufferedImage.createGraphics();
graphics.setColor(Color.GREEN);
List<Rectangle> rectangles = words.stream()
.map(word -> new Rectangle(word.getX(), word.getY(), word.getWidth(), word.getHeight()))
.collect(Collectors.toList());
rectangles.forEach(graphics::draw);
graphics.dispose();
ImageIO.write(bufferedImage, "JPEG", new File(BASE_DIR_PATH, OUTPUT_BBOX_IMAGE_PATH));
}
private class CustomPDFTextStripper extends PDFTextStripper {
private final List<WordWithBBox> words;
public CustomPDFTextStripper() throws IOException {
this.words = new ArrayList<>();
}
public List<WordWithBBox> getWords() {
return new ArrayList<>(words);
}
@Override
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
String wordSeparator = getWordSeparator();
List<TextPosition> wordTextPositions = new ArrayList<>();
for (TextPosition textPosition : textPositions) {
String str = textPosition.getUnicode();
if (wordSeparator.equals(str)) {
if (!wordTextPositions.isEmpty()) {
this.words.add(createWord(wordTextPositions));
wordTextPositions.clear();
}
} else {
wordTextPositions.add(textPosition);
}
}
super.writeString(text, textPositions);
}
private WordWithBBox createWord(List<TextPosition> wordTextPositions) {
String word = wordTextPositions.stream()
.map(TextPosition::getUnicode)
.collect(Collectors.joining());
int minX = Integer.MAX_VALUE;
int minY = Integer.MAX_VALUE;
int maxX = Integer.MIN_VALUE;
int maxY = Integer.MIN_VALUE;
for (TextPosition wordTextPosition : wordTextPositions) {
minX = Math.min(minX, from72To300Dpi(wordTextPosition.getXDirAdj()));
minY = Math.min(minY, from72To300Dpi(wordTextPosition.getYDirAdj() - wordTextPosition.getHeightDir()));
maxX = Math.max(maxX, from72To300Dpi(wordTextPosition.getXDirAdj() + wordTextPosition.getWidthDirAdj()));
maxY = Math.max(maxY, from72To300Dpi(wordTextPosition.getYDirAdj()));
}
return new WordWithBBox(word, minX, minY, maxX - minX, maxY - minY);
}
}
private int from72To300Dpi(float f) {
return Math.round(f * FROM_72_TO_300_DPI);
}
@Value
private class WordWithBBox {
private final String word;
private final int x;
private final int y;
private final int width;
private final int height;
}
}
Note:
If you are interested in other options, you can check also Poppler
PDF to image
pdftoppm -r 300 -jpeg input.pdf output
Generate an XHTML file containing bounding box information for each word in the file.
pdftotext -r 300 -bbox input.pdf