5

i'm trying to extract text with coordinates from a pdf file using PDFBox.

I mixed some methods/info found on internet (stackoverflow too), but the problem i have the coordinates doesnt'seems to be right. When i try to use coordinates for drawing a rectangle on top of tex, for example, the rect is painted elsewhere.

This is my code (please don't judge the style, was written very fast just to test)

TextLine.java

    import java.util.List;
    import org.apache.pdfbox.text.TextPosition;

    /**
     *
     * @author samue
     */
    public class TextLine {
        public List<TextPosition> textPositions = null;
        public String text = "";
    }

myStripper.java

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.pdmodel.PDPage;
    import org.apache.pdfbox.text.PDFTextStripper;
    import org.apache.pdfbox.text.TextPosition;

    /*
     * To change this license header, choose License Headers in Project Properties.
     * To change this template file, choose Tools | Templates
     * and open the template in the editor.
     */

    /**
     *
     * @author samue
     */
    public class myStripper extends PDFTextStripper {
        public myStripper() throws IOException
        {
        }

        @Override
        protected void startPage(PDPage page) throws IOException
        {
            startOfLine = true;
            super.startPage(page);
        }

        @Override
        protected void writeLineSeparator() throws IOException
        {
            startOfLine = true;
            super.writeLineSeparator();
        }

        @Override
        public String getText(PDDocument doc) throws IOException
        {
            lines = new ArrayList<TextLine>();
            return super.getText(doc);
        }

        @Override
        protected void writeWordSeparator() throws IOException
        {
            TextLine tmpline = null;

            tmpline = lines.get(lines.size() - 1);
            tmpline.text += getWordSeparator();

            super.writeWordSeparator();
        }


        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException
        {
            TextLine tmpline = null;

            if (startOfLine) {
                tmpline = new TextLine();
                tmpline.text = text;
                tmpline.textPositions = textPositions;
                lines.add(tmpline);
            } else {
                tmpline = lines.get(lines.size() - 1);
                tmpline.text += text;
                tmpline.textPositions.addAll(textPositions);
            }

            if (startOfLine)
            {
                startOfLine = false;
            }
            super.writeString(text, textPositions);
        }

        boolean startOfLine = true;
        public ArrayList<TextLine> lines = null;

    }

click event on AWT button

 private void jButton1MouseClicked(java.awt.event.MouseEvent evt) {                                      
    // TODO add your handling code here:
    try {
        File file = new File("C:\\Users\\samue\\Desktop\\mwb_I_201711.pdf");
        PDDocument doc = PDDocument.load(file);

        myStripper stripper = new myStripper();

        stripper.setStartPage(1); // fix it to first page just to test it
        stripper.setEndPage(1);
        stripper.getText(doc);

        TextLine line = stripper.lines.get(1); // the line i want to paint on

        float minx = -1;
        float maxx = -1;

        for (TextPosition pos: line.textPositions)
        {
            if (pos == null)
                continue;

            if (minx == -1 || pos.getTextMatrix().getTranslateX() < minx) {
                minx = pos.getTextMatrix().getTranslateX();
            }
            if (maxx == -1 || pos.getTextMatrix().getTranslateX() > maxx) {
                maxx = pos.getTextMatrix().getTranslateX();
            }
        }

        TextPosition firstPosition = line.textPositions.get(0);
        TextPosition lastPosition = line.textPositions.get(line.textPositions.size() - 1);

        float x = minx;
        float y = firstPosition.getTextMatrix().getTranslateY();
        float w = (maxx - minx) + lastPosition.getWidth();
        float h = lastPosition.getHeightDir();

        PDPageContentStream contentStream = new PDPageContentStream(doc, doc.getPage(0), PDPageContentStream.AppendMode.APPEND, false);

        contentStream.setNonStrokingColor(Color.RED);
        contentStream.addRect(x, y, w, h);
        contentStream.fill();
        contentStream.close();

        File fileout = new File("C:\\Users\\samue\\Desktop\\pdfbox.pdf");
        doc.save(fileout);
        doc.close();
    } catch (Exception ex) {

    }
}                                     

any suggestion? what am i doing wrong?

Tilman Hausherr
  • 17,731
  • 7
  • 58
  • 97
  • I haven't understood your code (must go to bed now). Be aware that in PDF, y = 0 is bottom, not top. Here's an example that may help to understand how to use the text extraction coordinates: https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/util/DrawPrintTextLocations.java?view=markup&sortby=date – Tilman Hausherr Sep 06 '17 at 20:17
  • 1
    Have you tried using the `PDPageContentStream` constructor with another Boolean argument `resetContext` and setting it to `true`? – mkl Sep 06 '17 at 23:32
  • 1
    yes i know 0 is the bottom, this is the reason i used .getTextMatrix().getTranslateY() instead of getY() or getYDirAdj(). i've tryed to use resetContext but with no help. now i'm going to look at that source code, i will update, thank you – Samuele Diella Sep 07 '17 at 14:07
  • I just tested your code and it did work properly on a sample PDF (well, it did only cover the text from the baseline upwards but that is to be expected). Thus, there is something different about the PDFs in which the rectangles are *painted elsewhere*. I would assume that would be fixed if you used the `PDPageContentStream` I pointed towards in my previous comment. I cannot be sure, though, without a sample PDF for which you observed the issue. Thus, please share a sample PDF. – mkl Sep 07 '17 at 14:50
  • I tryed the code in your link. It's working, but is painting shapes on a PNG, not rectangles on the stream. I'm trying to convert shapes to rectangles, but i've some difficults ^^'' anyway the PDF is this one: https://download-a.akamaihd.net/files/media_mwb/b7/mwb_I_201711.pdf i suppose is something about fonts or fonts transformation – Samuele Diella Sep 07 '17 at 23:07

2 Answers2

5

This is just another case of the excessive PdfTextStripper coordinate normalization. Just like you I had thought that by using TextPosition.getTextMatrix() (instead of getX() and getY) one would get the actual coordinates, but no, even these matrix values have to be corrected (at least in PDFBox 2.0.x, I haven't checked 1.8.x) because the matrix is multiplied by a translation making the lower left corner of the crop box the origin.

Thus, in your case (in which the lower left of the crop box is not the origin), you have to correct the values, e.g. by replacing

        float x = minx;
        float y = firstPosition.getTextMatrix().getTranslateY();

by

        PDRectangle cropBox = doc.getPage(0).getCropBox();

        float x = minx + cropBox.getLowerLeftX();
        float y = firstPosition.getTextMatrix().getTranslateY() + cropBox.getLowerLeftY();

Instead of

without correction

you now get

with x,y correction

Obviously, though, you will also have to correct the height somewhat. This is due to the way the PdfTextStripper determines the text height:

    // 1/2 the bbox is used as the height todo: why?
    float glyphHeight = bbox.getHeight() / 2;

(from showGlyph(...) in LegacyPDFStreamEngine, the parent class of PdfTextStripper)

While the font bounding box indeed usually is too large, half of it often is not enough.

mkl
  • 90,588
  • 15
  • 125
  • 265
0

The following code worked for me:

    // Definition of font baseline, ascent, descent: https://en.wikipedia.org/wiki/Ascender_(typography)
    //
    // The origin of the text coordinate system is the top-left corner where Y increases downward.
    // TextPosition.getX(), getY() return the baseline.
    TextPosition firstLetter = textPositions.get(0);
    TextPosition lastLetter = textPositions.get(textPositions.size() - 1);

    // Looking at LegacyPDFStreamEngine.showGlyph(), ascender and descender heights are calculated like
    // CapHeight: https://stackoverflow.com/a/42021225/14731
    float ascent = firstLetter.getFont().getFontDescriptor().getAscent() / 1000 * lastLetter.getFontSize();
    Point topLeft = new Point(firstLetter.getX(), firstLetter.getY() - ascent);

    float descent = lastLetter.getFont().getFontDescriptor().getDescent() / 1000 * lastLetter.getFontSize();
    // Descent is negative, so we need to negate it to move downward.
    Point bottomRight = new Point(lastLetter.getX() + lastLetter.getWidth(),
        lastLetter.getY() - descent);

    float descender = lastLetter.getFont().getFontDescriptor().getDescent() / 1000 * lastLetter.getFontSize();
    // Descender height is negative, so we need to negate it to move downward
    Point bottomRight = new Point(lastLetter.getX() + lastLetter.getWidth(),
        lastLetter.getY() - descender);

In other words, we are creating a bounding box from the font's ascender down to its descender.

If you want to render these coordinates with the origin in the bottom-left corner, see https://stackoverflow.com/a/28114320/14731 for more details. You'll need to apply a transform like this:

contents.transform(new Matrix(1, 0, 0, -1, 0, page.getHeight()));
Gili
  • 86,244
  • 97
  • 390
  • 689
  • Sadly, ascent / descent etc are not always reliable. If you need exact bounds, see the cyan rectangles in the `DrawPrintTextLocations.java` example. – Tilman Hausherr Jul 19 '19 at 10:12
  • @TilmanHausherr That's excellent code. Why is this functionality not embedded directly in PDFBox's API? Meaning, why is it sitting in example code instead of being a method on (say) `TextPosition`? – Gili Jul 19 '19 at 16:20
  • I was thinking on adding it in the LegacyPDFStreamEngine code. However a lot of text extraction results are changing. And it isn't perfect either, it won't work for type3 fonts. And finally: so much things to do... like answering users questions, fixing bugs, etc. – Tilman Hausherr Jul 19 '19 at 18:10