0

I'm a rookie, really. I'm building my first project (if I can finish it).

I want to extract text from a PDF together with its formatting and position, and then write it to a .docx file. I checked the PDFBox API documentation, but I'm still not sure: to get the position of the text, should I iterate over the lines or over the individual characters? I studied these three questions carefully:

Text coordinates when stripping from PDFBox

Get font of each line using PDFBox

How to extract font styles of text contents using pdfbox?

And here is my DEMO:


import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

import java.io.IOException;
import java.util.List;

/**
 * A {@link PDFTextStripper} that annotates the extracted text with inline font markers.
 *
 * <p>Whenever the font name of a glyph differs from the font name of the previously
 * processed glyph, a marker of the form {@code [Family+Style+size]} is inserted in front
 * of that glyph. For font names without a {@code -Style} suffix the marker is
 * {@code [Family+size]}.
 *
 * <p>NOTE(review): a marker is only emitted when the font <em>name</em> changes; a change
 * of font size alone does not produce a new marker.
 */
public class PDFTextExtractor extends PDFTextStripper {

    /**
     * Instantiate a new PDFTextExtractor object.
     *
     * @throws IOException If there is an error loading the properties.
     */
    public PDFTextExtractor() throws IOException {
    }

    /** Font name of the most recently tagged glyph; empty until the first glyph is seen. */
    String prevFont = "";

    /**
     * Rebuilds the given chunk of text glyph by glyph, inserting a font marker in front of
     * every glyph whose font name differs from the previous one, and passes the result on
     * to the inherited {@code writeString(String)}.
     *
     * <p>Each {@link TextPosition} also exposes its coordinates through {@code getX()} and
     * {@code getY()}; they are not part of the emitted marker.
     *
     * @param text          the text of the chunk (unused; the output is rebuilt from the
     *                      individual positions)
     * @param textPositions the glyph positions that make up the chunk
     * @throws IOException if writing the text fails
     */
    @Override
    protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
        StringBuilder sb = new StringBuilder();

        for (TextPosition position : textPositions) {
            String font = position.getFont().getName();

            if (font != null && !font.equals(prevFont)) {
                sb.append(fontTag(font, position.getFontSizeInPt()));
                prevFont = font;
            }
            sb.append(position.getUnicode());
        }
        writeString(sb.toString());
    }

    /**
     * Builds the inline marker for a font.
     *
     * @param fontName the font name, e.g. {@code Arial-Bold} or {@code Helvetica}
     * @param fontSize the font size in points
     * @return {@code [Family+Style+size]}, or {@code [Family+size]} when the name carries
     *         no style suffix
     */
    private static String fontTag(String fontName, float fontSize) {
        // Split once. Many font names have no "-Style" part, and String.split drops
        // trailing empty segments, so the array may hold fewer than two elements.
        String[] parts = fontName.split("-");

        StringBuilder tag = new StringBuilder("[");
        tag.append(parts.length > 0 ? parts[0] : fontName);
        if (parts.length > 1) {
            tag.append("+").append(parts[1]);
        }
        tag.append("+").append(fontSize).append("]");
        return tag.toString();
    }

    /**
     * Extracts the text of the document; delegates unchanged to the superclass.
     *
     * @param doc the document to extract text from
     * @return the extracted text, including the font markers added by this class
     * @throws IOException if the document cannot be processed
     */
    @Override
    public String getText(PDDocument doc) throws IOException {
        return super.getText(doc);
    }
}

And I am calling it like this:

// Open the output stream together with both documents so that all three are closed
// automatically, even when loading, extraction or writing throws.
try (FileOutputStream outputStream = new FileOutputStream(EXPORT_PATH + file.getName().split("\\.")[0] + ".docx");
     PDDocument originalPDF = PDDocument.load(file);
     XWPFDocument doc = new XWPFDocument()) {
    int pageCount = originalPDF.getNumberOfPages();
    // PDFTextStripper page numbers are 1-based.
    for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
        //Parse Content
        // A fresh extractor per page, limited to exactly that page; without the
        // start/end page restriction getText() returns the text of the whole document
        // on every iteration.
        PDFTextStripper stripper = new PDFTextExtractor();
        stripper.setSortByPosition(true);
        stripper.setStartPage(pageNumber);
        stripper.setEndPage(pageNumber);
        String ss = stripper.getText(originalPDF);

        System.out.println(ss);
        //Write Content
        XWPFParagraph paragraph = doc.createParagraph();
        XWPFRun run = paragraph.createRun();
        run.setText(ss);
        run.addBreak(BreakType.PAGE);
    }
    doc.write(outputStream);
}
Tilman Hausherr
  • 17,731
  • 7
  • 58
  • 97
HappyKoala
  • 191
  • 1
  • 11

0 Answers0