2

While reading a Word file element by element using the Apache POI library, I need to keep track of the current page number and line number at which each element appears. I have used the code below to read the file element by element:

import java.io.FileInputStream;
import java.io.IOException;
import org.apache.poi.xwpf.usermodel.*;

/**
 * Walks the body of a .docx file in document order and prints the text of
 * every paragraph and table cell, plus a short summary line for every picture
 * embedded in a paragraph run.
 *
 * <p>Note that the DOCX format stores a flow of paragraphs and tables, not
 * laid-out pages; this class therefore reports elements in order but does not
 * compute page or line numbers.
 */
public class ReadWordFile {

    /**
     * Reads {@code file.docx} from the working directory and prints its body
     * elements to standard output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // try-with-resources closes both the document and the stream, even
        // when parsing or iteration throws part-way through.
        try (FileInputStream fis = new FileInputStream("file.docx");
             XWPFDocument document = new XWPFDocument(fis)) {
            // Body elements come back in the order they appear in the file
            for (IBodyElement element : document.getBodyElements()) {
                if (element instanceof XWPFParagraph) {
                    printParagraph((XWPFParagraph) element);
                } else if (element instanceof XWPFTable) {
                    printTable((XWPFTable) element);
                }
                // Other body element kinds (e.g. structured document tags)
                // are skipped, as before.
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Prints the paragraph's text, then a summary of each picture in its runs. */
    private static void printParagraph(XWPFParagraph paragraph) {
        System.out.println(paragraph.getText());
        // Runs are never body elements themselves; pictures are only
        // reachable through the runs of their containing paragraph.
        for (XWPFRun run : paragraph.getRuns()) {
            printPictures(run);
        }
    }

    /** Prints the text of every cell of the table, row by row. */
    private static void printTable(XWPFTable table) {
        for (XWPFTableRow row : table.getRows()) {
            for (XWPFTableCell cell : row.getTableCells()) {
                System.out.println(cell.getText());
            }
        }
    }

    /** Prints one line (file name and byte count) per picture embedded in the run. */
    private static void printPictures(XWPFRun run) {
        if (run.getEmbeddedPictures() == null) {
            return;
        }
        // A single run may carry several pictures, so visit all of them
        for (XWPFPicture picture : run.getEmbeddedPictures()) {
            XWPFPictureData pictureData = picture.getPictureData();
            if (pictureData == null) {
                continue; // picture relation could not be resolved
            }
            byte[] data = pictureData.getData();
            System.out.println("[picture " + pictureData.getFileName()
                    + ", " + data.length + " bytes]");
        }
    }
}
trashgod
  • 203,806
  • 29
  • 246
  • 1,045
lemon chow
  • 303
  • 8
  • 3
    You are reading paragraph by paragraph; there are no line numbers in paragraphs. How many lines or pages this yields depends on page size, font size, etc. – PMF Feb 13 '23 at 05:57
  • 2
    Render to a page-based format like PDF, then check? (Word DOC / DOCX are not page based) – Gagravarr Feb 13 '23 at 09:14
  • Is it possible, using any other library, to read a Word file element by element (including paragraphs, tables, images and shapes) in the order in which they appear, while keeping track of the page number? – lemon chow Feb 15 '23 at 10:58

0 Answers0