3

I have been trying to split a .docx document into multiple documents based on predefined criteria. The following is my approach for splitting it into paragraphs:

        try {
        FileInputStream in = new FileInputStream(file);
        XWPFDocument doc = new XWPFDocument(in);
        List<XWPFParagraph> paragraphs = doc.getParagraphs();
        for (int idx = 0; idx < paragraphs.size(); idx++) {
            XWPFDocument outputDocument = new XWPFDocument();
            createParagraphInAnotherDocument(outputDocument, paragraphs.get(idx).getText());
            String fullPath = String.format("./content/output/%1$s_%2$s_%3$04d.docx", FileUtils.getFileName(file), getName(), idx);
            FileOutputStream outputStream = new FileOutputStream(fullPath);
            outputDocument.write(outputStream);
            outputDocument.close();

            doc.close();
        }

    } catch (IOException e) {
        e.printStackTrace();
    }

While I am able to extract paragraphs with the code above, I can't find a way to extract pages. My understanding is that pages in Word are a rendering concern, and pagination happens at runtime in the Word application.

WiredCoder
  • 916
  • 1
  • 11
  • 39
  • Hope this will work for you [click here](http://stackoverflow.com/questions/25092384/how-to-split-a-doc-into-several-doc-using-java-poi) answered by @DenisFLASH – yash Jun 27 '16 at 07:25
  • Thanks @yash. This is one of the threads that I have found; it still only segments by paragraph, not by page, and even then it generates neither styled nor full paragraphs – WiredCoder Jun 27 '16 at 07:25

1 Answer

5

As far as I can see, the only way to do this is by interrogating the DOM model for the Word doc, and then determining how many paragraphs there are on each page. Below is a possible solution to the problem (it only works if the pages are explicitly separated by page breaks):

/**
 * Splits {@code C:/TestDoc.docx} into one output document per page and writes them as
 * {@code C:/TestDocOutput0.docx}, {@code C:/TestDocOutput1.docx}, ...
 *
 * <p>Page boundaries come from {@code getParagraphCountPerPage}, so the split is only
 * meaningful when pages are separated by explicit page breaks. Any failure is reported
 * via a stack trace; nothing is rethrown.
 *
 * @param args unused
 */
public static void main(String[] args) {
    //Input Word Document
    File file = new File("C:/TestDoc.docx");

    // try-with-resources closes the stream and the document on every path, including the
    // one where opening the document fails (no close() call on a null reference).
    try (FileInputStream in = new FileInputStream(file);
         XWPFDocument doc = new XWPFDocument(in)) {

        //Determine how many paragraphs per page
        List<Integer> paragraphCountList = getParagraphCountPerPage(doc);
        List<XWPFParagraph> paragraphs = doc.getParagraphs();

        if (paragraphCountList == null || paragraphCountList.isEmpty()
                || paragraphs == null || paragraphs.isEmpty()) {
            return;
        }

        int lastPage = paragraphCountList.size() - 1;
        int startIndex = 0;

        //Loop through the paragraph counts for each page
        for (int page = 0; page <= lastPage; page++) {
            // endIndex is exclusive. The final page always runs to the end of the document,
            // and every other boundary is clamped to the paragraph count, so an over- or
            // under-reported count can neither drop the last paragraph nor index out of bounds.
            int endIndex = (page == lastPage)
                    ? paragraphs.size()
                    : Math.min(startIndex + paragraphCountList.get(page), paragraphs.size());

            //Get the paragraphs from the input Word document
            List<XWPFParagraph> pageParagraphs = new ArrayList<XWPFParagraph>();
            for (XWPFParagraph paragraph : paragraphs.subList(startIndex, endIndex)) {
                if (paragraph != null) {
                    pageParagraphs.add(paragraph);
                }
            }

            //Set the start point for the next set of paragraphs
            startIndex = endIndex;

            //Create a new Word Doc with the paragraph subset and write the file
            String outputPath = "C:/TestDocOutput"+page+".docx";
            try (XWPFDocument outputDocument = new XWPFDocument();
                 FileOutputStream outputStream = new FileOutputStream(outputPath)) {
                createPageInAnotherDocument(outputDocument, pageParagraphs);
                outputDocument.write(outputStream);
            }
        }

    } catch (Exception e) {
        e.printStackTrace();
    }
}


/**
 * Counts how many body-level paragraphs fall on each page, using explicit page breaks
 * as the only page boundary.
 *
 * <p>The document body is serialized to XML and re-parsed as a DOM. Each top-level
 * {@code w:p} element counts as one paragraph; a paragraph that contains a page break
 * is counted on the page that the break ends. Paragraphs nested deeper (e.g. inside
 * tables) are not top-level children and are not counted.
 *
 * @param doc the Word document to inspect
 * @return one entry per page, in order; the entries sum to the number of top-level
 *         paragraphs, and the list always has at least one entry
 * @throws Exception if the body XML cannot be parsed
 */
private static List<Integer> getParagraphCountPerPage(XWPFDocument doc) throws Exception {
    List<Integer> paragraphCountList = new ArrayList<>();
    int paragraphCount = 0;

    Document domDoc = convertStringToDOM(doc.getDocument().getBody().toString());
    NodeList bodyChildren = domDoc.getDocumentElement().getChildNodes();

    for (int i = 0; i < bodyChildren.getLength(); i++) {
        Node paragraphNode = bodyChildren.item(i);
        if (!"w:p".equals(paragraphNode.getNodeName())) {
            continue;
        }
        paragraphCount++;

        NodeList paragraphChildren = paragraphNode.getChildNodes();
        for (int k = 0; k < paragraphChildren.getLength(); k++) {
            Node runNode = paragraphChildren.item(k);
            if (!"w:r".equals(runNode.getNodeName())) {
                continue;
            }

            NodeList runChildren = runNode.getChildNodes();
            for (int m = 0; m < runChildren.getLength(); m++) {
                if (isPageBreak(runChildren.item(m))) {
                    // The break closes the current page; start counting the next one.
                    paragraphCountList.add(paragraphCount);
                    paragraphCount = 0;
                }
            }
        }
    }

    // Paragraphs after the last break form the final page. No "+1": every paragraph
    // has already been counted exactly once above.
    paragraphCountList.add(paragraphCount);

    return paragraphCountList;
}

/** Returns true only for {@code <w:br w:type="page"/>}; an untyped {@code w:br} is a line break. */
private static boolean isPageBreak(Node node) {
    if (!"w:br".equals(node.getNodeName())) {
        return false;
    }
    Node breakType = node.getAttributes().getNamedItem("w:type");
    return breakType != null && "page".equals(breakType.getNodeValue());
}


/**
 * Parses an XML string into a DOM {@link Document}.
 *
 * <p>The parser is not namespace-aware, so prefixed element names such as {@code w:p}
 * are reported verbatim by {@code getNodeName()}. DOCTYPE declarations are rejected,
 * which blocks external-entity (XXE) and entity-expansion attacks from crafted input.
 *
 * @param xmlData the XML text to parse
 * @return the parsed document
 * @throws Exception if the parser cannot be configured, or the text is not well-formed
 *                   XML or contains a DOCTYPE declaration
 */
private static Document convertStringToDOM(String xmlData) throws Exception {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    // Refusing DOCTYPE outright removes every DTD-based attack vector in one setting.
    factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
    factory.setXIncludeAware(false);
    factory.setExpandEntityReferences(false);

    DocumentBuilder builder = factory.newDocumentBuilder();

    return builder.parse(new InputSource(new StringReader(xmlData)));
}


/**
 * Appends the text of every given paragraph to the output document, in list order,
 * one new paragraph per source paragraph.
 *
 * @param outputDocument the document receiving the paragraphs
 * @param pageParagraphs the source paragraphs making up one page
 * @throws IOException declared for callers; propagated from the per-paragraph helper
 */
private static void createPageInAnotherDocument(XWPFDocument outputDocument, List<XWPFParagraph> pageParagraphs) throws IOException {
    for (XWPFParagraph source : pageParagraphs) {
        String text = source.getText();
        addParagraphToDocument(outputDocument, text);
    }
}


/**
 * Adds one new paragraph to the document, holding the given text in a single run.
 *
 * @param outputDocument the document to append to
 * @param text the text placed in the new paragraph's run
 * @throws IOException declared for callers
 */
private static void addParagraphToDocument(XWPFDocument outputDocument, String text) throws IOException {
    // New paragraph -> new run -> text, without intermediate locals.
    outputDocument.createParagraph().createRun().setText(text);
}
TashaEls
  • 116
  • 2
  • Though it does not split the whole page content (only the text, I updated your answer so that it will also extract the full format of the paragraph) it is so far the only sound answer I had. Thanks Tasha – WiredCoder Jul 12 '16 at 21:18
  • Not working: "w:br" could not be found. I didn't find any page-break marker in the DOM, so maybe it's impossible. Need help. – Nick Kao Jun 10 '21 at 09:10