2

Does anyone know how to extract links from Word documents using Apache POI? Or, even better, from a single paragraph?

Deduplicator
  • 44,692
  • 7
  • 66
  • 118
Ron
  • 393
  • 1
  • 4
  • 13

1 Answer

5

Word 2007 and newer (.docx format):

//Links extractor
StringBuffer text = null;
try {
    FileInputStream fis = new FileInputStream(new File("YOUR_DOCX_FULL_PATH_HERE"));
    XWPFDocument document = new XWPFDocument(fis);
    text = new StringBuffer();  
    
    // First up, all our paragraph based text
    Iterator<XWPFParagraph> i = document.getParagraphsIterator();
    while(i.hasNext()) {
        XWPFParagraph paragraph = i.next();

        // Do the paragraph text
        for(XWPFRun run : paragraph.getRuns()) {
           
           if(run instanceof XWPFHyperlinkRun) {
               text.append(run.toString());
               bean.setName(run.toString());
               XWPFHyperlink link = ((XWPFHyperlinkRun)run).getHyperlink(document);
               if(link != null) {
                   text.append(" <" + link.getURL() + ">");
               }
           }
        }
    }
} catch (Exception e) {
    e.printStackTrace();
} 
Pavneet_Singh
  • 36,884
  • 5
  • 53
  • 68
Marcos
  • 51
  • 1
  • 2
  • It's generally best not to open an XWPF/XSSF/XSLF instance from an InputStream if you have the file, as it causes the whole lot to have to be buffered into memory. Much better to open directly via the File instead – Gagravarr May 02 '12 at 13:45