0

I'm attempting to write a parser to extract details from a Word document using NPOI. I'm able to retrieve details from each table in the document, but I need to be able to identify which section of the document each table comes from in order to differentiate between them. While I can identify all of the lines that have the specific heading type I need, I can't work out how to tell which heading precedes which table.

Can anybody offer any advice? If it's not possible with NPOI, can anybody recommend another way to do it?

Wax_Lee
  • 13
  • 3

1 Answer

0

If you are parsing a Word document, I suggest using Open-Xml-PowerTools by Eric White. You can install it from the NuGet package manager or download it directly from the web.

Here is the code snippet I have used to parse a document; it is small, clean, and stable. Debug it first to understand how it works, which will help you customize it for your own needs. It reads all text, paragraphs, bullets, contents, etc. Go through Eric White's documentation for more details, but the snippet below is most of what you'll need to parse a document, and you can build your own functionality on top of it.

using DocumentFormat.OpenXml.Packaging;
using OpenXmlPowerTools;

// Class-level field holding the opened document; the methods below read it.
private static WordprocessingDocument _wordDocument;

// Goes inside the constructor (or other setup code), not at class level.
// wordFileStream is the Stream holding the .docx, supplied to the constructor.
// The second argument (false) opens the document as not editable.
_wordDocument = WordprocessingDocument.Open(wordFileStream, false); // wordFileStream: the stream received in the constructor


// To get the header and footer parts of the main document, use this:

var headerList = _wordDocument.MainDocumentPart.HeaderParts.ToList();
var footerList = _wordDocument.MainDocumentPart.FooterParts.ToList();

 /// <summary>
 /// Entry point for parsing: takes the root of the main document part, picks its
 /// first logical child (used here as the body) and passes it to
 /// <see cref="OutputBlockLevelContent"/> for block-by-block processing.
 /// </summary>
 /// <remarks>
 /// Best-effort, as in the original snippet: a failure is not rethrown, but it is
 /// now reported through <see cref="System.Diagnostics.Trace"/> instead of being
 /// silently discarded.
 /// </remarks>
 private void GetDocumentBodyContents()
 {
     try
     {
         //RevisionAccepter.AcceptRevisions(_wordDocument);
         XElement root = _wordDocument.MainDocumentPart.GetXDocument().Root;

         // FirstOrDefault instead of First: an empty result is handled explicitly
         // rather than by throwing and catching an exception.
         XElement body = root.LogicalChildrenContent().FirstOrDefault();
         if (body == null)
         {
             return;
         }

         OutputBlockLevelContent(_wordDocument, body);
     }
     catch (Exception ex)
     {
         // Keep the caller alive, but leave a trace so parsing problems can be diagnosed.
         System.Diagnostics.Trace.TraceError("Failed to read the document body: {0}", ex);
     }
 }


 /// <summary>
 /// Visits every logical child of <paramref name="blockLevelContentContainer"/>.
 /// For a paragraph it retrieves the list-item text and the run text; any other
 /// child is treated as a table whose cells are visited recursively.
 /// </summary>
 /// <param name="wordDoc">Document passed on to <c>ListItemRetriever.RetrieveListItem</c>.</param>
 /// <param name="blockLevelContentContainer">
 /// Container whose logical children are walked; called with the document body and,
 /// recursively, with table cells.
 /// </param>
 /// <remarks>
 /// Best-effort: a failure stops the walk of the current container and is reported
 /// through <see cref="System.Diagnostics.Trace"/>; it is not rethrown.
 /// </remarks>
 private void OutputBlockLevelContent(WordprocessingDocument wordDoc, XElement blockLevelContentContainer)
 {
     try
     {
         // Declared outside the loop so the values of the most recent paragraph
         // remain available while later elements are processed.
         // Extension point: this skeleton computes the values but does nothing
         // with them yet; store or emit them here as needed.
         string currentItem = string.Empty, currentItemText = string.Empty;

         foreach (XElement blockLevelContentElement in
             blockLevelContentContainer.LogicalChildrenContent())
         {
             if (blockLevelContentElement.Name == W.p)
             {
                 currentItem = ListItemRetriever.RetrieveListItem(wordDoc, blockLevelContentElement);

                 currentItemText = blockLevelContentElement
                     .LogicalChildrenContent(W.r)
                     .Select(RunTextWithLineBreaks)
                     .StringConcatenate();

                 continue;
             }

             // If element is not a paragraph, it must be a table.
             foreach (var row in blockLevelContentElement.LogicalChildrenContent())
             {
                 foreach (var cell in row.LogicalChildrenContent())
                 {
                     // Cells are a block-level content container, so can call this method recursively.
                     OutputBlockLevelContent(wordDoc, cell);
                 }
             }
         }
     }
     catch (Exception ex)
     {
         // Keep the caller alive, but leave a trace so parsing problems can be diagnosed.
         System.Diagnostics.Trace.TraceError("Failed to process block-level content: {0}", ex);
     }
 }

 /// <summary>
 /// Returns the text of a run with every line break reported for it rendered as the
 /// literal <c>&lt;br /&gt;</c>, because a break has no text of its own and would
 /// otherwise vanish when the run is converted to a string.
 /// </summary>
 /// <param name="run">The run element to read.</param>
 /// <returns>The string value of <paramref name="run"/> after the breaks are marked.</returns>
 /// <remarks>
 /// Mutates the in-memory XML of the run. Every break is marked, whereas
 /// <c>SetElementValue</c> would only touch the first direct-child break (or add a
 /// new element when none is a direct child).
 /// </remarks>
 private static string RunTextWithLineBreaks(XElement run)
 {
     // Materialize first: the tree must not be modified while the lazy query over it is running.
     foreach (XElement lineBreak in run.LogicalChildrenContent(W.br).ToList())
     {
         lineBreak.Value = "<br />";
     }

     return (string)run;
 }
kumar chandraketu
  • 2,232
  • 2
  • 20
  • 25