1

I am reading excel files using C# and OpenXML (SAX). The file size ranges between 5-10 mb and has 5-6 sheets. Number of rows per sheet vary between 25-100K.

I am using the following code to fetch the data. It's reading about 100 rows per second. In comparison, Apache POI is able to read a thousand rows in the same time.

I expected better results as both products are from Microsoft. Am I doing something wrong?

using (SpreadsheetDocument excelDoc = SpreadsheetDocument.Open(filename, true))
{
    WorkbookPart workbookPart = excelDoc.WorkbookPart;
    WorksheetPart mainSheet = (WorksheetPart)workbookPart.GetPartById(sheetIds[0].ToString());
    OpenXmlReader reader = OpenXmlReader.Create(mainSheet);

    //CellType c;
    SharedStringTable t = workbookPart.SharedStringTablePart.SharedStringTable;

    while (reader.Read())
    {
        if (reader.ElementType == typeof(Row))
        {
            reader.ReadFirstChild();

            do
            {
                if (reader.ElementType == typeof(Cell))
                {
                    Cell c = (Cell)reader.LoadCurrentElement();

                    string cellValue;

                    if (c.DataType != null && c.DataType == CellValues.SharedString)
                    {
                        int index = int.Parse(c.CellValue.InnerText);
                        SharedStringItem ssi = workbookPart.SharedStringTablePart.SharedStringTable.Elements<SharedStringItem>().ElementAt(int.Parse(c.CellValue.InnerText));
                        cellValue = ssi.Text.Text;
                    }
                    else
                    {
                        cellValue = c.CellValue.InnerText;
                    }

                    //Console.Out.Write("{0}: {1} ", c.CellReference, cellValue);
                }
            }
            while (reader.ReadNextSibling());
        }
    }
}
Orijit
  • 63
  • 2
  • 7
  • 1
    I think this applies to you: https://stackoverflow.com/questions/10542719/net-openxml-performance-issues – FailedUnitTest May 26 '17 at 13:32
  • 3
    I think this is useful too - https://stackoverflow.com/questions/42508239/how-can-i-improve-the-performance-of-retrieving-values-from-sharedstringtable-in/42514872#42514872 – petelids May 26 '17 at 14:54
  • @petelids thanks a ton. I am able to read 90000 rows in about 5 seconds now. This is awesome. – Orijit May 29 '17 at 16:11
  • @FailedUnitTest The answer focuses on writing excel files. My logic involves only reading. – Orijit May 29 '17 at 16:13

1 Answers1

0

Please try below code. using (SpreadsheetDocument excelDoc = SpreadsheetDocument.Open(filename, true)) { WorkbookPart workbookPart = excelDoc.WorkbookPart; WorksheetPart mainSheet = (WorksheetPart)workbookPart.GetPartById(sheetIds[0].ToString()); OpenXmlReader reader = OpenXmlReader.Create(mainSheet);

    //CellType c;
    SharedStringTable t = workbookPart.SharedStringTablePart.SharedStringTable;

    while (reader.Read())
    {
        if (reader.ElementType == typeof(Row))
        {
            reader.ReadFirstChild();

            do
            {
                if (reader.ElementType == typeof(Cell))
                {
                    Cell c = (Cell)reader.LoadCurrentElement();

                    string cellValue;

                    if (c.DataType != null && c.DataType == CellValues.SharedString)
                    {
                        int index = int.Parse(c.CellValue.InnerText);
                        SharedStringItem ssi = t.Elements<SharedStringItem>().ElementAt(int.Parse(c.CellValue.InnerText));
                        cellValue = ssi.Text.Text;
                    }
                    else
                    {
                        cellValue = c.CellValue.InnerText;
                    }

                    //Console.Out.Write("{0}: {1} ", c.CellReference, cellValue);
                }
            }
            while (reader.ReadNextSibling());
        }
    }
}
SVaidya
  • 167
  • 2
  • 10