0

My decompressing worked fine for years!

But now I have some files that abort while decompressing after a few reads.

Decompression stops after 694 records, but the file contains 1'829'768 records!

I do not receive any error from the GZipStream.

I am afraid the problem is not my code but a bug or a missing feature in Microsoft's GZipStream.

I assume that the file was compressed using some UNIX environment.

I can decompress it from the shell using WinRar or ZIP but I can't decompress them "on-the-fly", using GZipStream.

Decompressing first and processing afterwards is of course possible, but it is not an option because some of these files are too big (terabytes). This would waste way too much time.

    using System;
    using System.Data;
    using System.IO;
    using System.IO.Compression;

    /// <summary>
    /// Decompresses a gzip file on the fly and counts the fixed-length
    /// (986-byte) records it contains, then prints the count to the console.
    /// </summary>
    /// <param name="p_sZippedFileName">Path of the gzip-compressed file to read.</param>
    /// <remarks>
    /// NOTE(review): if the file consists of several concatenated gzip members,
    /// a GZipStream implementation that stops at the end of the first member
    /// will only deliver the records of that first member - confirm the
    /// behavior of the target runtime.
    /// </remarks>
    public static void Unpack(string p_sZippedFileName)
    {
        const int cRecordLength = 986;
        byte[] ba = new byte[cRecordLength];
        int iRecordCounter = 0;
        int iFilled = 0; // number of bytes of the current record already in ba

        using (FileStream fileStream = new FileStream(p_sZippedFileName, FileMode.Open, FileAccess.Read))
        using (GZipStream zipStream = new GZipStream(fileStream, CompressionMode.Decompress))
        {
            // Stream.Read may return fewer bytes than requested, so one call is
            // not one record: keep reading until the record buffer is full and
            // only then count it.
            int iRead;
            while ((iRead = zipStream.Read(ba, iFilled, cRecordLength - iFilled)) > 0)
            {
                iFilled += iRead;
                if (iFilled == cRecordLength)
                {
                    iRecordCounter++;
                    iFilled = 0;
                }
            }
            Console.WriteLine("\nfinished after " + iRecordCounter.ToString() + " records ");
        }

        if (iFilled != 0)
        {
            // The data did not end on a record boundary; report it instead of
            // silently counting a truncated record.
            Console.Error.WriteLine("warning: " + iFilled.ToString() + " trailing bytes do not form a complete record");
        }
    }

    /// <summary>
    /// Program entry point: runs <c>Unpack</c> on the sample archive.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        // Relative path, so the file is looked up in the current working directory.
        const string sSampleArchive = @"MyGzFile.gz";

        Unpack(sSampleArchive);
    }

The compressed file is 15MB in size and 1.7GB uncompressed. It should have 1'829'768 records! Not 694!

Unfortunately you need to download the 15MB datafile to see this effect.

Download UNIX file that causes decompressing problems here

2 Answers

0

That file is actually 175 gzip streams concatenated. A concatenation of valid gzip streams is also a valid gzip stream, per the standard, so the utilities will automatically decompress the next internal gzip stream when the last one finishes. However the library you are using stops at the end of the first gzip stream.

All you need to do is repeatedly use GZipStream until the entire file is consumed.

Here are the embedded gzip streams, as shown by pigz -ltv:

method    check    timestamp    compressed   original reduced  name
gzip 8  d3b504b5  Sep  5 16:15        5729     684284   99.2%  BFSQ101_1_hot...
gzip 8  ae275c49  Sep  5 16:15        6136     751332   99.2%  <...>
gzip 8  bdfc1fbe  Sep  5 16:15        2941     337212   99.1%  <...>
gzip 8  cc98f315  Sep  5 16:15        5185     567936   99.1%  <...>
gzip 8  9e5bd1e1  Sep  5 16:15        7765     739500   98.9%  <...>
gzip 8  3fb391df  Sep  5 16:15       33053    3184780   99.0%  <...>
gzip 8  b6949166  Sep  5 16:15       30367    3916392   99.2%  <...>
gzip 8  99e23f1c  Sep  5 16:15       21864    2879120   99.2%  <...>
gzip 8  aa1f465e  Sep  5 16:15       20451    2009468   99.0%  <...>
gzip 8  3d1235e8  Sep  5 16:15       35202    3445084   99.0%  <...>
gzip 8  88aa59b6  Sep  5 16:15        8877    1003748   99.1%  <...>
gzip 8  de849a6c  Sep  5 16:15       63357    8693562   99.3%  <...>
gzip 8  d16e00f0  Sep  5 16:15       20470    2888980   99.3%  <...>
gzip 8  65fdb6e3  Sep  5 16:15       77278    9244736   99.2%  <...>
gzip 8  f03ff362  Sep  5 16:15       20470    2888980   99.3%  <...>
gzip 8  1cc8c3c5  Sep  5 16:15      326938   44740736   99.3%  <...>
gzip 8  4f44a72d  Sep  5 16:15       65879    8338602   99.2%  <...>
gzip 8  dcefd273  Sep  5 16:15      422230   55740552   99.2%  <...>
gzip 8  5b465e02  Sep  5 16:15      483248   65961428   99.3%  <...>
gzip 8  47ea3377  Sep  5 16:15      268189   36305506   99.3%  <...>
gzip 8  87c40a36  Sep  5 16:15      492551   69229032   99.3%  <...>
gzip 8  e6ef226a  Sep  5 16:15       68290    8655108   99.2%  <...>
gzip 8  ed61760e  Sep  5 16:15      552074   75590704   99.3%  <...>
gzip 8  b6f9ba3c  Sep  5 16:15       52437    6164472   99.1%  <...>
gzip 8  543025f6  Sep  5 16:15       90414    9592794   99.1%  <...>
gzip 8  92ee391c  Sep  5 16:15      164410   20791782   99.2%  <...>
gzip 8  b7daa869  Sep  5 16:15      138439   17144568   99.2%  <...>
gzip 8  9e8f1d9a  Sep  5 16:15      159826   19611540   99.2%  <...>
gzip 8  9dd9ab4e  Sep  5 16:15      101783   12672072   99.2%  <...>
gzip 8  bf1da9bf  Sep  5 16:15      166638   21017576   99.2%  <...>
gzip 8  1fd50892  Sep  5 16:15       26224    3107872   99.2%  <...>
gzip 8  bba28914  Sep  5 16:15       20300    2677976   99.2%  <...>
gzip 8  a24aa94c  Sep  5 16:15        4449     500888   99.1%  <...>
gzip 8  ea7123bb  Sep  5 16:15       83639   10446670   99.2%  <...>
gzip 8  a069401a  Sep  5 16:15      166892   23182832   99.3%  <...>
gzip 8  da4e52d0  Sep  5 16:15       47183    6617046   99.3%  <...>
gzip 8  1216a764  Sep  5 16:15      361812   46365664   99.2%  <...>
gzip 8  d3323b7d  Sep  5 16:15       67408    8609752   99.2%  <...>
gzip 8  d4f43a26  Sep  5 16:15       70830    8303106   99.1%  <...>
gzip 8  b44e6519  Sep  5 16:15      213527   26468184   99.2%  <...>
gzip 8  128809df  Sep  5 16:15      220496   24231936   99.1%  <...>
gzip 8  be81d102  Sep  5 16:15      191825   26160552   99.3%  <...>
gzip 8  5f4eaa67  Sep  5 16:15      442912   59668776   99.3%  <...>
gzip 8  8c75e7d6  Sep  5 16:15      256035   30062154   99.1%  <...>
gzip 8  382ce67c  Sep  5 16:15      293235   37140648   99.2%  <...>
gzip 8  909d5680  Sep  5 16:15       37789    4632228   99.2%  <...>
gzip 8  0be66753  Sep  5 16:15       58300    6980880   99.2%  <...>
gzip 8  105487b8  Sep  5 16:15       76587    9386720   99.2%  <...>
gzip 8  54040854  Sep  5 16:15       37060    4823512   99.2%  <...>
gzip 8  c09af134  Sep  5 16:15        1094     107474   99.0%  <...>
gzip 8  ef2973a4  Sep  5 16:15        2838     321436   99.1%  <...>
gzip 8  7043559d  Sep  5 16:15       10112    1248276   99.2%  <...>
gzip 8  65f07ae4  Sep  5 16:15       24974    2567544   99.0%  <...>
gzip 8  aaf51505  Sep  5 16:15       14963    1484916   99.0%  <...>
gzip 8  33b3a1f3  Sep  5 16:15        2258     203116   98.9%  <...>
gzip 8  2b359a91  Sep  5 16:15        3501     452574   99.2%  <...>
gzip 8  20128129  Sep  5 16:15        7028     709920   99.0%  <...>
gzip 8  28a17134  Sep  5 16:15        6230     759220   99.2%  <...>
gzip 8  455551c1  Sep  5 16:15       26686    2283576   98.8%  <...>
gzip 8  7ecb639d  Sep  5 16:15       17271    1656480   99.0%  <...>
gzip 8  7c24a95a  Sep  5 16:15       10855    1400120   99.2%  <...>
gzip 8  2890071c  Sep  5 16:15        1685     134096   98.7%  <...>
gzip 8  a427db4e  Sep  5 16:15       25814    3395784   99.2%  <...>
gzip 8  5676ccd5  Sep  5 16:15       26993    3056600   99.1%  <...>
gzip 8  8fe1db04  Sep  5 16:15       41813    3892728   98.9%  <...>
gzip 8  4dcb3991  Sep  5 16:15      101698    9197408   98.9%  <...>
gzip 8  b18b43de  Sep  5 16:15       52157    4500104   98.8%  <...>
gzip 8  7987b02e  Sep  5 16:15       71437    7324008   99.0%  <...>
gzip 8  a376a3e7  Sep  5 16:15       31074    2927434   98.9%  <...>
gzip 8  6e4dce06  Sep  5 16:15      123099   14076136   99.1%  <...>
gzip 8  341979f0  Sep  5 16:15      288225   21903990   98.7%  <...>
gzip 8  ea761b90  Sep  5 16:15       47784    4192472   98.9%  <...>
gzip 8  4ca18278  Sep  5 16:15      204873   21037296   99.0%  <...>
gzip 8  6bf8d60b  Sep  5 16:15      841722   71006790   98.8%  <...>
gzip 8  c70cec31  Sep  5 16:15      201080   20694168   99.0%  <...>
gzip 8  623ff1a5  Sep  5 16:15       51395    4851120   98.9%  <...>
gzip 8  99cb59b2  Sep  5 16:15       46544    4456720   99.0%  <...>
gzip 8  ba4c92ae  Sep  5 16:15       51541    4740688   98.9%  <...>
gzip 8  eec6606f  Sep  5 16:15       51052    5492020   99.1%  <...>
gzip 8  f2c2b159  Sep  5 16:15      225744   25294844   99.1%  <...>
gzip 8  e6c56db8  Sep  5 16:15       80392    6810302   98.8%  <...>
gzip 8  0a47cbee  Sep  5 16:15       95976   10349056   99.1%  <...>
gzip 8  d53102ce  Sep  5 16:15      165369   19276300   99.1%  <...>
gzip 8  baa32abf  Sep  5 16:15      251861   21441556   98.8%  <...>
gzip 8  722a4e05  Sep  5 16:15       23970    2390064   99.0%  <...>
gzip 8  eac99b0a  Sep  5 16:15       88130    7671080   98.9%  <...>
gzip 8  da942a4a  Sep  5 16:15       44471    5105508   99.1%  <...>
gzip 8  ced9902a  Sep  5 16:15       95539    8428328   98.9%  <...>
gzip 8  51f34298  Sep  5 16:15       43365    3423392   98.7%  <...>
gzip 8  86ab080e  Sep  5 16:15      168730   13053654   98.7%  <...>
gzip 8  d73827bd  Sep  5 16:15      133057   15227784   99.1%  <...>
gzip 8  84528a4b  Sep  5 16:15       33025    3265632   99.0%  <...>
gzip 8  62ea51c1  Sep  5 16:15       60933    6927636   99.1%  <...>
gzip 8  295c9880  Sep  5 16:15       29900    2912644   99.0%  <...>
gzip 8  e78a773e  Sep  5 16:15         892      41412   97.8%  <...>
gzip 8  11e4a15e  Sep  5 16:15      151748   16496766   99.1%  <...>
gzip 8  11207fbd  Sep  5 16:15      222248   26417898   99.2%  <...>
gzip 8  82fa34c7  Sep  5 16:15      110255   13269588   99.2%  <...>
gzip 8  8bea1780  Sep  5 16:15      450044   41707800   98.9%  <...>
gzip 8  01cccc85  Sep  5 16:15      285831   31337052   99.1%  <...>
gzip 8  e7f8bbcb  Sep  5 16:15      212141   22906752   99.1%  <...>
gzip 8  94d0062b  Sep  5 16:15      109822   12459096   99.1%  <...>
gzip 8  0ab76f7d  Sep  5 16:15      374525   33351450   98.9%  <...>
gzip 8  9130a5e9  Sep  5 16:15      218858   16550010   98.7%  <...>
gzip 8  e388cba6  Sep  5 16:15      471713   43913482   98.9%  <...>
gzip 8  3a0d1dfb  Sep  5 16:15      459824   43920384   99.0%  <...>
gzip 8  91760e55  Sep  5 16:15      332786   39882714   99.2%  <...>
gzip 8  081ce788  Sep  5 16:15      305461   28560476   98.9%  <...>
gzip 8  16b514e2  Sep  5 16:15       13501    1337016   99.0%  <...>
gzip 8  b25c8acc  Sep  5 16:15       39768    3824694   99.0%  <...>
gzip 8  f0c73cf0  Sep  5 16:15       26173    2549796   99.0%  <...>
gzip 8  76d0f641  Sep  5 16:15       13254    1309408   99.0%  <...>
gzip 8  a40f8b76  Sep  5 16:15       40375    3842442   98.9%  <...>
gzip 8  1abee777  Sep  5 16:15       25598    2480776   99.0%  <...>
gzip 8  6ece3ccb  Sep  5 16:15       23133    2715444   99.1%  <...>
gzip 8  d2636064  Sep  5 16:15       28378    2786436   99.0%  <...>
gzip 8  5498097b  Sep  5 16:15       60657    5843036   99.0%  <...>
gzip 8  c6b77dac  Sep  5 16:15       54419    5353980   99.0%  <...>
gzip 8  8e7a1860  Sep  5 16:15       11370    1153620   99.0%  <...>
gzip 8  e463b182  Sep  5 16:15        8813    1043188   99.2%  <...>
gzip 8  564990ea  Sep  5 16:15        9313    1100376   99.2%  <...>
gzip 8  192a05bf  Sep  5 16:15       13451    1248276   98.9%  <...>
gzip 8  4982aad7  Sep  5 16:15        1685     147900   98.9%  <...>
gzip 8  aeda5155  Sep  5 16:15       15587    1833960   99.2%  <...>
gzip 8  8613238d  Sep  5 16:15        3567     295800   98.8%  <...>
gzip 8  928ff4ce  Sep  5 16:15       13135    1490832   99.1%  <...>
gzip 8  b3ec3f6f  Sep  5 16:15        4611     489056   99.1%  <...>
gzip 8  0df0f802  Sep  5 16:15       10261    1133900   99.1%  <...>
gzip 8  4049e745  Sep  5 16:15        1527     147900   99.0%  <...>
gzip 8  fd24e643  Sep  5 16:15       19300    2082432   99.1%  <...>
gzip 8  bb8a811a  Sep  5 16:15        1634     181424   99.1%  <...>
gzip 8  7813a0a0  Sep  5 16:15       35854    3332680   98.9%  <...>
gzip 8  d33c8708  Sep  5 16:15        8095     865708   99.1%  <...>
gzip 8  1ee8f774  Sep  5 16:15        9779    1072768   99.1%  <...>
gzip 8  553a9e50  Sep  5 16:15       10424    1112208   99.1%  <...>
gzip 8  00b5fa47  Sep  5 16:15        8579    1064880   99.2%  <...>
gzip 8  08241c48  Sep  5 16:15       10274    1072768   99.0%  <...>
gzip 8  76d80a6b  Sep  5 16:15        9859     928812   98.9%  <...>
gzip 8  35b49c5d  Sep  5 16:15        5509     496944   98.9%  <...>
gzip 8  a8047163  Sep  5 16:15        2201     266220   99.2%  <...>
gzip 8  ce6bbd8e  Sep  5 16:15        6933     932756   99.3%  <...>
gzip 8  bb2146ef  Sep  5 16:15        7116     958392   99.3%  <...>
gzip 8  c807d58a  Sep  5 16:15        4894     481168   99.0%  <...>
gzip 8  93f562cb  Sep  5 16:15        6542     836128   99.2%  <...>
gzip 8  7fb242eb  Sep  5 16:15       19818    2311184   99.1%  <...>
gzip 8  d112692f  Sep  5 16:15       15558    1517454   99.0%  <...>
gzip 8  a28d6731  Sep  5 16:15       11137    1181228   99.1%  <...>
gzip 8  e922c39b  Sep  5 16:15        6333     769080   99.2%  <...>
gzip 8  036f7259  Sep  5 16:15        7628     989944   99.2%  <...>
gzip 8  a2080a25  Sep  5 16:15       11757    1522384   99.2%  <...>
gzip 8  2003aee8  Sep  5 16:15        9109    1001776   99.1%  <...>
gzip 8  5932e9c1  Sep  5 16:15       10453    1202920   99.1%  <...>
gzip 8  1fcfa239  Sep  5 16:15        7629     989944   99.2%  <...>
gzip 8  7ff3cd6d  Sep  5 16:15        7616     989944   99.2%  <...>
gzip 8  52a70d28  Sep  5 16:15       17018    2062712   99.2%  <...>
gzip 8  51ed84ac  Sep  5 16:15       38403    4831400   99.2%  <...>
gzip 8  c6afdd2d  Sep  5 16:15       29520    3277464   99.1%  <...>
gzip 8  5394e776  Sep  5 16:15       59291    7933356   99.3%  <...>
gzip 8  428b60b0  Sep  5 16:15       22275    2449224   99.1%  <...>
gzip 8  cffab915  Sep  5 16:15       30942    3407616   99.1%  <...>
gzip 8  2499f8d0  Sep  5 16:15       11122    1387302   99.2%  <...>
gzip 8  32348d9c  Sep  5 16:15       56977    6704800   99.2%  <...>
gzip 8  2ce9d004  Sep  5 16:15       25986    3005328   99.1%  <...>
gzip 8  6f6c8dff  Sep  5 16:15       20414    2875176   99.3%  <...>
gzip 8  8c0f4f3e  Sep  5 16:15       17090    2204696   99.2%  <...>
gzip 8  c4e49446  Sep  5 16:15       45659    5324400   99.1%  <...>
gzip 8  7daad546  Sep  5 16:15       24152    3241968   99.3%  <...>
gzip 8  3a028387  Sep  5 16:15       85243    8566368   99.0%  <...>
gzip 8  b2da9025  Sep  5 16:15      360323   30704040   98.8%  <...>
gzip 8  d22669f8  Sep  5 16:15       82586    7409790   98.9%  <...>
gzip 8  f5b3c916  Sep  5 16:15       89830    9906342   99.1%  <...>
gzip 8  b75ed982  Sep  5 16:15      141070   12332888   98.9%  <...>
gzip 8  05cc92e8  Sep  5 16:15       51712    5825288   99.1%  <...>
gzip 8  9d5fc6ca  Sep  5 16:15      148927   12666156   98.8%  <...>
gzip 8  640d5cee  Sep  5 16:15       11492    1120096   99.0%  <...>
Mark Adler
  • 101,978
  • 13
  • 118
  • 158
0

This is how it works.

        // Counts fixed-length (986-byte) records in a file that consists of
        // several concatenated gzip members. Each member is decompressed with
        // its own GZipStream; afterwards the raw file is scanned for the next
        // member header so that decompression can be restarted there.
        //
        // NOTE(review): this assumes GZipStream stops at the end of each
        // member. On a runtime whose GZipStream already continues into the
        // following members, this loop would count data twice - confirm for
        // the target runtime.
        const int cRecordLength = 986;
        const int cSpoolerLength = 1000000;
        byte[] ba = new byte[cRecordLength];
        byte[] baSpooler = new byte[cSpoolerLength];

        // Header bytes searched for: ID1 = 0x1f, ID2 = 0x8b, CM = 8 (deflate),
        // FLG = 0x08 (FNAME set). Members written with other flags are not found.
        // NOTE(review): the same four bytes could occur by chance inside
        // compressed data; such a false match is not detected here.
        byte[] baGzipHeader = { 0x1f, 0x8b, 0x08, 0x08 };

        int iRecordCounter = 0;
        int iFileCounter = 0;
        int iFilled = 0; // bytes of the current record already in ba (kept across members)

        long lSeekPosition = 0;

        using (FileStream fileStream = new FileStream(p_sZippedFileName, FileMode.Open, FileAccess.Read))
        {
            long lFileLength = fileStream.Length;
            bool bMoreStreams = lFileLength > 0;

            while (bMoreStreams)
            {
                fileStream.Seek(lSeekPosition, SeekOrigin.Begin);
                Console.Write("Stream begins at :" + lSeekPosition.ToString());

                // leaveOpen = true: the file stream must survive disposing the GZipStream.
                using (GZipStream zipStream = new GZipStream(fileStream, CompressionMode.Decompress, true))
                {
                    // Read may return fewer bytes than requested, so fill the
                    // record buffer completely before counting a record.
                    int iRead;
                    while ((iRead = zipStream.Read(ba, iFilled, cRecordLength - iFilled)) > 0)
                    {
                        iFilled += iRead;
                        if (iFilled == cRecordLength)
                        {
                            iRecordCounter++;
                            iFilled = 0;
                        }
                    }
                }

                // The GZipStream reads the file in buffers, so the file pointer
                // is now at or behind the start of the next member. Search
                // between the current header (+ 4 skips its own 1F 8B 08 08
                // pattern) and that position for the next header.
                long lLastSeekPosition = fileStream.Position;
                long lScanStart = lSeekPosition + 4;
                long lScanEnd = Math.Min(lLastSeekPosition + 4, lFileLength); // exclusive
                long lNextHeader = -1;
                long lScanned = 0;
                int iMatched = 0; // number of header bytes matched so far

                if (lScanStart < lScanEnd)
                {
                    fileStream.Seek(lScanStart, SeekOrigin.Begin);
                }

                // Scan in chunks so the region may be larger than the buffer;
                // iMatched carries a partial match across chunk boundaries.
                long lChunkStart = lScanStart;
                while (lNextHeader < 0 && lChunkStart < lScanEnd)
                {
                    int iWanted = (int)Math.Min((long)baSpooler.Length, lScanEnd - lChunkStart);
                    int iLength = fileStream.Read(baSpooler, 0, iWanted);
                    if (iLength <= 0)
                    {
                        break;
                    }

                    for (int i = 0; i < iLength; i++)
                    {
                        byte b = baSpooler[i];
                        if (b == baGzipHeader[iMatched])
                        {
                            iMatched++;
                            if (iMatched == baGzipHeader.Length)
                            {
                                // i is the last header byte; the header starts 3 bytes earlier.
                                lNextHeader = lChunkStart + i - (baGzipHeader.Length - 1);
                                break;
                            }
                        }
                        else
                        {
                            // 0x1f occurs only as the first header byte, so a
                            // mismatch can restart a match only on that byte.
                            iMatched = (b == baGzipHeader[0]) ? 1 : 0;
                        }
                    }

                    lChunkStart += iLength;
                    lScanned += iLength;
                }

                Console.WriteLine("\nStream: " + iFileCounter.ToString() + " Records: " + iRecordCounter.ToString() + " Pos: " + lSeekPosition.ToString() + " Pos2: " + lLastSeekPosition.ToString() + " Len: " + (lLastSeekPosition - lSeekPosition).ToString() + " Len2: " + lScanned.ToString());
                iFileCounter++;

                if (lNextHeader < 0)
                {
                    // No further member header found: the whole file is consumed.
                    bMoreStreams = false;
                }
                else
                {
                    lSeekPosition = lNextHeader;
                }
            }
        }

        if (iFilled != 0)
        {
            // The data did not end on a record boundary; report it instead of
            // silently counting a truncated record.
            Console.Error.WriteLine("warning: " + iFilled.ToString() + " trailing bytes do not form a complete record");
        }