18

I'm trying to compress a string in one module and decompress it in another module. Here is the code I'm using.

Compress

/// <summary>
/// GZip-compresses <paramref name="text"/> and returns the result as a Base64 string.
/// </summary>
/// <remarks>
/// The Base64-decoded payload is a 4-byte prefix holding the length of the
/// uncompressed data (as produced by <c>BitConverter.GetBytes(int)</c>), followed
/// by the GZip stream. A consumer therefore has to Base64-decode the string and
/// skip the first 4 bytes before handing the rest to a GZip decompressor.
/// </remarks>
/// <param name="text">The text to compress.</param>
/// <returns>Base64 text containing the length prefix and the compressed data.</returns>
public static string CompressString(string text)
{
    // NOTE: ASCII is kept so the output stays identical for existing consumers;
    // characters outside the ASCII range are replaced with '?' by this encoding.
    byte[] buffer = Encoding.ASCII.GetBytes(text);

    using (MemoryStream ms = new MemoryStream())
    {
        // leaveOpen = true: the GZipStream must be disposed to write its trailer,
        // but the MemoryStream has to stay usable afterwards.
        using (GZipStream zip = new GZipStream(ms, CompressionMode.Compress, true))
        {
            zip.Write(buffer, 0, buffer.Length);
        }

        // ToArray copies everything written so far, independent of Position,
        // so there is no unchecked Read call and no rewinding.
        byte[] compressed = ms.ToArray();

        byte[] gzBuffer = new byte[compressed.Length + 4];
        System.Buffer.BlockCopy(BitConverter.GetBytes(buffer.Length), 0, gzBuffer, 0, 4);
        System.Buffer.BlockCopy(compressed, 0, gzBuffer, 4, compressed.Length);
        return Convert.ToBase64String(gzBuffer);
    }
}

Decompress

/// <summary>
/// Inflates a GZip-compressed buffer.
/// </summary>
/// <param name="data">Bytes that form a GZip stream, starting at index 0.</param>
/// <returns>The decompressed bytes.</returns>
public static byte[] DecompressString(byte[] data)
{
    using (var output = new MemoryStream())
    {
        using (var input = new MemoryStream(data))
        {
            using (var gzip = new GZipStream(input, CompressionMode.Decompress))
            {
                // Pump everything the decoder produces into the output buffer.
                gzip.CopyTo(output);
            }
        }

        return output.ToArray();
    }
}

Using it as:

 // NOTE(review): this hands the ASCII bytes of `ip` straight to the GZip decoder.
 // Presumably `ip` is the string returned by CompressString; if so, it is Base64 text
 // whose decoded bytes start with a 4-byte length prefix, so the decoder is not given
 // a GZip header at offset 0 — confirm against the caller.
 DecompressString(System.Text.Encoding.ASCII.GetBytes(ip));

But for the above statement, I'm getting the following error.

{"The magic number in GZip header is not correct. Make sure you are passing in a GZip stream."} System.SystemException {System.IO.InvalidDataException}

EstevaoLuis
  • 2,422
  • 7
  • 33
  • 40
benjamin54
  • 1,280
  • 4
  • 28
  • 48
  • 2
    Uhm, you're base64 encoding the result of compression but you're not decoding it before decompressing? – Lasse V. Karlsen Aug 05 '14 at 09:06
  • 1
    Indeed - `DecompressString(Convert.FromBase64String(ip));` would make more sense. (Although I'd also suggest that your methods should be symmetric, e.g. make them both accept and return strings, or maybe `byte[] Compress(string)` and `string Decompress(byte[])`. – Jon Skeet Aug 05 '14 at 09:06
  • Additionally you're stuffing in the length of the compressed data at the start of those bytes, you need to remove those as well before decompressing. Additionally it would make more sense to store the length of the uncompressed data, to handle slight problems at the end of decompression. – Lasse V. Karlsen Aug 05 '14 at 09:08

2 Answers

26

Here is a rewrite of your code that should work the way you want it to.

I wrote it in LINQPad and it can be tested in that.

Note that there's very little error checking here. You should add checks that every read operation completes and has actually read the number of bytes it was supposed to, along with similar checks.

The output

original: 256
This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test.  

compressed: 56
AAEAAB+LCAAAAAAABAALycgsVgCiRIWS1OISPYWQEcYHANU9d5YAAQAA 

decompressed: 256
This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test. This is a test.  

The program

// LINQPad entry point: builds a 256-character sample string, round-trips it
// through Compress/Decompress, and dumps all three strings with their lengths.
void Main()
{
    string input = "This is a test. This is a test. ";

    // Doubling the 32-character seed three times yields 256 characters.
    for (int round = 0; round < 3; round++)
    {
        input += input;
    }

    string compressed = Compress(input);
    string decompressed = Decompress(compressed);

    input.Dump("original: " + input.Length);
    compressed.Dump("compressed: " + compressed.Length);
    decompressed.Dump("decompressed: " + decompressed.Length);
}

/// <summary>
/// Turns Base64 text back into the original string: Base64-decode, decompress
/// via the <c>byte[]</c> overload, then decode the bytes as UTF-8.
/// </summary>
/// <param name="input">Base64 text to decode and decompress.</param>
/// <returns>The decompressed text.</returns>
public static string Decompress(string input)
{
    byte[] plainBytes = Decompress(Convert.FromBase64String(input));
    return Encoding.UTF8.GetString(plainBytes);
}

/// <summary>
/// Encodes <paramref name="input"/> as UTF-8, compresses it via the
/// <c>byte[]</c> overload, and returns the result as Base64 text.
/// </summary>
/// <param name="input">The text to compress.</param>
/// <returns>Base64 representation of the compressed bytes.</returns>
public static string Compress(string input)
{
    byte[] packed = Compress(Encoding.UTF8.GetBytes(input));
    return Convert.ToBase64String(packed);
}

/// <summary>
/// Decompresses a buffer laid out as a 4-byte length prefix (the size of the
/// uncompressed data) followed by a GZip stream.
/// </summary>
/// <param name="input">The length-prefixed compressed buffer.</param>
/// <returns>The decompressed bytes; the array length equals the length prefix.</returns>
/// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
/// <exception cref="InvalidDataException">
/// The buffer is shorter than the prefix, the prefix is negative, or the
/// compressed data ends before the announced number of bytes was produced.
/// </exception>
public static byte[] Decompress(byte[] input)
{
    if (input == null)
    {
        throw new ArgumentNullException("input");
    }

    const int HeaderSize = 4;
    if (input.Length < HeaderSize)
    {
        throw new InvalidDataException("Input is too short to contain the 4-byte length prefix.");
    }

    int length = BitConverter.ToInt32(input, 0);
    if (length < 0)
    {
        throw new InvalidDataException("The length prefix is negative.");
    }

    // Expose only the bytes after the prefix to the GZip decoder.
    using (var source = new MemoryStream(input, HeaderSize, input.Length - HeaderSize))
    using (var decompressionStream = new GZipStream(source, CompressionMode.Decompress))
    {
        var result = new byte[length];
        int totalRead = 0;

        // Stream.Read may return fewer bytes than requested, so keep reading
        // until the buffer is full instead of trusting a single call.
        while (totalRead < length)
        {
            int bytesRead = decompressionStream.Read(result, totalRead, length - totalRead);
            if (bytesRead == 0)
            {
                throw new InvalidDataException("The compressed data ended before the expected number of bytes was read.");
            }

            totalRead += bytesRead;
        }

        return result;
    }
}

/// <summary>
/// Compresses <paramref name="input"/> with GZip and prefixes the result with
/// the uncompressed length as 4 bytes.
/// </summary>
/// <param name="input">The bytes to compress.</param>
/// <returns>The 4-byte length prefix followed by the GZip stream.</returns>
public static byte[] Compress(byte[] input)
{
    using (var buffer = new MemoryStream())
    {
        // Prefix: size of the uncompressed data (BitConverter yields 4 bytes for an int).
        byte[] header = BitConverter.GetBytes(input.Length);
        buffer.Write(header, 0, 4);

        using (var gzip = new GZipStream(buffer, CompressionMode.Compress))
        {
            gzip.Write(input, 0, input.Length);
            gzip.Flush();
        }

        // Disposing the GZipStream above finishes the GZip data; MemoryStream.ToArray
        // still works even though that disposal also closed the MemoryStream.
        return buffer.ToArray();
    }
}
Lasse V. Karlsen
  • 380,855
  • 102
  • 628
  • 825
  • Note that I'm not positive that you need to write out the length of the uncompressed data before the compressed data. With some compression methods the last byte or code can be interpreted as more data if you try to read more data than was present when compressing, due to leftover bits, but I'm not sure this holds for gzip, so that part may be unnecessary. – Lasse V. Karlsen Aug 05 '14 at 09:57
  • In fact testing by padding from 1 to 1000 additional characters to the end of the input seems to suggest this is not needed. Someone that knows gzip would have to comment though. – Lasse V. Karlsen Aug 05 '14 at 09:59
2

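.NET 6+ solution (with a fix for https://github.com/dotnet/runtime/issues/64577 — `Decompress(byte[])` now calls `Read` in a loop, because a single `GZipStream.Read` call may return fewer bytes than requested):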

// LINQPad entry point: round-trips a 256-character sample through
// Compress/Decompress and dumps each string labelled with its length.
void Main()
{
    string input = "This is a test. This is a test. ";

    // Three doublings of the 32-character seed give 256 characters.
    int doublings = 3;
    while (doublings-- > 0)
    {
        input += input;
    }

    string compressed = Compress(input);
    string decompressed = Decompress(compressed);

    input.Dump("original: " + input.Length);
    compressed.Dump("compressed: " + compressed.Length);
    decompressed.Dump("decompressed: " + decompressed.Length);
}

/// <summary>
/// Base64-decodes <paramref name="input"/>, decompresses it via the
/// <c>byte[]</c> overload, and decodes the result as UTF-8 text.
/// </summary>
/// <param name="input">Base64 text to decode and decompress.</param>
/// <returns>The decompressed text.</returns>
public static string Decompress(string input) =>
    Encoding.UTF8.GetString(Decompress(Convert.FromBase64String(input)));

/// <summary>
/// UTF-8-encodes <paramref name="input"/>, compresses it via the
/// <c>byte[]</c> overload, and returns the compressed bytes as Base64 text.
/// </summary>
/// <param name="input">The text to compress.</param>
/// <returns>Base64 representation of the compressed bytes.</returns>
public static string Compress(string input) =>
    Convert.ToBase64String(Compress(Encoding.UTF8.GetBytes(input)));

/// <summary>
/// Decompresses a buffer made of a 4-byte length prefix (the size of the
/// uncompressed data) followed by a GZip stream.
/// </summary>
/// <param name="input">The length-prefixed compressed buffer.</param>
/// <returns>The decompressed bytes; the array length equals the length prefix.</returns>
/// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
/// <exception cref="InvalidDataException">
/// The prefix is incomplete or negative, or the compressed data ends before
/// the announced number of bytes was produced.
/// </exception>
public static byte[] Decompress(byte[] input)
{
    if (input == null)
    {
        throw new ArgumentNullException(nameof(input));
    }

    using (var source = new MemoryStream(input))
    {
        byte[] lengthBytes = new byte[4];

        // A short count here means the buffer holds fewer than 4 bytes in total.
        if (source.Read(lengthBytes, 0, lengthBytes.Length) != lengthBytes.Length)
        {
            throw new InvalidDataException("Input is too short to contain the 4-byte length prefix.");
        }

        int length = BitConverter.ToInt32(lengthBytes, 0);
        if (length < 0)
        {
            throw new InvalidDataException("The length prefix is negative.");
        }

        using (var decompressionStream = new GZipStream(source,
            CompressionMode.Decompress))
        {
            var result = new byte[length];
            int totalRead = 0;

            // Read may hand back fewer bytes than requested, so loop until the
            // buffer is full; a zero-byte read before that means truncated data.
            while (totalRead < length)
            {
                int bytesRead = decompressionStream.Read(result, totalRead, length - totalRead);
                if (bytesRead == 0)
                {
                    throw new InvalidDataException("The compressed data ended before the expected number of bytes was read.");
                }

                totalRead += bytesRead;
            }

            return result;
        }
    }
}

/// <summary>
/// Produces a buffer holding the uncompressed length of <paramref name="input"/>
/// as 4 bytes, followed by the GZip-compressed data.
/// </summary>
/// <param name="input">The bytes to compress.</param>
/// <returns>The 4-byte length prefix followed by the GZip stream.</returns>
public static byte[] Compress(byte[] input)
{
    using (var output = new MemoryStream())
    {
        // Length prefix first, so a reader knows how many bytes to expect.
        output.Write(BitConverter.GetBytes(input.Length), 0, 4);

        using (var zipper = new GZipStream(output, CompressionMode.Compress))
        {
            zipper.Write(input, 0, input.Length);
            zipper.Flush();
        }

        // The GZip data is complete once the GZipStream has been disposed;
        // ToArray is still permitted on the now-closed MemoryStream.
        return output.ToArray();
    }
}
DaGrisa
  • 21
  • 2
  • 1
    It would be good to explain the differences and reasoning in the answer, and for .NET 6 use `Span` where possible/meaningful. – Ray Mar 17 '23 at 09:48