33

I'm trying to figure out the best way to compress a stream with Python's zlib.

I've got a file-like input stream (input, below) and an output function which accepts a file-like (output_function, below):

with open("file") as input:
    output_function(input)

And I'd like to gzip-compress input chunks before sending them to output_function:

with open("file") as input:
    output_function(gzip_stream(input))

It looks like the gzip module assumes that either the input or the output will be a gzip'd file-on-disk… So I assume that the zlib module is what I want.

However, it doesn't natively offer a simple way to create a streaming file-like object… and the stream compression it does support comes by way of manually adding data to a compression buffer, then flushing that buffer.

Of course, I could write a wrapper around zlib.Compress.compress and zlib.Compress.flush (Compress is returned by zlib.compressobj()), but I'd be worried about getting buffer sizes wrong, or something similar.
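
For concreteness, the compress/flush dance looks roughly like this (just a sketch; gzip_chunks and the 16 KB chunk size are made up, and the wbits flag for gzip framing is exactly the sort of detail I'd rather not get wrong). It also still yields chunks rather than the file-like that output_function expects:

import zlib

def gzip_chunks(input, chunk_size=16 * 1024):
    # MAX_WBITS | 16 asks zlib for gzip framing (header and trailer)
    compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS | 16)
    while True:
        chunk = input.read(chunk_size)
        if not chunk:
            break
        data = compressor.compress(chunk)
        if data:
            yield data
    yield compressor.flush()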

So, what's the simplest way to create a streaming, gzip-compressing file-like with Python?

Edit: To clarify, the input stream and the compressed output stream are both too large to fit in memory, so something like output_function(StringIO(zlib.compress(input.read()))) doesn't really solve the problem.

David Wolever
  • I've found an implementation of the opposite thing - a file-like that decompresses a gzip'd stream - over at effbot: http://effbot.org/librarybook/zlib.htm … But I'm looking for the opposite (although I suppose it could be helpful if I need to write my own) – David Wolever Feb 03 '10 at 14:35

6 Answers

15

It's quite kludgy (self-referencing, etc.; I just spent a few minutes writing it, nothing really elegant), but it does what you want if you're still interested in using gzip instead of zlib directly.

Basically, GzipWrap is a (very limited) file-like object that produces a gzipped file out of a given iterable (e.g., a file-like object, a list of strings, any generator...)

Of course, it produces binary output, so there was no sense in implementing "readline".

You should be able to expand it to cover other cases or to be used as an iterable object itself.

from gzip import GzipFile

class GzipWrap(object):
    # input is a filelike object that feeds the input
    def __init__(self, input, filename = None):
        self.input = input
        self.buffer = ''
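        # GzipFile will write its compressed output back into this object via
        # write() below; that's the self-referencing part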
        self.zipper = GzipFile(filename, mode = 'wb', fileobj = self)

    def read(self, size=-1):
        if (size < 0) or len(self.buffer) < size:
            for s in self.input:
                self.zipper.write(s)
                if size > 0 and len(self.buffer) >= size:
                    self.zipper.flush()
                    break
            else:
                self.zipper.close()
            if size < 0:
                ret = self.buffer
                self.buffer = ''
            else:
                ret, self.buffer = self.buffer[:size], self.buffer[size:]
        else:
            ret, self.buffer = self.buffer[:size], self.buffer[size:]
        return ret

    def flush(self):
        pass

    def write(self, data):
        self.buffer += data

    def close(self):
        self.input.close()
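
Hypothetical usage with the question's names (the filename argument just fills the gzip header's name field):

with open("file") as input:
    output_function(GzipWrap(input, filename="file"))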
Ricardo Cárdenes
  • (ok, so I see your point that it's not particularly elegant to pass 'self' to the GzipFile… But I still think it's a neat hack). – David Wolever Feb 03 '10 at 16:38
  • I've corrected a little bug in the code. When reading with size < 0, it didn't clear the buffer. I don't think you'll be using it like that, but a bug is a bug... O:) – Ricardo Cárdenes Feb 03 '10 at 16:40
  • why isn't something like this provided by the standard library? What good is a gzip utility that doesn't stream? It's usually going to be the case that the whole file won't fit in memory (it's gzipped for a reason after all). – Kevin Feb 15 '12 at 15:25
  • I guess they included it for internal use. Haven't checked if it's changed during the last year, though... – Ricardo Cárdenes Feb 15 '12 at 18:20
  • The current version of this answer reads one line at a time into memory. If the file you are compressing has very few newlines, this can be a problem. – Talia Jul 22 '15 at 14:00
11

Here is a cleaner, non-self-referencing version based on Ricardo Cárdenes' very helpful answer.

from gzip import GzipFile
from collections import deque


CHUNK = 16 * 1024


class Buffer (object):
    def __init__ (self):
        self.__buf = deque()
        self.__size = 0
    def __len__ (self):
        return self.__size
    def write (self, data):
        self.__buf.append(data)
        self.__size += len(data)
    def read (self, size=-1):
        if size < 0: size = self.__size
        ret_list = []
        while size > 0 and len(self.__buf):
            s = self.__buf.popleft()
            size -= len(s)
            ret_list.append(s)
        if size < 0:
            ret_list[-1], remainder = ret_list[-1][:size], ret_list[-1][size:]
            self.__buf.appendleft(remainder)
        ret = ''.join(ret_list)
        self.__size -= len(ret)
        return ret
    def flush (self):
        pass
    def close (self):
        pass


class GzipCompressReadStream (object):
    def __init__ (self, fileobj):
        self.__input = fileobj
        self.__buf = Buffer()
        self.__gzip = GzipFile(None, mode='wb', fileobj=self.__buf)
    def read (self, size=-1):
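        # Feed input through the compressor until at least `size` compressed
        # bytes are buffered (or the input is exhausted), then serve from the buffer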
        while size < 0 or len(self.__buf) < size:
            s = self.__input.read(CHUNK)
            if not s:
                self.__gzip.close()
                break
            self.__gzip.write(s)
        return self.__buf.read(size)
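
A quick round-trip sanity check (Python 2 here, to match the code above; the names are illustrative):

from cStringIO import StringIO
from gzip import GzipFile

original = "x" * (4 * CHUNK)
compressed = GzipCompressReadStream(StringIO(original)).read()
assert GzipFile(fileobj=StringIO(compressed)).read() == original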

Advantages:

  • Avoids repeated string concatenation, which would cause the entire string to be copied repeatedly.
  • Reads a fixed CHUNK size from the input stream, instead of reading whole lines at a time (which can be arbitrarily long).
  • Avoids circular references.
  • Avoids a misleading public "write" method on the read stream itself (in GzipWrap it exists but is really only used internally); here the internal Buffer handles it.
  • Takes advantage of name mangling for internal member variables.
Talia
  • For Python 3 use `ret = b''.join(ret_list)` instead, and if your input stream yields str then you have to encode it, e.g. `self.__gzip.write(s.encode('utf-8'))` – dstandish Feb 25 '19 at 18:29
6

The gzip module supports compressing to a file-like object: pass a fileobj parameter to GzipFile, as well as a filename. The filename you pass in doesn't need to exist; it's only used to fill in the filename field of the gzip header.
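
For compression, that amounts to something like this (a minimal sketch; any writable file-like object can stand in for out, and the names here are made up):

import gzip

with open("compressed.gz", "wb") as out:
    gz = gzip.GzipFile("file", mode="wb", fileobj=out)
    gz.write(b"data to compress")   # compressed bytes are written to out
    gz.close()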

Update

This answer does not work. Example:

# tmp/try-gzip.py 
import sys
import gzip

fd=gzip.GzipFile(fileobj=sys.stdin)
sys.stdout.write(fd.read())

output:

===> cat .bash_history  | python tmp/try-gzip.py  > tmp/history.gzip
Traceback (most recent call last):
  File "tmp/try-gzip.py", line 7, in <module>
    sys.stdout.write(fd.read())
  File "/usr/lib/python2.7/gzip.py", line 254, in read
    self._read(readsize)
  File "/usr/lib/python2.7/gzip.py", line 288, in _read
    pos = self.fileobj.tell()   # Save current position
IOError: [Errno 29] Illegal seek
user249290 (answer edited by guettli)
  • Mmmm… I hadn't noticed that… But I'm not sure it will work: either the `fileobj` must be a gzip'd input stream, or an output stream which the gzip'd data will be written to. So, better than nothing, but still not quite what I'd like. – David Wolever Feb 03 '10 at 14:41
  • Please explain why this does not solve your problem. I use `fd=gzip.GzipFile(fileobj=fd)` and it works like it should. – guettli Apr 24 '13 at 13:28
  • @guettli the author is expecting that the `fd` object does not have `seek` method. – Andrey Cizov Nov 04 '15 at 15:34
  • @AndreyCizov yes, you are right. I checked it and edited the answer. I hope user249290 has nothing against this. – guettli Nov 04 '15 at 16:06
2

Use the cStringIO (or StringIO) module in conjunction with zlib:

>>> import zlib
>>> from cStringIO import StringIO
>>> s = StringIO()
>>> s.write(zlib.compress("I'm a lumberjack"))
>>> s.seek(0)
>>> zlib.decompress(s.read())
"I'm a lumberjack"
jcdyer
  • The problem with this, though, is that the entire input stream must be loaded into memory (when it's passed to `zlib.compress`) and then must be loaded into memory *again* when it is returned from `zlib.decompress`. – David Wolever Feb 03 '10 at 15:05
  • It never leaves memory, if you use StringIO. You said in your question that you wanted a "file-like object", which is common python terminology for an object that has similar methods to a file object. It doesn't say anything about whether it lives on disk or not. But then you also suggested that you didn't want a gz file. Can you please be more clear about what you are really looking for? – jcdyer Feb 03 '10 at 15:49
  • Err, sorry - yes, that is my fault. In my mind "file-like object" implies "something intended to be processed in chunks", but I guess that's a faulty assumption. I have updated the question. – David Wolever Feb 03 '10 at 16:05
  • have you looked at `zlib.compressobj()` and `zlib.decompressobj()`? Perfect for chunking. – jcdyer Feb 03 '10 at 17:29
  • Yup, I have. As I mentioned (albeit not very clearly), they work, but their interface isn't very standard, and it could depend on my getting things like buffer sizes correct. – David Wolever Feb 03 '10 at 18:01
0

This works (at least in Python 3):

with s3.open(path, 'wb') as f:
    gz = gzip.GzipFile(filename, 'wb', 9, f)
    gz.write(b'hello')
    gz.flush()
    gz.close()

Here it writes to s3fs's file object with gzip compression applied. The magic is the f parameter, which is GzipFile's fileobj. You have to provide a filename for gzip's header.

user582175
0

An even cleaner & more generalized version made of reusable components:

gzipped_iter = igzip(io_iter(input_file_obj))
gzipped_file_obj = iter_io(prefetch(gzipped_iter))

The functions above are from my gist:

  • iter_io and io_iter provide transparent conversion to/from Iterable[AnyStr] <-> SupportsRead[AnyStr]
  • igzip does streaming gzip compression
  • (optional) prefetch concurrently pulls from an underlying iterable via a thread, yielding to consumer as normal, for concurrent read/write
from __future__ import annotations

# Imports assumed by the gist code below (not shown in the original excerpt):
import io
import zlib
from queue import Queue
from threading import Thread
from typing import TYPE_CHECKING, Any, AnyStr, Iterable, Iterator

if TYPE_CHECKING:
    from _typeshed import SupportsRead  # typing-only protocol used in hints below


def as_bytes(s: str | bytes):
    if type(s) not in [str, bytes]:
        raise TypeError
    return s.encode() if isinstance(s, str) else s


def iter_io(iterable: Iterable[AnyStr], buffer_size: int = io.DEFAULT_BUFFER_SIZE):
    """
    Returns a buffered file obj that reads bytes from an iterable of str/bytes.

    Example:

    iter_io(['abc', 'def', 'g']).read() == b'abcdefg'
    iter_io([b'abcd', b'efg']).read(5) == b'abcde'
    """
    class IterIO(io.RawIOBase):
        def __init__(self, iterable: Iterable[AnyStr]):
            self._leftover = b''
            self._iterable = (as_bytes(s) for s in iterable if s)

        def readable(self):
            return True

        def readinto(self, buf):
            try:
                chunk = self._leftover or next(self._iterable)
            except StopIteration:
                return 0    # indicate EOF

            output, self._leftover = chunk[:len(buf)], chunk[len(buf):]
            buf[:len(output)] = output
            return len(output)

    return io.BufferedReader(IterIO(iterable), buffer_size=buffer_size)


def io_iter(fo: SupportsRead[AnyStr], size: int = io.DEFAULT_BUFFER_SIZE):
    """
    Returns an iterator that reads from a file obj in sized chunks.

    Example:

    list(io_iter(io.StringIO('abcdefg'), 3)) == ['abc', 'def', 'g']
    list(io_iter(io.BytesIO(b'abcdefg'), 4)) == [b'abcd', b'efg']

    Usage notes/TODO:
     * file obj isn't closed, fix /w keep_open=False and an internal contextmanager
    """
    return iter(lambda: fo.read(size), fo.read(0))


def igzip(chunks: Iterable[AnyStr], level=zlib.Z_DEFAULT_COMPRESSION):
    """
    Streaming gzip: lazily compresses an iterable of bytes or str (utf8)

    Example:

    gzipped_bytes_iter = igzip(['hello ', 'world!'])
    gzip.decompress(b''.join(gzipped_bytes_iter)).decode() == 'hello world!'
    """
    def gen():
        gzip_format = 0b10000
        c = zlib.compressobj(level=level, wbits=zlib.MAX_WBITS + gzip_format)

        yield from (c.compress(as_bytes(chunk)) for chunk in chunks)
        yield c.flush()

    return filter(None, gen())


def prefetch(iterable: Iterable[Any], n: int = 1) -> Iterator[Any]:
    """
    Prefetch an iterable via thread, yielding original contents as normal.

    Example:

    def slow_produce(*args):
        for x in args:
            time.sleep(1)
            yield x

    def slow_consume(iterable):
        for _ in iterable:
            time.sleep(1)

    slow_consume(prefetch(slow_produce('a', 'b')))  # takes 3 sec, not 4

    # Prefetch
    # produce: | 'a' | 'b' |
    # consume:       | 'a' | 'b' |
    # seconds: 0 --- 1 --- 2 --- 3

    # No prefetch
    # produce: | 'a' |     | 'b' |
    # consume:       | 'a' |     | 'b' |
    # seconds: 0 --- 1 --- 2 --- 3 --- 4

    Usage notes/TODO:
     * mem leak: Thread is GC'd only after iterable is fully consumed, fix /w __del__
    """
    queue = Queue(n)
    finished = object()

    def produce():
        for x in iterable:
            queue.put(x)
        queue.put(finished)

    t = Thread(target=produce, daemon=True)
    t.start()

    while True:
        item = queue.get()
        if item is finished:
            break
        else:
            yield item
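
Tying it back to the question's setup, hypothetical usage looks like:

with open("file", "rb") as input_file_obj:
    gzipped_file_obj = iter_io(prefetch(igzip(io_iter(input_file_obj))))
    output_function(gzipped_file_obj)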
Kache