There is a memory leak in my encode method.

static const size_t _bits_per_digit = sizeof(digit) * CHAR_BIT;

/**
 * Returns a Python int storing the bitstream of the Elias gamma encoded input list.
 */
static PyObject *
encode(PyObject *self, PyObject *obj)
{
    if (!PyList_Check(obj)) {
        PyErr_SetString(PyExc_TypeError, "Input to encode must be a list");
        return NULL;
    }

    const Py_ssize_t list_length = PyList_GET_SIZE(obj);
    size_t size = _ceil_div((size_t) PyList_GET_SIZE(obj), _bits_per_digit);

    size_t current_digit = 0;
    size_t current_bit = 0;

    PyLongObject *bstream = _PyLong_New(size);
    if (bstream == NULL) {
        return NULL; // _PyLong_New sets errors
    }

    for (Py_ssize_t i = 0; i < list_length; i++)
    {
        PyObject *item = PyList_GET_ITEM(obj, i);
        unsigned long number = PyLong_AsUnsignedLong(item);
        if (PyErr_Occurred()) { // number overflowed or was negative
            PyErr_Format(PyExc_ValueError, "Number at list index %zd was negative or too large", i);
            return NULL;
        } else if (number == 0) { // unencodable
            PyErr_Format(PyExc_ValueError, "Zero at list index %zd", i);
            return NULL;
        }
        size_t N = _fast_floor_log2(number);
        size_t required_size = _ceil_div(current_digit * _bits_per_digit + current_bit + N * 2 + 1 + list_length - i - 1, _bits_per_digit);
        if (size < required_size) {
            size = required_size;
            PyLongObject *new = (PyLongObject *) PyObject_Realloc(bstream, offsetof(PyLongObject, ob_digit) + size * _bits_per_digit);
            if(new == NULL) {
                PyObject_Free(bstream);
                return PyErr_NoMemory();
            }
            bstream = new;
            PyObject_InitVar((PyVarObject*)bstream, &PyLong_Type, size);
        }
        // write leading zeroes
        size_t zeroes_left = N;
        while (zeroes_left > 0) {
            if (PyErr_CheckSignals()) {
                return NULL;
            }
            if (current_bit == 0) {
                bstream->ob_digit[current_digit] = (digit) 0;
                if (zeroes_left > _bits_per_digit) {
                    zeroes_left -= _bits_per_digit;
                    ++current_digit;
                } else {
                    current_bit += zeroes_left;
                    if (current_bit == _bits_per_digit) {
                        current_bit = 0;
                        ++current_digit;
                    }
                    zeroes_left = 0;
                }
            } else {
                digit mask = ~((digit)-1 >> current_bit); // enable the current_bit most significant bits
                bstream->ob_digit[current_digit] &= mask; // set remainder of this digit to zeroes
                digit zeroes_written = _bits_per_digit - current_bit;
                if (zeroes_left > zeroes_written) {
                    zeroes_left -= zeroes_written;
                    current_bit = 0;
                    ++current_digit;
                } else {
                    current_bit += zeroes_left;
                    if (current_bit == _bits_per_digit) {
                        current_bit = 0;
                        ++current_digit;
                    }
                    zeroes_left = 0;
                }
            }
        }
        // write the binary representation of number
        size_t bits_left = N + 1;
        while (bits_left > 0) {
            if (PyErr_CheckSignals()) {
                return NULL;
            }
            unsigned long mask = 1UL << (bits_left - 1);
            if (number & mask) {
                bstream->ob_digit[current_digit] |= ~((digit) -1 >> 1) >> current_bit;
            } else {
                bstream->ob_digit[current_digit] &= ~(~((digit) -1 >> 1) >> current_bit);
            }
            ++current_bit;
            if (current_bit == _bits_per_digit) {
                current_bit = 0;
                ++current_digit;
            }
            --bits_left;
            mask >>= 1;
        }
    }

    // remove unused digits
    if ((size - current_digit) > 1) {
        size = current_digit + 1;
        PyLongObject *new = (PyLongObject *) PyObject_Realloc(bstream, offsetof(PyLongObject, ob_digit) + size * _bits_per_digit);
        if(new == NULL) {
            PyObject_Free(bstream);
            return PyErr_NoMemory();
        }
        bstream = new;
        PyObject_InitVar((PyVarObject*)bstream, &PyLong_Type, size);
    }

    // fill leftover with zeroes, overwrite malloc randomness
    digit mask = ~((digit) -1 >> current_bit);
    bstream->ob_digit[current_digit] &= mask;

    return (PyObject *) bstream;
}

In Python

import sys
import tracemalloc
from time import sleep

tracemalloc.start()
encoded = encode(numbers)
sleep(3) # give garbage collection time to run
size, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
print("size, peak", size, peak)
print(sys.getsizeof(encoded))

prints

size, peak 17053496 17053496
2131708

Instead of the expected

size, peak 2131708 ...
2131708

Using my own type instead of PyLongObject, which to my eyes looks identical, leads to no memory leak at all and this output:

size, peak 2131309 2131309
2131309

Here's the type that produced that output:

typedef struct {
    PyObject_VAR_HEAD
    uint8_t data[1];
} BitStream;

static PyTypeObject BitStreamType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    .tp_name = "elias_gamma_code.BitStream",
    .tp_dealloc = 0,
    .tp_free = PyObject_Del,
    .tp_basicsize = offsetof(BitStream, data),
    .tp_itemsize = sizeof(uint8_t),
    .tp_flags = Py_TPFLAGS_DEFAULT,
};

So there's something special about using PyLongObject in particular that causes the massive memory usage.

My guess was that it was due to PyObject_Realloc, so I asked over here: Use PyObject_Realloc correctly

But apparently

if a non-null pointer was returned, then obj was already freed

And indeed, using PyObject_Free(previous_obj) caused "pointer being freed was not allocated" malloc errors.


Elias gamma encoding represents a positive (non-zero) integer as N leading zeroes, where N is the number's bit length minus 1 (equivalently floor(log2(num))), followed by the binary representation of the number itself.

+--------+---------------+
| Number |  γ encoding   |
+--------+---------------+
|      1 |             1 |
|      2 |          0 10 |
|      3 |          0 11 |
|      4 |        00 100 |
|      5 |        00 101 |
|    ... |               |
|     15 |      000 1111 |
|     16 |   0000 1 0000 |
|    ... |               |
|     31 |   0000 1 1111 |
|     32 | 00000 10 0000 |
+--------+---------------+

All numbers in a list can be encoded as such and concatenated. The resulting bitstream can be decoded unambiguously to the original list.
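
For example, here is a minimal standalone C sketch (independent of the CPython code above; the function name is mine) that prints the gamma code of a number as a string of '0'/'1' characters:

#include <stdio.h>

/* Print the Elias gamma code of a non-zero number. */
static void print_gamma(unsigned long number)
{
    size_t N = 0;                       /* N = floor(log2(number)) */
    for (unsigned long v = number; v > 1; v >>= 1) {
        ++N;
    }
    for (size_t i = 0; i < N; i++) {    /* N leading zeroes */
        putchar('0');
    }
    for (size_t i = N + 1; i-- > 0;) {  /* the number itself, MSB first */
        putchar((number >> i) & 1 ? '1' : '0');
    }
    putchar('\n');
}

int main(void)
{
    print_gamma(1);   /* 1           */
    print_gamma(2);   /* 010         */
    print_gamma(5);   /* 00101       */
    print_gamma(32);  /* 00000100000 */
    return 0;
}

Concatenating the codes for the list [1, 2, 5] gives the bitstream 1 010 00101, which decodes unambiguously back to the original list.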

1 Answer


In error paths, given

PyLongObject *bstream = _PyLong_New(size);

you must decrement its reference count. But you didn't do that, for example here:

if (PyErr_Occurred()) { // number overflowed or was negative
    PyErr_Format(PyExc_ValueError, "Number at list index %zd was negative or too large", i);
    return NULL;
} else if (number == 0) { // unencodable
    PyErr_Format(PyExc_ValueError, "Zero at list index %zd", i);
    return NULL;
}

The proper C idiom is to goto an error-handling label:

    PyLongObject *bstream = NULL;

    if (!(bstream = _PyLong_New(size))) {
        goto error;
    }

    if (! some other check) {
        goto error;
    }
    
    ...
    return (PyObject *) bstream;

error:
    // xdecref requires that the pointer is initialized in all
    // code paths, either to null or pointing to a valid live PyObject 

    Py_XDECREF(bstream);
    return NULL;
}

Another thing altogether is the question of whether it is allowed to realloc a PyLongObject at all in Python... that could break in a future version.
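
One way to sidestep that question (a sketch only, assuming the bit-writing logic stays exactly as in the question) is to build the bitstream in a plain PyMem buffer and create the int object only once, at its final size:

digit *buf = PyMem_Malloc(size * sizeof(digit));
if (buf == NULL) {
    return PyErr_NoMemory();
}

/* ... grow with PyMem_Realloc(buf, required_size * sizeof(digit)) when needed,
   and write bits into buf[current_digit] exactly as before ... */

PyLongObject *bstream = _PyLong_New((Py_ssize_t)(current_digit + 1));
if (bstream == NULL) {
    PyMem_Free(buf);
    return NULL;
}
memcpy(bstream->ob_digit, buf, (current_digit + 1) * sizeof(digit));
PyMem_Free(buf);
return (PyObject *) bstream;

This still pokes at ob_digit directly, so it is not future-proof either, but at least it never resizes a live object behind the allocator's back.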


Finally, this one looks a bit strange:

size * _bits_per_digit

PyObject_Realloc takes the new size in bytes, so why would you multiply by bits per digit? Instead, you must multiply by the width (sizeof) of the digit type.
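
In other words, the realloc size should be computed with sizeof(digit), something like:

/* bytes, not bits */
PyLongObject *new = (PyLongObject *) PyObject_Realloc(
    bstream, offsetof(PyLongObject, ob_digit) + size * sizeof(digit));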

  • It was entirely due to the `size * _bits_per_digit` you spotted! Thanks. And thanks a lot for pointing out the idiomatic style for errors, where should I put that label? (After the return at the end of the method?) – theonlygusti Mar 23 '21 at 14:48
  • Often the way to do it is: 1) set up all the `PyObject*` to `NULL` at the start of the function. At the end of the function have `error: xdecref all pyobjects but result; return result;` Therefore the same cleanup code is shared between the success and failure path (but `result` is `NULL` on failure). – DavidW Mar 23 '21 at 18:23