
I have implemented a Gibbs sampler to generate textured images. Depending on the beta parameters (an array of shape (4,)), it can generate various textures.

Here is my initial function using Numpy:

import copy
import functools

import numpy as np


def gibbs_sampler(img_label, betas, burnin, nb_samples):
    nb_iter = burnin + nb_samples

    lst_samples = []

    labels = np.unique(img_label)

    M, N = img_label.shape
    img_flat = img_label.flatten()

    # build the neighborhood array by means of numpy broadcasting:
    m, n = np.ogrid[0:M, 0:N]

    top_left, top, top_right = m[0:-2, :]*N + n[:, 0:-2], m[0:-2, :]*N + n[:, 1:-1], m[0:-2, :]*N + n[:, 2:]
    left, pix, right = m[1:-1, :]*N + n[:, 0:-2], m[1:-1, :]*N + n[:, 1:-1], m[1:-1, :]*N + n[:, 2:]
    bottom_left, bottom, bottom_right = m[2:, :]*N + n[:, 0:-2], m[2:, :]*N + n[:, 1:-1], m[2:, :]*N + n[:, 2:]

    # flat indices of each interior pixel and its 8 neighbors
    mat_neigh = np.dstack([pix, top, bottom, left, right, top_left, bottom_right, bottom_left, top_right])
    mat_neigh = mat_neigh.reshape((-1, 9))

    ind = np.arange((M-2)*(N-2))

    # loop over iterations
    for iteration in np.arange(nb_iter):

        np.random.shuffle(ind)

        # loop over the interior pixels, in random order
        for i in ind:

            # for each label, flag the neighbors that disagree with it
            truc = map(functools.partial(lambda label, img_flat, mat_neigh: 1 - np.equal(label, img_flat[mat_neigh[i, 1:]]).astype(np.uint), img_flat=img_flat, mat_neigh=mat_neigh), labels)
            # bidule is of shape (4, 2, labels.size):
            # 4 clique directions, 2 opposite neighbors each
            bidule = np.array(truc).T.reshape((-1, 2, labels.size))

            # theta is of shape (labels.size, 4)
            theta = np.sum(bidule, axis=1).T
            # prior is thus an array of shape (labels.size,)
            prior = np.exp(-np.dot(theta, betas))

            # sample from the posterior
            drawn_label = np.random.choice(labels, p=prior/np.sum(prior))

            # map the interior-pixel index i back to a flat image index
            img_flat[(i//(N-2) + 1)*N + i%(N-2) + 1] = drawn_label

        if iteration >= burnin:
            print('Iteration %i --> sample' % iteration)
            lst_samples.append(copy.copy(img_flat.reshape(M, N)))
        else:
            print('Iteration %i --> burnin' % iteration)

    return lst_samples
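For reference, a minimal hypothetical call (the image size, labels, and beta values below are placeholders chosen for illustration, not from my actual experiments):

import numpy as np

# random initial binary label image; betas weight the four clique directions
img_label = np.random.randint(0, 2, size=(64, 64))
betas = np.array([0.5, 0.5, 0.5, 0.5])

samples = gibbs_sampler(img_label, betas, burnin=5, nb_samples=2)
print(len(samples), samples[0].shape)   # 2 samples of shape (64, 64)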

We cannot get rid of the loops, as Gibbs sampling is inherently sequential: each pixel update depends on the current labels of its neighbors. I have thus tried to speed it up using Cython (with static typing):

from __future__ import division
import copy
import functools

import numpy as np
cimport numpy as np
cimport cython

INTTYPE = np.int
DOUBLETYPE = np.double

ctypedef np.int_t INTTYPE_t
ctypedef np.double_t DOUBLETYPE_t


def func_for_map(label, img_flat, mat_neigh, i):
    return (1 - np.equal(label, img_flat[mat_neigh[i, 1:]])).astype(INTTYPE)


@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def gibbs_sampler(np.ndarray[INTTYPE_t, ndim=2] img_label, np.ndarray[DOUBLETYPE_t, ndim=1] betas, INTTYPE_t burnin=5, INTTYPE_t nb_samples=1):

    assert img_label.dtype == INTTYPE and betas.dtype == DOUBLETYPE

    cdef unsigned int nb_iter = burnin + nb_samples

    lst_samples = list()

    cdef np.ndarray[INTTYPE_t, ndim=1] labels
    labels = np.unique(img_label)

    cdef unsigned int M, N
    M = img_label.shape[0]
    N = img_label.shape[1]

    cdef np.ndarray[INTTYPE_t, ndim=1] ind
    ind = np.arange((M-2)*(N-2), dtype=INTTYPE)

    cdef np.ndarray[INTTYPE_t, ndim=1] img_flat
    img_flat = img_label.flatten()

    # build the neighborhood array:
    cdef np.ndarray[INTTYPE_t, ndim=2] m
    cdef np.ndarray[INTTYPE_t, ndim=2] n

    grid = np.ogrid[0:M, 0:N]
    m = grid[0].astype(INTTYPE)
    n = grid[1].astype(INTTYPE)

    cdef np.ndarray[INTTYPE_t, ndim=2] top_left, top, top_right, left, pix, right, bottom_left, bottom, bottom_right

    top_left, top, top_right = m[0:-2, :]*N + n[:, 0:-2], m[0:-2, :]*N + n[:, 1:-1], m[0:-2, :]*N + n[:, 2:]
    left, pix, right = m[1:-1, :]*N + n[:, 0:-2], m[1:-1, :]*N + n[:, 1:-1], m[1:-1, :]*N + n[:, 2:]
    bottom_left, bottom, bottom_right = m[2:, :]*N + n[:, 0:-2], m[2:, :]*N + n[:, 1:-1], m[2:, :]*N + n[:, 2:]

    cdef np.ndarray[INTTYPE_t, ndim=3] mat_neigh_init
    mat_neigh_init = np.dstack([pix, top, bottom, left, right, top_left, bottom_right, bottom_left, top_right])

    cdef np.ndarray[INTTYPE_t, ndim=2] mat_neigh
    mat_neigh = mat_neigh_init.reshape((-1, 9))

    cdef unsigned int i
    truc = list()
    cdef np.ndarray[INTTYPE_t, ndim=3] bidule
    cdef np.ndarray[INTTYPE_t, ndim=2] theta
    cdef np.ndarray[DOUBLETYPE_t, ndim=1] prior
    cdef unsigned int drawn_label, iteration

    # loop over iterations
    for iteration in np.arange(nb_iter):

        np.random.shuffle(ind)

        # loop over the interior pixels, in random order
        for i in ind:

            truc = map(functools.partial(func_for_map, img_flat=img_flat, mat_neigh=mat_neigh, i=i), labels)

            # bidule is of shape (4, 2, labels.size)
            bidule = np.array(truc).T.reshape((-1, 2, labels.size)).astype(INTTYPE)

            # theta is of shape (labels.size, 4)
            theta = np.sum(bidule, axis=1).T

            # prior is of shape (labels.size,)
            prior = np.exp(-np.dot(theta, betas)).astype(DOUBLETYPE)

            # sample from the posterior
            drawn_label = np.random.choice(labels, p=prior/np.sum(prior))

            img_flat[(i//(N-2) + 1)*N + i%(N-2) + 1] = drawn_label

        if iteration >= burnin:
            print('Iteration %i --> sample' % iteration)
            lst_samples.append(copy.copy(img_flat.reshape(M, N)))
        else:
            print('Iteration %i --> burnin' % iteration)

    return lst_samples

However, I ended up with almost the same computation time, the NumPy version being slightly faster than the Cython one.

I am thus trying to improve the Cython code.

EDIT:

For both functions (Cython and non-Cython), I have replaced

truc = map(functools.partial(lambda label, img_flat, mat_neigh : 1-np.equal(label, img_flat[mat_neigh[i, 1:]]).astype(np.uint), img_flat=img_flat, mat_neigh=mat_neigh), labels)

with broadcasting:

truc = 1 - np.equal(labels[:, None], img_flat[mat_neigh[i, 1:]][None, :])

replaced every np.arange with range, and the prior is now computed by means of np.einsum, as suggested by Divakar.

Both functions are faster than before, but the Python one is still slightly faster than the Cython one.
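Putting those changes together, the inner pixel loop now looks roughly like this (a sketch reconstructed from the replacements above, not a verbatim excerpt of my current code):

for i in ind:
    # broadcasting replaces the map/functools.partial call
    truc = 1 - np.equal(labels[:, None], img_flat[mat_neigh[i, 1:]][None, :])
    # (4, 2, labels.size): 4 clique directions, 2 opposite neighbors each
    bidule = truc.T.reshape((-1, 2, labels.size))
    # np.einsum fuses the axis-1 sum and the dot product with betas
    prior = np.exp(-np.einsum('ijk,i->k', bidule, betas))
    drawn_label = np.random.choice(labels, p=prior/np.sum(prior))
    img_flat[(i//(N-2) + 1)*N + i%(N-2) + 1] = drawn_label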

floflo29
  • What are the shapes of `bidule` and `betas`? – Divakar Oct 25 '16 at 17:36
  • bidule is of shape (4, 2, labels.size) where labels.size is the number of different labels in the image. theta is of shape (labels.size, 4). – floflo29 Oct 25 '16 at 17:44
  • What about `betas`? – Divakar Oct 25 '16 at 17:47
  • See my edit (comments in the first code). – floflo29 Oct 25 '16 at 17:58
  • Comments from this other recent `cython` question apply here, http://stackoverflow.com/questions/40233664/cython-actually-slowing-me-down. – hpaulj Oct 25 '16 at 21:32
  • I don't think this is worth an answer, but I think you're calling `np.exp` on a single value. The overhead of calling it is large, so try using `exp` from the c math library instead. – DavidW Oct 26 '16 at 06:37
  • No, I call `np.exp` on an array of shape (labels.size,). – floflo29 Oct 26 '16 at 07:50
  • It may be possible to reduce the complexity from O(n_iter * n_pixels * n_labels) to O(n_iter * n_pixels * log(n_labels)) if you look up the 8 neighboring pixels in the `labels` array with binary search. Maybe. It would require a tricky custom version of `random.choice` and would only be worthwhile for large n_labels... – user6758673 Oct 27 '16 at 13:44

3 Answers


I've run Cython in annotated mode on your source, and viewed the result. That is, having saved it in q.pyx, I ran

cython -a q.pyx
firefox q.html

(of course, use any browser you want).

The code is colored deep yellow, indicating that, as far as Cython is concerned, the code is far from being statically typed. AFAICT, this falls into two categories.

In some cases, you could better statically type your code:

  1. In for iteration in np.arange(nb_iter): and for i in ind:, you're paying for about 30 C lines per iteration. See here how to efficiently access numpy arrays in Cython.

  2. In truc = map(functools.partial(func_for_map, img_flat=img_flat, mat_neigh=mat_neigh, i=i), labels), you're not really getting any benefit from static typing. I suggest you cdef the function func_for_map, and call it yourself in a loop (see the sketch below).

In other cases, you're calling numpy vectorized functions, e.g., theta = np.sum(bidule, axis=1).T, prior = np.exp(-np.dot(theta, betas)).astype(DOUBLETYPE), etc. In these cases, Cython really doesn't have much of a benefit.
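For point 2, here is a minimal sketch of what a cdef'd replacement could look like. It reuses the question's INTTYPE_t ctypedef, and the pairing of opposite neighbors (columns 1+2j and 2+2j of mat_neigh forming the j-th pair) is inferred from how bidule is reshaped, so treat it as illustrative rather than drop-in:

cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
cdef void fill_theta(INTTYPE_t[:] img_flat, INTTYPE_t[:, :] mat_neigh,
                     INTTYPE_t[:] labels, INTTYPE_t[:, :] theta,
                     Py_ssize_t i):
    # theta[k, j]: how many neighbors in opposite pair j disagree with label k
    cdef Py_ssize_t k, j
    for k in range(labels.shape[0]):
        for j in range(4):
            theta[k, j] = 0
            if img_flat[mat_neigh[i, 1 + 2*j]] != labels[k]:
                theta[k, j] += 1
            if img_flat[mat_neigh[i, 2 + 2*j]] != labels[k]:
                theta[k, j] += 1

Calling this once per pixel replaces both the map call and the np.sum/transpose dance, and compiles down to plain C loops.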

Ami Tavory
  • I agree about cythonizing `func_for_map`, but in that case would you get rid of the `map` call? However, I think I have already done all the steps suggested in the link you provided to improve my code, no? – floflo29 Oct 25 '16 at 20:56
  • @floflo29 Annotated-mode Cython indicates otherwise, AFAIU. I suggest you try it - it actually shows the C code generated. As I wrote, each of the loops generates around 30 LOC, some of which are pretty heavy function calls. – Ami Tavory Oct 25 '16 at 22:20
  • The only way to speed it up would thus be to rewrite numpy functions in Cython/C? – floflo29 Oct 26 '16 at 07:54
  • @floflo29 numpy functions *are* written in Cython/C (or at least effectively so). – Ami Tavory Oct 26 '16 at 09:46
  • Yes I know that, but why Cython tells me that my code (especially when calling Numpy functions) can be improved? – floflo29 Oct 26 '16 at 09:50
  • @floflo29 Don't rewrite the numpy functions - they're generally pretty good. I think the gist of the this answer is you should do your all loops in cython as `for idx in range(length):` and avoid the Python iteration mechanisms (e.g. map, or the general `for i in iterable:` style loop) – DavidW Oct 26 '16 at 10:06
  • @DavidW I completely agree. – Ami Tavory Oct 26 '16 at 10:33
  • I should thus replace `for i in np.arange(nb_iter)` by `for i in range(nb_iter)`? And instead of using `map` it is preferable to explicitly write a `for` loop to replace it? However, this does not explain why Cython tells me that the lines calling numpy are not statically typed... – floflo29 Oct 26 '16 at 15:36
  • @floflo29 Yes - replace `np.arange` with `range` - Cython translates this directly to a for loop. Yes, replace `map`. The numpy lines aren't statically typed so there's certainly a little bit of overhead in calling them. However they're quite efficient when called so I wouldn't worry about them as a priority. – DavidW Oct 26 '16 at 17:58
  • As edited in my first post, replacing `map` by broadcasting and `np.arange` by `range` speed both functions up but Cython is still slightly slower. – floflo29 Oct 27 '16 at 08:51

If you are looking to speed up the NumPy code, we can improve the performance inside the innermost loop, and hopefully that should translate to some overall improvement.

So, there we have:

theta = np.sum(bidule, axis=1).T
prior = np.exp(-np.dot(theta, betas))

Combining the summation and matrix-multiplication in one step, we would have -

np.dot(np.sum(bidule, axis=1).T, betas)

Now, this involves summing along an axis and then sum-reduction after element-wise multiplication. Among many tools, we have np.einsum to help us out here, especially because we could perform those reductions in one go, like so -

np.einsum('ijk,i->k',bidule,betas)

Runtime test -

In [98]: # Setup
    ...: N = 100
    ...: bidule = np.random.rand(4,2,N)
    ...: betas = np.random.rand(4)
    ...: 

In [99]: %timeit np.dot(np.sum(bidule, axis=1).T, betas)
100000 loops, best of 3: 12.4 µs per loop

In [100]: %timeit np.einsum('ijk,i->k',bidule,betas)
100000 loops, best of 3: 4.05 µs per loop

In [101]: # Setup
     ...: N = 10000
     ...: bidule = np.random.rand(4,2,N)
     ...: betas = np.random.rand(4)
     ...: 

In [102]: %timeit np.dot(np.sum(bidule, axis=1).T, betas)
10000 loops, best of 3: 157 µs per loop

In [103]: %timeit np.einsum('ijk,i->k',bidule,betas)
10000 loops, best of 3: 90.9 µs per loop
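A quick sanity check (added for clarity; it was not part of the original timings) that the two expressions agree on the same setup:

In [104]: np.allclose(np.dot(np.sum(bidule, axis=1).T, betas),
     ...:             np.einsum('ijk,i->k', bidule, betas))
Out[104]: True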

So, hopefully when run with many iterations, the speedup would be noticeable.

Divakar
  • Ok, I will try it as soon as possible. Would you have any idea to speed up the Cython code? – floflo29 Oct 25 '16 at 18:16
  • @floflo29 Sorry, not really knowledgeable on Cython stuff. – Divakar Oct 25 '16 at 18:19
  • No problem, I have also duplicated the topic on CodeReview. – floflo29 Oct 25 '16 at 18:40
  • Don't expect too much from CodeReview. Most of the `numpy` experts hang around here. There's probably more `cython` knowledge here as well. CR is good for making sure your Python code is PEP8 compliant, but not that good for major speedups. Also you don't provide a complete testable script, just one function. – hpaulj Oct 25 '16 at 20:22
  • Look at the `cython` tag here and on CR. Here 2k questions, there, 30! – hpaulj Oct 25 '16 at 20:32

This answer does a good job of explaining why Numpy can be inefficient and why you might still want to use Cython. Basically:

  • Overhead for small arrays (including reductions over a small dimension, like your np.sum(bidule, axis=1));
  • Cache thrashing for large arrays, due to the intermediate arrays Numpy creates.

In this case, to benefit from Cython you have to replace the Numpy array operations with plain Python loops that Cython can translate to C code; otherwise there is no point. This doesn't mean you must rewrite all the Numpy functions; you'll have to be selective about it.

For example, you should get rid of the mat_neigh and bidule arrays and just index and sum as you go in a loop (see the sketch below).

On the other hand, you should keep the (normalized) prior array and keep using np.random.choice. There's not really an easy way around that (well... see the source for choice). Unfortunately this means this part will probably remain the performance bottleneck.
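A rough sketch of that idea (the clique grouping follows the question's bidule layout, INTTYPE_t/DOUBLETYPE_t are the question's ctypedefs, and exp comes from libc; treat the details as assumptions):

from libc.math cimport exp

cdef void compute_prior(INTTYPE_t[:, :] img, INTTYPE_t[:] labels,
                        DOUBLETYPE_t[:] betas, DOUBLETYPE_t[:] prior,
                        Py_ssize_t r, Py_ssize_t c):
    # unnormalized posterior for each candidate label at pixel (r, c),
    # read straight off the 8 neighbors -- no mat_neigh, no bidule
    cdef Py_ssize_t k
    cdef double s
    for k in range(labels.shape[0]):
        s  = betas[0] * ((img[r-1, c] != labels[k]) + (img[r+1, c] != labels[k]))
        s += betas[1] * ((img[r, c-1] != labels[k]) + (img[r, c+1] != labels[k]))
        s += betas[2] * ((img[r-1, c-1] != labels[k]) + (img[r+1, c+1] != labels[k]))
        s += betas[3] * ((img[r-1, c+1] != labels[k]) + (img[r+1, c-1] != labels[k]))
        prior[k] = exp(-s)

Normalizing prior and drawing with np.random.choice would stay at the Python level, as noted above.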

user6758673