
I have the following code, which does a normalized cross-correlation looking for similarities between two signals in Python:

import numpy as np

def normcorr(template, srchspace):
    template = (template - np.mean(template)) / (np.std(template) * len(template)) # Normalize template
    CCnorm = srchspace.copy()
    CCnorm = CCnorm[np.shape(template)[0]:] # trim CC matrix
    for a in range(len(CCnorm)):
        s = srchspace[a:a + np.shape(template)[0]]
        sp = (s - np.mean(s)) / np.std(s)
        CCnorm[a] = np.sum(np.multiply(template, sp))
    return CCnorm

but as you can imagine it is far too slow. The Cython documentation promises large speed increases for loops that would otherwise run in raw Python, so I attempted to write some Cython code with typed variables that looks like this:

from __future__ import division
import numpy as np
import math as m
cimport numpy as np
cimport cython
def normcorr(np.ndarray[np.float32_t, ndim=1] template,np.ndarray[np.float32_t, ndim=1]  srchspace):
    cdef int a
    cdef np.ndarray[np.float32_t, ndim=1] s
    cdef np.ndarray[np.float32_t, ndim=1] sp
    cdef np.ndarray[np.float32_t, ndim=1] CCnorm
    template=(template-np.mean(template))/(np.std(template)*len(template))
    CCnorm=srchspace.copy()
    CCnorm=CCnorm[len(template):]
    for a in range(len(CCnorm)):
        s=srchspace[a:a+len(template)]
        sp=(s-np.mean(s))/np.std(s)
        CCnorm[a]=np.sum(np.multiply(template,sp))
    return CCnorm

but once I compile it, the code actually runs slower than the pure Python version. I found here (How to call numpy/scipy C functions from Cython directly, without Python call overhead?) that calling NumPy functions from Cython can significantly slow down the code. Is that the issue with my code, in which case I would have to define inline functions to replace all the calls to np, or is there something else I am doing wrong that I am missing?


2 Answers

Because you call NumPy functions inside the Cython loop, there will be no speed improvement: every call to np.mean, np.std and np.sum still goes through the Python interpreter, so typing the loop variables does not help.

If you use pandas, you can combine rolling_mean() and rolling_std() with NumPy's convolve() to do the calculation very fast; here is the code:

import numpy as np
import pandas as pd

np.random.seed()

def normcorr(template,srchspace):
    template=(template-np.mean(template))/(np.std(template)*len(template)) # Normalize template
    CCnorm=srchspace.copy()
    CCnorm=CCnorm[np.shape(template)[0]:] # trim CC matrix
    for a in range(len(CCnorm)):
        s=srchspace[a:a+np.shape(template)[0]]
        sp=(s-np.mean(s))/np.std(s)
        CCnorm[a]=np.sum(np.multiply(template,sp))
    return CCnorm

def fast_normcorr(t, s):
    n = len(t)
    nt = (t - np.mean(t)) / (np.std(t) * n)           # normalized template
    sum_nt = nt.sum()
    a = pd.rolling_mean(s, n)[n-1:-1]                 # rolling mean of each window
    b = pd.rolling_std(s, n)[n-1:-1]                  # rolling sample std of each window
    b *= np.sqrt((n - 1.0) / n)                       # convert sample std to population std, as np.std uses
    c = np.convolve(nt[::-1], s, mode="valid")[:-1]   # sliding dot product of template and signal
    result = (c - sum_nt * a) / b
    return result

n = 100
m = 1000
t = np.random.rand(n)
s = np.random.rand(m)

r1 = normcorr(t, s)
r2 = fast_normcorr(t, s)
assert np.allclose(r1, r2)

You can check that r1 and r2 are the same. This works because each loop iteration computes sum(nt * (s - mean) / std), which equals (sum(nt * s) - mean * sum(nt)) / std; the convolution supplies the sliding sum(nt * s) terms, and the rolling mean and std supply the rest. And here is the timeit test:

%timeit normcorr(t, s)
%timeit fast_normcorr(t, s)

the output:

10 loops, best of 3: 59 ms per loop
1000 loops, best of 3: 273 µs per loop

It's 200x faster.
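
Note for readers on newer pandas: pd.rolling_mean and pd.rolling_std were removed in pandas 0.20 in favour of the Series.rolling interface, but the same idea still works. Here is a minimal sketch of the equivalent, assuming pandas >= 0.20 (the name fast_normcorr_modern is made up):

import numpy as np
import pandas as pd

def fast_normcorr_modern(t, s):
    n = len(t)
    nt = (t - np.mean(t)) / (np.std(t) * n)           # normalized template
    sum_nt = nt.sum()
    roll = pd.Series(s).rolling(n)
    a = roll.mean().values[n-1:-1]                    # rolling mean of each window
    b = roll.std().values[n-1:-1]                     # rolling sample std (ddof=1)
    b *= np.sqrt((n - 1.0) / n)                       # convert to population std
    c = np.convolve(nt[::-1], s, mode="valid")[:-1]   # sliding dot product
    return (c - sum_nt * a) / b

The timings should be in the same ballpark as the rolling_mean version, since the work being done is identical.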


If you compile your code with cython -a and look at the HTML output, you will see that you have a lot of Python overhead. Here is your function with a few small fixes:

@cython.boundscheck(False)
@cython.cdivision(True)   # Don't check for division by 0
def normcorr(np.ndarray[np.float32_t, ndim=1] template, np.ndarray[np.float32_t, ndim=1] srchspace):
    cdef int a
    cdef int N = template.shape[0]
    cdef int NCC = srchspace.shape[0] - N
    cdef np.ndarray[np.float32_t, ndim=1] s
    cdef np.ndarray[np.float32_t, ndim=1] sp
    cdef np.ndarray[np.float32_t, ndim=1] CCnorm
    template = (template - template.mean()) / (template.std() * N)
    CCnorm = srchspace[N:].copy()    # You don't need to copy the whole array
    for a in xrange(NCC):  # Use xrange in Python 2
        s = srchspace[a:a+N]
        sp = (s - np.mean(s)) / np.std(s)
        CCnorm[a] = (template * sp).sum()
    return CCnorm

To improve performance further, you can optimise the last two lines of the loop into a single typed helper:

@cython.boundscheck(False)
@cython.cdivision(True)
cdef multiply_by_normalised(np.ndarray[np.float32_t, ndim=1] template, np.ndarray[np.float32_t, ndim=1] s):
    cdef int i
    cdef int N = template.shape[0]
    cdef np.float32_t mean, std, out = 0

    mean = s.mean()
    std = s.std()

    # Fuse the normalisation, multiplication and sum into a single typed loop
    for i in xrange(N):
        out += (s[i] - mean) / std * template[i]
    return out

If you still need to squeeze out more time, you can use Bottleneck's mean and std functions, which are faster than NumPy's.
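
Going one step further (this is not from the original answer): s.mean() and s.std() are still Python-level calls inside the loop, so the window statistics can also be computed with plain C loops. A rough sketch of that idea, assuming the same float32 inputs; the name normcorr_typed is made up:

import numpy as np
cimport numpy as np
cimport cython
from libc.math cimport sqrt

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
def normcorr_typed(np.ndarray[np.float32_t, ndim=1] template,
                   np.ndarray[np.float32_t, ndim=1] srchspace):
    cdef int a, i
    cdef int N = template.shape[0]
    cdef int NCC = srchspace.shape[0] - N
    cdef np.float32_t t_mean = 0, t_std = 0, s_mean, s_var, acc, d
    cdef np.ndarray[np.float32_t, ndim=1] nt = np.empty(N, dtype=np.float32)
    cdef np.ndarray[np.float32_t, ndim=1] CCnorm = np.empty(NCC, dtype=np.float32)

    # Normalize the template once, using plain C loops
    for i in range(N):
        t_mean += template[i]
    t_mean /= N
    for i in range(N):
        d = template[i] - t_mean
        t_std += d * d
    t_std = sqrt(t_std / N)
    for i in range(N):
        nt[i] = (template[i] - t_mean) / (t_std * N)

    for a in range(NCC):
        # Mean and variance of the current window, without NumPy calls
        s_mean = 0
        for i in range(N):
            s_mean += srchspace[a + i]
        s_mean /= N
        s_var = 0
        for i in range(N):
            d = srchspace[a + i] - s_mean
            s_var += d * d
        s_var /= N
        # sum(normalized template * (window - mean) / std)
        acc = 0
        for i in range(N):
            acc += nt[i] * (srchspace[a + i] - s_mean)
        CCnorm[a] = acc / sqrt(s_var)
    return CCnorm

With everything typed, cython -a should show the inner loops with no Python interaction highlighted in the HTML report.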
