I was a little surprised to find that:
# fast_ops_c.pyx
cimport cython
cimport numpy as np
import numpy as np  # needed at runtime for np.empty_like

@cython.boundscheck(False)  # turn off bounds-checking for entire function
@cython.wraparound(False)   # turn off negative index wrapping for entire function
@cython.nonecheck(False)
def c_iseq_f1(np.ndarray[np.double_t, ndim=1, cast=False] x, double val):
    # Test (x==val) except gives NaN where x is NaN
    cdef np.ndarray[np.double_t, ndim=1] result = np.empty_like(x)
    cdef size_t i = 0
    cdef double _x = 0
    for i in range(len(x)):
        _x = x[i]
        result[i] = (_x-_x) + (_x==val)
    return result
is orders of magnitude faster than:
@cython.boundscheck(False)  # turn off bounds-checking for entire function
@cython.wraparound(False)   # turn off negative index wrapping for entire function
@cython.nonecheck(False)
def c_iseq_f2(np.ndarray[np.double_t, ndim=1, cast=False] x, double val):
    cdef np.ndarray[np.double_t, ndim=1] result = np.empty_like(x)
    cdef size_t i = 0
    cdef double _x = 0
    for _x in x:  # Iterate over elements
        result[i] = (_x-_x) + (_x==val)
        i += 1
    return result
(for large arrays). I'm using the following to test the performance:
# fast_ops.py
import math
import unittest

import numpy as np  # must be imported before pyximport.install for np.get_include()

try:
    import pyximport
    pyximport.install(setup_args={"include_dirs": np.get_include()}, reload_support=True)
except Exception:
    pass

from fast_ops_c import *

NAN = float("nan")


class FastOpsTest(unittest.TestCase):
    def test_eq_speed(self):
        from timeit import timeit
        a = np.random.random(500000)
        a[1] = 2.
        a[2] = NAN
        a2 = c_iseq_f(a, 2.)

        def f1(): c_iseq_f2(a, 2.)
        def f2(): c_iseq_f1(a, 2.)

        # warm up
        [f1() for x in range(20)]
        [f2() for x in range(20)]

        n = 1000
        dur = timeit(f1, number=n)
        print(dur, "DUR1 s/iter", dur / n)
        dur = timeit(f2, number=n)
        print(dur, "DUR2 s/iter", dur / n)
        dur = timeit(f1, number=n)
        print(dur, "DUR1 s/iter", dur / n)
        assert dur / n <= 0.005
        dur = timeit(f2, number=n)
        print(dur, "DUR2 s/iter", dur / n)

        print(a2[:10])
        assert a2[0] == 0.
        assert a2[1] == 1.
        assert math.isnan(a2[2])


if __name__ == "__main__":
    unittest.main()
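As an aside on those last asserts: the (_x-_x) + (_x==val) expression relies on NaN arithmetic, since x - x is 0.0 for finite values but NaN for NaN, so NaNs propagate through to the result. A quick plain-NumPy sketch of the same idea (illustrative only, not the Cython path):

import numpy as np

x = np.array([0.5, 2.0, np.nan])
val = 2.0

# (x - x) is 0.0 for finite entries and NaN where x is NaN, so adding it to
# the boolean comparison yields 0.0 / 1.0 for finite values and NaN otherwise.
result = (x - x) + (x == val)
print(result)   # expected: [ 0.  1. nan]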
I'm guessing that for _x in x is interpreted as "run the Python iterator protocol over x", while for i in range(n): is compiled to a C for loop, and x[i] is compiled to C-style x[i] array indexing.
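From what I can tell, Cython's annotation output highlights lines that still call into the Python C-API, which seems like the closest thing to checking guesses like this. A minimal setup.py sketch (module and file names are just the ones from this post) that should produce the annotated HTML:

# setup.py -- sketch for generating the annotation report (fast_ops_c.html)
from setuptools import setup, Extension
from Cython.Build import cythonize
import numpy as np

ext = Extension(
    "fast_ops_c",
    ["fast_ops_c.pyx"],
    include_dirs=[np.get_include()],   # numpy headers for the buffer syntax
)

# annotate=True writes fast_ops_c.html; yellow lines indicate Python interaction.
setup(ext_modules=cythonize(ext, annotate=True))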
However, I'm kinda guessing and trying to follow by example. In its Working with NumPy docs (and here), Cython is fairly quiet about what is optimized with respect to numpy and what isn't. Is there a guide to what is optimized?
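In the meantime, my best guess from the docs is that typed memoryviews are the encouraged route to C-speed element access. Here is a hedged rewrite of the same function using them (untested here; the _mv name is mine):

cimport cython
import numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)
def c_iseq_mv(double[::1] x, double val):
    # double[::1] declares a C-contiguous 1-D memoryview, so x[i] should
    # compile down to direct indexing rather than going through Python.
    cdef Py_ssize_t i
    cdef Py_ssize_t n = x.shape[0]
    cdef double _x
    result = np.empty(n, dtype=np.double)
    cdef double[::1] r = result   # typed view onto the output buffer
    for i in range(n):
        _x = x[i]
        r[i] = (_x - _x) + (_x == val)   # same NaN-propagating comparison
    return result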
Similarly, the following, which assumes contiguous array memory, is considerably faster than either of the above.
@cython.boundscheck(False)  # turn off bounds-checking for entire function
@cython.wraparound(False)   # turn off negative index wrapping for entire function
def c_iseq_f(np.ndarray[np.double_t, ndim=1, cast=False, mode="c"] x not None, double val):
    cdef np.ndarray[np.double_t, ndim=1] result = np.empty_like(x)
    cdef size_t i = 0
    cdef double _x = 0
    cdef double* _xp = &x[0]
    cdef double* _resultp = &result[0]
    for i in range(len(x)):
        _x = _xp[i]
        _resultp[i] = (_x-_x) + (_x==val)
    return result
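One caveat with mode="c": the caller has to pass a C-contiguous array, otherwise the buffer check raises a ValueError at call time. A small usage sketch (the strided slice is only there to illustrate the non-contiguous case):

import numpy as np
from fast_ops_c import c_iseq_f

a = np.random.random(500000)
view = a[::2]                       # strided view: not C-contiguous
safe = np.ascontiguousarray(view)   # copies into contiguous memory only if needed
out = c_iseq_f(safe, 2.0)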