Probably the best way to go is to write this bit in Cython (name the file cumsum_eps.pyx):
cimport numpy as cnp
import numpy as np
cdef inline _cumsum_eps_f4(float *A, int ndim, int dims[], float *out, float eps):
cdef float sum
cdef size_t ofs
N = 1
for i in xrange(0, ndim - 1):
N *= dims[i]
ofs = 0
for i in xrange(0, N):
sum = 0
for k in xrange(0, dims[ndim-1]):
sum += A[ofs]
if abs(sum) < eps:
sum = 0
out[ofs] = sum
ofs += 1
def cumsum_eps_f4(cnp.ndarray[cnp.float32_t, mode='c'] A, shape, float eps):
cdef cnp.ndarray[cnp.float32_t] _out
cdef cnp.ndarray[cnp.int_t] _shape
N = np.prod(shape)
out = np.zeros(N, dtype=np.float32)
_out = <cnp.ndarray[cnp.float32_t]> out
_shape = <cnp.ndarray[cnp.int_t]> np.array(shape, dtype=np.int)
_cumsum_eps_f4(&A[0], len(shape), <int*> &_shape[0], &_out[0], eps)
return out.reshape(shape)
def cumsum_eps(A, axis=None, eps=np.finfo('float').eps):
A = np.array(A)
if axis is None:
A = np.ravel(A)
else:
axes = list(xrange(len(A.shape)))
axes[axis], axes[-1] = axes[-1], axes[axis]
A = np.transpose(A, axes)
if A.dtype == np.float32:
out = cumsum_eps_f4(np.ravel(np.ascontiguousarray(A)), A.shape, eps)
else:
raise ValueError('Unsupported dtype')
if axis is not None: out = np.transpose(out, axes)
return out
then you can compile it like this (Windows, Visual C++ 2008 Command Line):
\Python27\Scripts\cython.exe cumsum_eps.pyx
cl /c cumsum_eps.c /IC:\Python27\include /IC:\Python27\Lib\site-packages\numpy\core\include
F:\Users\sadaszew\Downloads>link /dll cumsum_eps.obj C:\Python27\libs\python27.lib /OUT:cumsum_eps.pyd
or like this (Linux use .so extension/Cygwin use .dll extension, gcc):
cython cumsum_eps.pyx
gcc -c cumsum_eps.c -o cumsum_eps.o -I/usr/include/python2.7 -I/usr/lib/python2.7/site-packages/numpy/core/include
gcc -shared cumsum_eps.o -o cumsum_eps.so -lpython2.7
and use like this:
from cumsum_eps import *
import numpy as np
x = np.array([[1,2,3,4], [5,6,7,8]], dtype=np.float32)
>>> print cumsum_eps(x)
[ 1. 3. 6. 10. 15. 21. 28. 36.]
>>> print cumsum_eps(x, axis=0)
[[ 1. 2. 3. 4.]
[ 6. 8. 10. 12.]]
>>> print cumsum_eps(x, axis=1)
[[ 1. 3. 6. 10.]
[ 5. 11. 18. 26.]]
>>> print cumsum_eps(x, axis=0, eps=1)
[[ 1. 2. 3. 4.]
[ 6. 8. 10. 12.]]
>>> print cumsum_eps(x, axis=0, eps=2)
[[ 0. 2. 3. 4.]
[ 5. 8. 10. 12.]]
>>> print cumsum_eps(x, axis=0, eps=3)
[[ 0. 0. 3. 4.]
[ 5. 6. 10. 12.]]
>>> print cumsum_eps(x, axis=0, eps=4)
[[ 0. 0. 0. 4.]
[ 5. 6. 7. 12.]]
>>> print cumsum_eps(x, axis=0, eps=8)
[[ 0. 0. 0. 0.]
[ 0. 0. 0. 8.]]
>>> print cumsum_eps(x, axis=1, eps=3)
[[ 0. 0. 3. 7.]
[ 5. 11. 18. 26.]]
and so on, of course normally eps would be some small value, here integers are used just for the sake of demonstration / easiness of typing.
If you need this for double as well the _f8 variants are trivial to write and another case has to be handled in cumsum_eps().
When you're happy with the implementation you should make it a proper part of your setup.py - Cython setup.py
Update #1: If you have good compiler support in run environment you could try [Theano][3] to implement either compensation algorithm or your original idea:
import numpy as np
import theano
import theano.tensor as T
from theano.ifelse import ifelse
A=T.vector('A')
sum=T.as_tensor_variable(np.asarray(0, dtype=np.float64))
res, upd=theano.scan(fn=lambda cur_sum, val: ifelse(T.lt(cur_sum+val, 1.0), np.asarray(0, dtype=np.float64), cur_sum+val), outputs_info=sum, sequences=A)
f=theano.function(inputs=[A], outputs=res)
f([0.9, 2, 3, 4])
will give [0 2 3 4] output. In either Cython or this you get at least +/- performance of the native code.