Pass scalar values by reference using Numba
To get useful timings I have modified the wrapped function a bit. The function simply adds a scalar a (passed by value) to a scalar b (passed by reference).
Pros and cons of the approach using intrinsics
- Only working in nopython mode
- Faster for C or Fortran functions with short runtime (real-world example)
Example function
import cffi
# Build the out-of-line CFFI extension module "foo", which exports foo_f.
ffi = cffi.FFI()

# C declaration made visible to Python: foo_f takes `a` by value and `b` by pointer.
defs = "void foo_f(double a,double *b);"
ffi.cdef(defs, override=True)

# C implementation: adds `a` in place to the double that `b` points to.
source = """
void foo_f(double a,double *b){
b[0]+=a;
}
"""

# Generate and compile the extension; afterwards `import foo` works.
ffi.set_source("foo", source)
ffi.compile()
Wrapper using a temporary array
This is quite straightforward, but it requires allocating an array of size one, which is quite slow.
import numpy as np
import numba as nb
from numba import cffi_support
import cffi
# FFI instance whose from_buffer() is used to obtain pointers to NumPy buffers.
ffi = cffi.FFI()

# Load the compiled CFFI extension and register it with Numba so that its
# functions can be called from nopython-compiled code.
import foo

cffi_support.register_module(foo)

# Module-level handle to the C function; the jitted wrappers call it by this name.
foo_f = foo.lib.foo_f
@nb.njit("float64(float64,float64)")
def method_using_arrays(a, b):
    """Call ``foo_f(a, &b)`` using a one-element array as storage for ``b``.

    Parameters
    ----------
    a : float64
        Value passed to ``foo_f`` by value.
    b : float64
        Initial value of the scalar that ``foo_f`` receives by pointer.

    Returns
    -------
    float64
        Content of the one-element buffer after ``foo_f`` has run.
    """
    # foo_f expects a double*; a length-1 array gives b an addressable buffer.
    b_arr = np.empty(1, dtype=np.float64)
    b_arr[0] = b
    b_arr_ptr = ffi.from_buffer(b_arr)
    foo_f(a, b_arr_ptr)
    return b_arr[0]
Wrapper using intrinsics
from numba import types
from numba.extending import intrinsic
from numba import cgutils
@intrinsic
def ptr_from_val(typingctx, data):
    """Numba intrinsic: copy a value into a fresh slot and return a pointer to it.

    Typed as ``CPointer(T)(T)`` for whatever type ``T`` the argument has.
    The slot is created with ``cgutils.alloca_once_value``, so the returned
    pointer should only be used inside the jitted function that obtained it.
    """

    def codegen(context, builder, signature, args):
        # Allocate a slot for the single argument and initialise it with its value.
        value = args[0]
        return cgutils.alloca_once_value(builder, value)

    return types.CPointer(data)(data), codegen
@intrinsic
def val_from_ptr(typingctx, data):
    """Numba intrinsic: dereference a ``CPointer(T)`` and return the ``T`` value.

    Typed as ``T(CPointer(T))``. For any other argument type the typing
    function returns ``None``, so Numba reports a regular "no matching
    implementation" typing error instead of generating an invalid load
    (array types, for example, also expose a ``dtype`` attribute).
    """
    # Only pointer types can be loaded from; decline everything else.
    if not isinstance(data, types.CPointer):
        return None

    def impl(context, builder, signature, args):
        # Emit a load from the pointer given as the single argument.
        return builder.load(args[0])

    sig = data.dtype(data)
    return sig, impl
@nb.njit("float64(float64,float64)")
def method_using_intrinsics(a, b):
    """Call ``foo_f(a, &b)`` without a temporary array and return the new ``b``.

    ``b`` is copied into a slot via ``ptr_from_val``, ``foo_f`` updates that
    slot through the pointer, and the result is read back with
    ``val_from_ptr``.
    """
    slot = ptr_from_val(b)
    foo_f(a, slot)
    updated = val_from_ptr(slot)
    return updated
Timings
# Benchmark driver: chain 1000 calls of the intrinsic-based wrapper.
@nb.njit()
def timing_method_using_intrinsics(a, b):
    """Feed the result of ``method_using_intrinsics`` back in 1000 times."""
    total = b
    for _ in range(1000):
        total = method_using_intrinsics(a, total)
    return total
# Benchmark driver: chain 1000 calls of the array-based wrapper.
@nb.njit()
def timing_method_using_arrays(a, b):
    """Feed the result of ``method_using_arrays`` back in 1000 times."""
    total = b
    for _ in range(1000):
        total = method_using_arrays(a, total)
    return total
# Scalar inputs shared by both benchmarks.
a=1.
b=1.
# %timeit is an IPython magic; each timed call runs the full 1000-iteration driver.
%timeit timing_method_using_intrinsics(a,b)
# Reported result: 5.15 µs ± 33.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%timeit timing_method_using_arrays(a,b)
# Reported result: 121 µs ± 601 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# i.e. the temporary-array variant is roughly 23x slower in this measurement.