I need to perform a calculation on two multidimensional arrays in which their components are convolved with each other. I wrote a plain Python function that does this with nested loops (which I expected to be slow), and the same function using Numba, which is supposed to be efficient for exactly this kind of nested-loop code. However, the Numba implementation takes much longer than the plain Python one, so I suspect I am doing something very wrong. Here is a code example:
import numpy as np
from numba import jit
@jit( nopython=True )
def convolve_arrays_jit( A, B, T, dx, md, sd, M, mid_indices ):
    """Numba-jitted version of `convolve_arrays`: accumulate scaled 1-D
    convolutions of index-selected slices of A and B into T.

    Parameters
    ----------
    A, B : complex arrays whose axis 1 (length N) is the convolution axis.
        # assumes shape (n_groups, N, d1, d2, sd, sd) — confirm with caller
    T : output accumulator of shape (md, md, 9, sd, sd, sd, sd, N); modified
        in place and also returned.
    dx : scalar step factor applied to every convolution.
    md, sd : loop extents for the m1/m2 and i/j/k/l indices.
    M : sets the index offsets ms = 2*M and mt = 4*M.
    mid_indices : (9, 2) integer array; row gs gives the leading indices of
        A and B to convolve for that gs.

    Returns
    -------
    T, with dx * convolve(...) contributions added for every index tuple.

    NOTE(review): Numba implements np.convolve as a direct O(N^2) product,
    so this can be slower than NumPy/SciPy's optimized routines even after
    compilation.
    """
    ms = 2*M
    mt = 4*M
    N = B.shape[1]
    # Centre offset: the 'full' convolution has length 2*N - 1 and we keep
    # its middle N samples.
    mid_N = N // 2
    for m1 in range( md ):
        m1_index = m1 + md
        for m2 in range( md ):
            m2_index = m2 + md
            for gs in range( 9 ):
                # Leading indices selecting which sub-arrays to convolve.
                aindx = mid_indices[ gs, 0 ]
                bindx = mid_indices[ gs, 1 ]
                As = A[ aindx ]
                Bs = B[ bindx ]
                for i in range( sd ):
                    for j in range( sd ):
                        for k in range( sd ):
                            for l in range( sd ):
                                for n in range( mt ):
                                    a_sel = As[ :, - m1_index - n + ms, -m1_index, i , j ]
                                    b_sel = Bs[ :, m2_index + n - ms , m2_index, k , l ]
                                    # dx * central N samples of the full convolution.
                                    res = dx*np.convolve( a_sel , b_sel )[ mid_N:mid_N + N ]
                                    T[ m1,m2,gs,i,j,k,l, : ] += res
    return T
def convolve_arrays( A, B, T, dx, N, md, sd, M, mid_indices ):
    """Accumulate scaled 1-D convolutions of index-selected slices of A and
    B into T (pure NumPy reference implementation).

    Parameters
    ----------
    A, B : complex arrays whose axis 1 (length N) is the convolution axis.
        # assumes shape (n_groups, N, d1, d2, sd, sd) — confirm with caller
    T : output accumulator of shape (md, md, 9, sd, sd, sd, sd, N); modified
        in place and also returned.
    dx : scalar step factor applied to every convolution.
    N : length of the convolution axis; the middle N samples of each
        length-(2*N - 1) full convolution are kept.
    md, sd : loop extents for the m1/m2 and i/j/k/l indices.
    M : sets the index offsets ms = 2*M and mt = 4*M.
    mid_indices : (9, 2) integer array; row gs gives the leading indices of
        A and B to convolve for that gs.

    Returns
    -------
    T, with dx * convolve(...) contributions added for every index tuple.
    """
    mid_N = N // 2
    ms = 2*M
    mt = 4*M
    for m1 in range( md ):
        m1_index = m1 + md
        for m2 in range( md ):
            m2_index = m2 + md
            for gs in range( 9 ):
                # Leading indices selecting which sub-arrays to convolve.
                aindx = mid_indices[ gs, 0 ]
                bindx = mid_indices[ gs, 1 ]
                As = A[ aindx ]
                Bs = B[ bindx ]
                for i in range( sd ):
                    for j in range( sd ):
                        for k in range( sd ):
                            for l in range( sd ):
                                for n in range( mt ):
                                    a_sel = As[ :, - m1_index - n + ms, -m1_index, i , j ]
                                    b_sel = Bs[ :, m2_index + n - ms, m2_index, k, l ]
                                    # dx * central N samples of the full convolution.
                                    res = ( dx*np.convolve( a_sel, b_sel ) )[ mid_N:mid_N + N ]
                                    T[ m1,m2,gs,i,j,k,l, : ] += res
    return T
# --- Benchmark: plain NumPy loop vs. the Numba-jitted version ---
N = 3**7
T1 = np.zeros( ( 3, 3, 9, 3, 3, 3, 3, N ), dtype=np.complex128 )
T2 = np.zeros( ( 3, 3, 9, 3, 3, 3, 3, N ), dtype=np.complex128 )

# All (i, j) index pairs for i, j in 0..2, as a (9, 2) integer array
# (replaces the redundant triple np.array(...) nesting + reshape).
mid_indices = np.array( [ [ i, j ] for i in range(3) for j in range(3) ] )

A = np.random.rand( 3, N, 7,7,3,3 ) + 1j*np.random.rand( 3, N, 7,7,3,3 )
B = np.random.rand( 3, N, 7,7,3,3 ) + 1j*np.random.rand( 3, N, 7,7,3,3 )
dx = 0.01

from time import time

ts = time()
T1 = convolve_arrays( A, B, T1, dx, N, 3, 3, 1, mid_indices )
print( time() - ts )

# First call triggers Numba compilation; reset T2 and time the second call.
T2 = convolve_arrays_jit( A, B, T2, dx, 3, 3, 1, mid_indices )
T2 = np.zeros( ( 3, 3, 9, 3, 3, 3, 3, N ), dtype=np.complex128 )
ts = time()
T2 = convolve_arrays_jit( A, B, T2, dx, 3, 3, 1, mid_indices )
print( time() - ts )

print( np.allclose( T1, T2 ) )
Why is this Numba implementation so slow? I am new to Numba, so any help would be appreciated — as would suggestions for other, more efficient ways to perform this calculation.
UPDATE: COMPARING SCIPY'S FFTCONVOLVE WITH NUMBA
According to the Numba documentation, np.convolve is supported with only its first two arguments, which means the convolution is always done in "full" mode. I therefore tried the minimal code example below, and Numba is still much slower than SciPy's fftconvolve implementation:
import numpy as np
from numba import jit
from scipy.signal import fftconvolve
from time import time
@jit( nopython=True )
def conv_jitted( a, b, d, N ):
    """Row-wise full convolution: res[k] = np.convolve(a[k], b[k]).

    Parameters
    ----------
    a, b : (d, L) complex arrays; each pair of rows is convolved in 'full'
        mode, producing rows of length 2*L - 1.
    d : number of rows to process.
    N : length of one full convolution (must equal 2*L - 1); sizes the output.

    Returns
    -------
    (d, N) complex array of the row-wise convolutions.
    """
    # BUG FIX: the original allocated `res` with the *global* `nd` instead of
    # the parameter `d`, so the function only worked when d happened to equal
    # nd.  The dead `v = np.empty(...)` pre-allocation is also removed — it
    # was overwritten on every iteration.
    res = np.empty( ( d, N ), dtype=np.complex128 )
    for k in range( d ):
        res[ k ] = np.convolve( a[ k ], b[ k ] )
    return res
# Benchmark setup: nd independent rows of length-N complex signals.
nd = 1000
N = 3**7
Nf = 2*N - 1  # length of a 'full' convolution of two length-N signals

a = np.random.rand( nd, N ) + 1j*np.random.rand( nd, N )
b = np.random.rand( nd, N ) + 1j*np.random.rand( nd, N )

# SciPy: batched FFT-based convolution along the last axis.
start = time()
res = fftconvolve( a, b, mode="full", axes=-1)
print( time() - start )

# Numba: the first call includes JIT compilation, so time the second call only.
res2 = conv_jitted( a, b, nd, Nf )
start = time()
res2 = conv_jitted( a, b, nd, Nf )
print( time() - start )

print( np.allclose( res, res2 ) )
Both results agree, but the Numba call is still far slower than SciPy's — can anyone see what is going on in this code?