I have problems passing the right parameters to the prepare
function (and to the prepared_call) to allocate of shared memory in PyCUDA. I understand the error message in this way, that one of the variables I pass to PyCUDA is a long
instead of what I intended float32
. But I cannot see, where the variable comes from.
Furthermore does it seem to me, that the official example and the documentation of prepare
contradict each other regarding if block
needs to be None
or not.
from pycuda import driver, compiler, gpuarray, tools
import pycuda.autoinit
import numpy as np
kernel_code ="""
__device__ void loadVector(float *target, float* source, int dimensions )
{
for( int i = 0; i < dimensions; i++ ) target[i] = source[i];
}
__global__ void kernel(float* data, int dimensions, float* debug)
{
extern __shared__ float mean[];
if(threadIdx.x == 0) loadVector( mean, &data[0], dimensions );
debug[threadIdx.x]= mean[threadIdx.x];
}
"""
dimensions = 12
np.random.seed(23)
data = np.random.randn(dimensions).astype(np.float32)
data_gpu = gpuarray.to_gpu(data)
debug = gpuarray.zeros(dimensions, dtype=np.float32)
mod = compiler.SourceModule(kernel_code)
kernel = mod.get_function("kernel")
kernel.prepare("PiP",block = (dimensions, 1, 1),shared=data.size)
grid = (1,1)
kernel.prepared_call(grid,data_gpu,dimensions,debug)
print debug.get()
Output
Traceback (most recent call last):
File "shared_memory_minimal_example.py", line 28, in <module>
kernel.prepared_call(grid,data_gpu,dimensions,debug)
File "/usr/local/lib/python2.6/dist-packages/pycuda-0.94.2-py2.6-linux-x86_64.egg/pycuda/driver.py", line 230, in function_prepared_call
func.param_setv(0, pack(func.arg_format, *args))
pycuda._pvt_struct.error: cannot convert argument to long