I am trying to write DLL function to allocate cuda memory and get back pointer to cuda (device) memory.
Second function should accept this pointer and do the calculation.
I want this operation to be separate because I need to do many calculations on the same data and I am trying to avoid repeatedly copying same data to GPU memory (it takeS a lot of time)-
Q: what do I need to add to my DLL to be able to export pointer to i_d
?
My DLL:
main.cpp:
extern "C" __declspec(dllexport) int cuda_Malloc ( float *i, void **i_d, int N ){
for( float x=0; x<N; x++ )
i[x]=x;
kernel_cuda_Malloc( i, i_d, N );
return 0;
}
extern "C" __declspec(dllexport) int cuda_Calculation( void *i_d, float *result, int N ) {
kernel_cuda_calculation( i_d, result, N );
return 0;
}
simple.cu:
__global__ void kernelTest( float *i, int N ){
unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
if ( tid<N )
i[tid] += 10;
}
int kernel_cuda_Malloc( float *i, void **i_d, int N ){
cudaMalloc( (void**)&i_d, N*sizeof( float ) );
cudaMemcpy( i_d, i, N*sizeof( float ), cudaMemcpyHostToDevice );
return 0;
}
void kernel_cuda_calculation( float *i_d, float *result, int N ){
dim3 threads; threads.x = 240;
dim3 blocks; blocks.x = ( N/threads.x ) + 1;
kernelTest<<< threads, blocks >>>( i_d, N );
cudaMemcpy( result, i_d, N*sizeof( float ), cudaMemcpyDeviceToHost );
cudaFree( i_d );
}
I am not able to get out pointer to i_d
from cuda_Malloc
function in LabVIEW.
Code is modification of https://decibel.ni.com/content/docs/DOC-20353