I wrote the following CUDA kernel and am trying to load it into a module:
#include <stdio.h>
// C linkage: the kernel is exported under the unmangled name "vadd".
extern "C"
{
/**
 * Element-wise vector addition kernel.
 *
 * Each thread derives one global index from its block and thread
 * coordinates (x dimension only), prints that index, and stores
 * a[index] + b[index] into c[index].
 *
 * There is no bounds guard: every launched thread reads a and b and
 * writes c at its index, so the launch must not create more threads
 * than the arrays have elements.
 *
 * @param a first input array (read only)
 * @param b second input array (read only)
 * @param c output array
 */
__global__ void vadd(const float *a, const float *b, float *c)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    printf("Thread id %d\n", gid);  // device-side printf, emitted once per thread
    const float sum = a[gid] + b[gid];
    c[gid] = sum;
}
}
I compile it to ptx code using the following command:
nvcc -ptx -arch=sm_20 vadd.cu
When I try to load this file into a module using cuModuleLoad,
I get CUDA error 200 (invalid kernel image). How can I find out what is wrong with the kernel image? I have tried ptxas,
but according to it, the generated PTX code is fine.
Edit: This is the code I am using to load the module:
#include "cuda.h"
#include <cassert>
#include <cstdlib>
#include <dlfcn.h>
#include <stdio.h>
/**
 * Aborts the program if a driver API call did not succeed.
 *
 * The numeric error code is written to stderr (unbuffered, so it is not
 * lost when the process aborts) and the process is terminated explicitly.
 * Termination does not rely on assert(), which is compiled out when
 * NDEBUG is defined and would let failures pass silently.
 *
 * @param err result code returned by a CUDA driver API call
 */
void check(CUresult err) {
    if (err != CUDA_SUCCESS) {
        fprintf(stderr, "Error %i\n", err);
        std::abort();
    }
}
int main(int argc, char **argv) {
void *cuda = dlopen("libcuda.so", RTLD_NOW | RTLD_DEEPBIND | RTLD_GLOBAL);
assert(cuda != NULL);
printf("cuInit\n");
CUresult (*Init)() = (CUresult (*)()) dlsym(cuda, "cuInit");
check(Init());
printf("cuDeviceGet\n");
CUresult (*DeviceGet)(CUdevice *, int) = (CUresult (*)(CUdevice *, int)) dlsym(cuda, "cuDeviceGet");
CUdevice device;
check(DeviceGet(&device, 0));
printf("cuCtxCreate\n");
CUresult (*CtxCreate)(CUcontext * , unsigned int, CUdevice) = (CUresult (*)(CUcontext * , unsigned int, CUdevice)) dlsym(cuda, "cuCtxCreate");
CUcontext context;
check(CtxCreate(&context, 0, device));
printf("cuModuleLoad\n");
CUresult (*ModuleLoad)(CUmodule *, const char*) = (CUresult (*)(CUmodule *, const char*)) dlsym(cuda, "cuModuleLoad");
CUmodule mod;
check(ModuleLoad(&mod, "vadd.ptx"));
return 0;
}