1

My CUDA Kernel, needs a lot of arrays which need to be passed as pointers to the kernel. The problem is that just before the kernel launch, all the pointers have valid addresses, moreover the cudaMalloc and cudaMemcpy calls always return cudaSuccess, but all these arguments become null once the kernel is launched!

I am clueless as to what is happening. This is what I get when I run my code with cuda-gdb

CUDA Exception: Device Illegal Address
The exception was triggered in device 0.

Program received signal CUDA_EXCEPTION_10, Device Illegal Address.
[Switching focus to CUDA kernel 0, grid 1, block (0,0,0), thread (64,0,0), device 0, sm 1, warp 2, lane 0]
0x00000000062a3dd8 in compute_data_and_match_kernel<<<(2,1,1),(512,1,1)>>> (a11=0x0, a12=0x0, a22=0x0, b1=0x0, b2=0x0, mask=0x0, wx=0x0, wy=0x0, du=0x0, dv=0x0, uu=0x0, 
    vv=0x0, Ix_c1=0x0, Ix_c2=0x0, Ix_c3=0x0, Iy_c1=0x0, Iy_c2=0x0, Iy_c3=0x0, Iz_c1=0x0, Iz_c2=0x0, Iz_c3=0x0, Ixx_c1=0x0, Ixx_c2=0x0, Ixx_c3=0x0, Ixy_c1=0x0, Ixy_c2=0x0, 
    Ixy_c3=0x0, Iyy_c1=0x0, Iyy_c2=0x0, Iyy_c3=0x0, Ixz_c1=0x0, Ixz_c2=0x0, Ixz_c3=0x0, Iyz_c1=0x0, Iyz_c2=0x0, Iyz_c3=0x0, desc_weight=0x0, desc_flow_x=0x0, 
    desc_flow_y=0x0, half_delta_over3=0.0833333358, half_beta=0, half_gamma_over3=0.833333313, width=59, height=26, stride=60) at opticalflow_aux.cu:441
441         ix_c1_val = Ix_c1[index]; iy_c1_val = Iy_c1[index]; iz_c1_val = Iz_c1[index];
(cuda-gdb) 

Is there something very obvious that I am missing. Thanks in advance.

EDIT 1 : As suggested by Gilles, I am trying to copy the host pointers and data into a struct and then onto device. For the sake of simplicity (MCVE) I am using only a single pointer inside struct:

#include <cuda.h>
#include <stdio.h>

typedef struct test {
    float *ptr;
} test_t;

__global__ void test_kernel(test_t *s) {
    s->ptr[0] = s->ptr[1] = s->ptr[2] = s->ptr[3] = s->ptr[4] = 100;
    s->ptr[5] = s->ptr[6] = s->ptr[7] = s->ptr[8] = s->ptr[9] = 100;
}

int main() {

    float arr[] = {0,1,2,3,4,5,6,7,8,9};

    test_t *h_struct;
    h_struct = (test_t *)malloc(sizeof(test_t));
    h_struct->ptr = arr;

    test_t *d_struct;
    float *d_data;
    cudaMalloc((void **)&d_struct, sizeof(test_t));
    cudaMalloc((void **)&d_data, sizeof(float)*10);

    // Copy the data from host to device
    cudaMemcpy(d_data, h_struct->ptr, sizeof(float)*10,   cudaMemcpyHostToDevice);
    // Point the host struct ptr to device memory
    h_struct->ptr = d_data;
    // copy the host struct to device
    cudaMemcpy(d_struct, h_struct, sizeof(test_t), cudaMemcpyHostToDevice);


    // Kernel Launch
    test_kernel<<<1,1>>>(d_struct);
    // copy the device array to host
    cudaMemcpy(h_struct->ptr, d_data, sizeof(float)*10, cudaMemcpyDeviceToHost);

    cudaFree(d_data);
    cudaFree(d_struct);

    // Verifying if all the values have been set to 100
    int i;
    for(i=0 ; i<10 ; i++)
        printf("%f\t", h_struct->ptr[i]);

    return 0;
}

When I am checking the value of d_struct->ptr, just before the kernel launch it shows me 0x0. (I have checked these values using nsight in debug mode)

Swaroop
  • 1,219
  • 3
  • 16
  • 32
  • I can update my question with actual code, if it is required as the code for kernel and kernel launch is really huge. – Swaroop Aug 28 '15 at 09:15
  • 1
    yes please, you should always provide a [mcve], so first find a minimal example which reproduces your problem and then post it here – m.s. Aug 28 '15 at 09:25
  • Your new example has completely broken host side data handling after the kernel runs. `h_struct->ptr` isn't a valid host pointer and you cannot use it as the destination in a device to host memory transfer or try printing its values. If I modify you code so that `arr` is used as the destination after the kernel, and print that out, it works as expect and prints out 100. Presumably you never actually ran this, because if you did, you would have gotten a host segfault – talonmies Aug 31 '15 at 10:39
  • @talonmies Thanks for pointing that out. It works just as u pointed out. Now I will extend this to multiple arguments to handle my previous scenario. But still before the kernel launch `d_struct->ptr ` is still `0x0` – Swaroop Aug 31 '15 at 10:48

1 Answers1

2

Not sure if it's the issue, but I believe the size of the stack for passing arguments to a kernel is limited. You might need to create a structure storing your arguments, copy it to the device and only pass a pointer to it as argument to your kernel. Then, inside the kernel you retrieve your arguments from the structure...


EDIT: Added a corrected version of the submitted code. This works for me and exemplifies the principle I described.

#include <cuda.h>
#include <stdio.h>

typedef struct test {
    float *ptr;
} test_t;

__global__ void test_kernel(test_t *s) {
    s->ptr[0] = s->ptr[1] = s->ptr[2] = s->ptr[3] = s->ptr[4] = 100;
    s->ptr[5] = s->ptr[6] = s->ptr[7] = s->ptr[8] = s->ptr[9] = 100;
}

int main() {

    float arr[] = {0,1,2,3,4,5,6,7,8,9};

    test_t *h_struct;
    h_struct = (test_t *)malloc(sizeof(test_t));

    test_t *d_struct;
    float *d_data;
    cudaMalloc((void **)&d_struct, sizeof(test_t));
    cudaMalloc((void **)&d_data, sizeof(float)*10);

    // Copy the data from host to device
    cudaMemcpy(d_data, arr, sizeof(float)*10, cudaMemcpyHostToDevice);
    // Point the host struct ptr to device memory
    h_struct->ptr = d_data;
    // copy the host struct to device
    cudaMemcpy(d_struct, h_struct, sizeof(test_t), cudaMemcpyHostToDevice);

    // Kernel Launch
    test_kernel<<<1,1>>>(d_struct);
    // copy the device array to host
    cudaMemcpy(arr, d_data, sizeof(float)*10, cudaMemcpyDeviceToHost);

    cudaFree(d_data);
    cudaFree(d_struct);

    // Verifying if all the values have been set to 100
    int i;
    for(i=0 ; i<10 ; i++)
        printf("%f\t", arr[i]);

    return 0;
}
Gilles
  • 9,269
  • 4
  • 34
  • 53
  • Checked and found this: https://devtalk.nvidia.com/default/topic/458705/is-there-any-limit-on-of-arguments-in-cuda-kernel-/ Apparently, the limit is 256B – Gilles Aug 28 '15 at 09:39
  • @Giles Yeah! Even I was looking at this. Let me modify my code and check it out. – Swaroop Aug 28 '15 at 10:09
  • BTW, no need of a initialisation kenel like described on the threads I pointed, just define a structure containing all your parameters (into an include file), fill it on the host side, allocate a pointer to it on the device side, copy the corresponding data from host to device and pass the pointer as argument of your kernel. That should just work with only an extra `d_arg->` added to access your arguments from inside the kernel. – Gilles Aug 28 '15 at 11:02
  • I have tried doing that but the problem is that, in the struct allocated on the device side, when I try to set any pointer inside this struct to point to a device side memory, it always get sets to NULL. – Swaroop Aug 31 '15 at 04:28
  • Yeah, it won't work. What you should put into your struct on the host side is the pointer on the device side, ie your `d_data` pointer, since this is the one you need on the device side... And then, you copy the struct from host to device as you did. Does that make sense? – Gilles Aug 31 '15 at 08:17
  • OK, I'll try to make it clear: what you need is `cudaMalloc(&d_struct,...);` then `cudaMalloc(&d_data,..);` then `h_struct->ptr = d_data;` and finally `cudaMemcpy(d_struct,h_struct,...);`. From there, call your kernel with `d_struct` as only parameter and on the device, access your data via `d_struct->ptr`. Is that clearer now? – Gilles Aug 31 '15 at 08:45
  • I am sorry, but I fail to understand how my code above fail to follow the instructions that you have just asked me to follow. You are clear, and I feel I am doing exactly what you have asked for, but still I get NULL pointer. – Swaroop Aug 31 '15 at 09:04
  • OK, my bad, I was focussed on the `h_struct->ptr = (float *)malloc(sizeof(float)*10);` which is just kind of pointless here, and missed the `h_struct->ptr = d_data;` which is the key part... Anyway, can you post a fully compilable and executable example please? – Gilles Aug 31 '15 at 09:10
  • I have posted fully compilable and executable example. – Swaroop Aug 31 '15 at 10:28
  • Let us [continue this discussion in chat](http://chat.stackoverflow.com/rooms/88385/discussion-between-gilles-and-guru-swaroop). – Gilles Aug 31 '15 at 15:38