-2

I'm currently trying to implement the Blelloch Algorithm in CUDA. I want to use printf for debugging but it has this weird behaviour that it won't work when I'm accessing different positions of the same array in a Kernel. cudaDeviceSynchronize() is used after both Kernel calls.

Here is the Upsweep-Code:

__global__
void inclusive_scan_up_sweep(const Ray ray, float *scannedAngles)
    uint i = blockDim.x * blockIdx.x + threadIdx.x;
    uint index = (i * 2) + 1;
    int depth = log2((double)ray.length);

    for (int d = 0; d < depth; d++) {
        uint stride = pow(2.0, (double)d);
        if (((index + 1) % stride) == 0) {
            //this line stops printf from working
            //printf works if I remove '+ scannedAngles[index - stride]' from the equation
            scannedAngles[index] = scannedAngles[index] + scannedAngles[index - stride]; 
        }
        __syncthreads();
    }
}

And here is the Downsweep-Code:

__global__
void inclusive_scan_down_sweep(const Ray ray, float *scannedAngles)
{
    uint i = blockDim.x * blockIdx.x + threadIdx.x;
    uint index = (i * 2) + 1;
    int depth = log2((double)ray.length);

    //first zero last element
    if ((index + 1) == ray.length)
        scannedAngles[index] = 0;

    for (int d = depth - 1; d >= 0; d--) {
        uint stride = pow(2.0, (double)d);
        if (((index + 1) % stride) == 0) {
            float tmp = scannedAngles[index - stride];
            // these two line cause the same issue, however the second line
            // does not create any issues when I replace the 'tmp' with a fixed number 
            scannedAngles[index - stride] = scannedAngles[index];
            scannedAngles[index] += tmp;
        }
        __syncthreads();
    }

    printf("Thread: %d Result: %f\n", (index - 1), scannedAngles[index - 1]);
}

Does anyone know what's going on here? Is this maybe some weird side effect of memory access optimization?

1 Answers1

-3

It seems the issue was simply that index and stride were unsigned int and the brackets operator interpreted them as signed int, most likely causing and access violation and with that the kernel crashed.