
I am trying to invoke a CUDA kernel that calculates a vector dot product from Java via JCuda. The kernel works fine when invoked via C++, but with JCuda I get a coredump from libcuda.so. I've compared the C++ and Java invocations with each other and with the vector_add example provided with the JCuda samples, and cannot find the error/discrepancy. I've pasted the working kernel, the C++ invocation, and the JCuda invocation below.

If anyone can see where I've gone wrong I'd really appreciate it!

Kernel:

/**
 * dotproduct_cuda - this is the kernel for the GPU
 * a: input vector a
 * b: input vector b
 * result: float for result
 * N: size of input vectors
 * verbose: boolean to print additional debug statements
 */
extern "C"
__global__
void __dotproduct_cuda(float *a, float *b, float *result, size_t N, bool verbose) {
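    // Per-thread partial products; blockDim.x must not exceed THREADS_PER_BLOCK, or temp overflows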
    __shared__ float temp[THREADS_PER_BLOCK];

    if (verbose)
        printf("    Start of block %u thread %u (blockDim %u gridDim %u)\n", blockIdx.x, threadIdx.x, blockDim.x, gridDim.x);

    // grid-stride loop
    // stride size is blockDim.x * gridDim.x - if n < stride, the loop executes exactly once
    temp[threadIdx.x] = 0;
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) 
    {
        if (verbose)
            printf("    Thread[%u,%u] += %f * %f\n", blockIdx.x, threadIdx.x, a[i], b[i]);
        temp[threadIdx.x] += a[i] * b[i];
    }

    // Make sure all threads are done multiplying before aggregating the results
    __syncthreads();

    // Thread 0 aggregates all of the results
    if (threadIdx.x == 0) {
        float sum = 0;
        for (int i = 0; i < blockDim.x; i++) {
            sum += temp[i];
        }
        if (verbose)
            printf("    Total for block %u: %f\n", blockIdx.x, sum);
        atomicAdd(result, sum);
    }
}

Invocation via C++:

    float result_gpu = 0;

    // Allocate device memory
    cudaMalloc((void**) &d_a, sizeof(float) * size);
    cudaMalloc((void**) &d_b, sizeof(float) * size);
    cudaMalloc((void**) &d_result, sizeof(float)); // a single float

    // Transfer data from host to device memory
    cudaMemcpy(d_a, a, sizeof(float) * size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_result, &result_gpu, sizeof(float), cudaMemcpyHostToDevice);

    // Determine our size requirements
    // Once N exceeds MAX_BLOCKS * THREADS_PER_BLOCK, the grid-stride pattern is used
    if (threads == 0)
        threads = THREADS_PER_BLOCK;
    if (blocks == 0) {
        blocks = ceil((float) N / THREADS_PER_BLOCK);
        if (blocks == 1)
            threads = N;
        if (blocks > MAX_BLOCKS)  // this will trigger grid-stride loops
            blocks = MAX_BLOCKS;
    }
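    // Example (assuming, for illustration, THREADS_PER_BLOCK = 256 and MAX_BLOCKS = 64):
    //   N = 1000    -> blocks = ceil(1000/256) = 4, threads = 256
    //   N = 1000000 -> blocks would be 3907, so it is capped at 64 and the grid-stride loop covers the rest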
    if (verbose)
        printf("blocks %d, threads %d\n", blocks, threads);

    // Execute kernel
    __dotproduct_cuda<<< blocks, threads >>>( d_a, d_b, d_result, N, verbose);

    // Make the host block until the device is finished
    cudaDeviceSynchronize();

    // Check for CUDA errors
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        exit(-1);
    }

    // Transfer result back to host memory
    cudaMemcpy(&result_gpu, d_result, sizeof(float), cudaMemcpyDeviceToHost);

    // Deallocate host memory
    free(a);
    free(b);

    // Deallocate device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_result);
}

Invocation via JCuda:

    // Allocate the device input data, and copy the host input data to the device
    CUdeviceptr deviceInputA = new CUdeviceptr();
    cuMemAlloc(deviceInputA, numSamples * Sizeof.FLOAT);
    cuMemcpyHtoD(deviceInputA, Pointer.to(hostInputA), numSamples * Sizeof.FLOAT);
    CUdeviceptr deviceInputB = new CUdeviceptr();
    cuMemAlloc(deviceInputB, numSamples * Sizeof.FLOAT);
    cuMemcpyHtoD(deviceInputB, Pointer.to(hostInputB), numSamples * Sizeof.FLOAT);

    // Allocate device output memory
    CUdeviceptr deviceOutput = new CUdeviceptr();
    cuMemAlloc(deviceOutput, Sizeof.FLOAT);

    // Set up the kernel parameters: A pointer to an array
    // of pointers which point to the actual values.
    Pointer kernelParameters = Pointer.to(
            Pointer.to(deviceInputA),
            Pointer.to(deviceInputB),
            Pointer.to(deviceOutput),
            Pointer.to(new int[] { numSamples, (kernelVerbose ? 1 : 0) }));

    // Determine our size requirements
    // Once N exceeds MAX_BLOCKS * THREADS_PER_BLOCK, the grid-stride pattern is used
    if (blockSizeX == 0)
        blockSizeX = DEFAULT_BLOCK_SIZE;
    if (gridSizeX == 0) {
        gridSizeX = (int) Math.ceil((float) numSamples / DEFAULT_BLOCK_SIZE);
        if (gridSizeX == 1)
            blockSizeX = numSamples;
        if (gridSizeX > MAX_BLOCKS) // this will trigger grid-stride loops
            gridSizeX = MAX_BLOCKS;
    }

    logger.debug("blocks {}, threads {}", gridSizeX, blockSizeX);

    // Call the kernel function.
    // __dotproduct_cuda<<< blocks, threads >>>( d_a, d_b, d_result, N, verbose);
    cuLaunchKernel(function,
            gridSizeX, 1, 1, // Grid dimension
            blockSizeX, 1, 1, // Block dimension
            32768, null, // Shared memory size and stream
            kernelParameters, null // Kernel- and extra parameters
    );

    logger.debug("Kernel launched");

    // Synchronize the devices
    cuCtxSynchronize();

    logger.debug("Context synchronized");

    // Allocate host output memory and copy the device output to the host.
    float[] hostOutput = new float[1];
    cuMemcpyDtoH(Pointer.to(hostOutput), deviceOutput, Sizeof.FLOAT);

    // Clean up.
    cuMemFree(deviceInputA);
    cuMemFree(deviceInputB);
    cuMemFree(deviceOutput);
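
The `function` handle passed to cuLaunchKernel above is assumed to come from the usual JCuda setup, roughly like the sketch below (the PTX file name is a placeholder, and the static imports from jcuda.driver.JCudaDriver are the same as in the snippet above):

    // Turn driver error codes into Java exceptions instead of silent failures
    JCudaDriver.setExceptionsEnabled(true);

    // Initialize the driver API and create a context on device 0
    cuInit(0);
    CUdevice device = new CUdevice();
    cuDeviceGet(device, 0);
    CUcontext context = new CUcontext();
    cuCtxCreate(context, 0, device);

    // Load the PTX (e.g. built with "nvcc -ptx dotproduct.cu -o dotproduct.ptx")
    // and look up the kernel by its extern "C" name
    CUmodule module = new CUmodule();
    cuModuleLoad(module, "dotproduct.ptx");
    CUfunction function = new CUfunction();
    cuModuleGetFunction(function, module, "__dotproduct_cuda");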

Answer:

The problem was in the pointers for the variables: cuLaunchKernel expects one pointer per kernel argument, so packing numSamples and the verbose flag into a single int array supplied only four argument pointers for a five-parameter kernel. I had to change:

    Pointer.to(new int[] { numSamples, (kernelVerbose ? 1 : 0) })

to

    Pointer.to(new int[] { numSamples }),
    Pointer.to(new int[] { (kernelVerbose ? 1 : 0) })
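
With that change, the parameter setup from the question becomes one pointer per kernel argument:

    Pointer kernelParameters = Pointer.to(
            Pointer.to(deviceInputA),
            Pointer.to(deviceInputB),
            Pointer.to(deviceOutput),
            Pointer.to(new int[] { numSamples }),
            Pointer.to(new int[] { (kernelVerbose ? 1 : 0) }));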

Hope this helps someone else!
