While trying to use CUDA-GDB instead of crude printf calls in the code, I found that it does not give the proper values near the end of a large array.
The following is a simple program that allocates an array of 200,000 elements and fills it with values. It contains a crude printf kernel, and a dummy kernel that exists only to provide a breakpoint location for cuda-gdb. The dummy kernel does some manipulation of the array so that the compiler does not optimize it away.
test.cu
#include <cstdlib>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void fillArray(double4 *Array, int n) {
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index >= n) return;
    double4 tmpArray;
    tmpArray.x = (double)index + 0.1;
    tmpArray.y = (double)index + 0.2;
    tmpArray.z = (double)index + 0.3;
    tmpArray.w = (double)index + 0.4;
    Array[index] = tmpArray;
}

__global__ void printFromDevice(double4 *Array, int target) {
    double4 tmpArray = Array[target];
    printf("Array[%d] = %e %e %e %e \n", target, tmpArray.x, tmpArray.y, tmpArray.z, tmpArray.w);
}

__global__ void dummyFunction(double4 *Array, int target) {
    double4 tmpArray;
    tmpArray = Array[target/2];
    tmpArray.x = -1234.;
    tmpArray.y = -1235.;
    tmpArray.z = -1236.;
    tmpArray.w = -1237.;
    // This is the 30th line
    Array[target/2] = tmpArray;
}

int main(int argc, char **argv) {
    int N = 200000;
    double *dArray;
    cudaMalloc((void **)&dArray, sizeof(double)*4*N);
    int nThreads = 128;
    int nBlocks = (N-1)/nThreads + 1;
    fillArray<<<nBlocks,nThreads>>> ((double4 *)dArray, N);
    printFromDevice<<<1,1>>> ((double4 *)dArray, 199999);
    dummyFunction<<<1,1>>> ((double4 *)dArray, 100);
    cudaDeviceSynchronize(); // This is the 46th line
    return 0;
}
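(For reference, with N = 200000 and nThreads = 128 this launches nBlocks = (200000-1)/128 + 1 = 1563 blocks, i.e. 1563*128 = 200064 threads, and the index guard in fillArray drops the extra 64 threads. Also, test.cu above has no error checking; a minimal sketch of checks that could be wrapped around the allocation and launches is shown below. The CUDA_CHECK macro is just an illustrative helper and is not part of the original code.)

// Illustrative helper, not part of the original test.cu.
// Relies on <cstdio> and <cstdlib>, which test.cu already includes.
#define CUDA_CHECK(call)                                               \
    do {                                                               \
        cudaError_t err_ = (call);                                     \
        if (err_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                \
                    cudaGetErrorString(err_), __FILE__, __LINE__);     \
            exit(1);                                                   \
        }                                                              \
    } while (0)

// Possible usage around the existing calls in main:
//   CUDA_CHECK(cudaMalloc((void **)&dArray, sizeof(double)*4*N));
//   fillArray<<<nBlocks,nThreads>>> ((double4 *)dArray, N);
//   CUDA_CHECK(cudaGetLastError());
//   CUDA_CHECK(cudaDeviceSynchronize());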
I compiled test.cu with nvcc -g -G ./test.cu -o ./exe, and the following is the CUDA-GDB session.
$ cuda-gdb ./exe
(cuda-gdb) b test.cu:30
(cuda-gdb) r
Thread 1 "exe" hit Breakpoint 1, dummyFunction<<<(1,1,1),(1,1,1)>>> (Array=0x2aaae9800000, target=100) at test.cu:31
31 Array[target/2] = tmpArray;
(cuda-gdb) print Array[199999]
Error: Failed to read generic memory at address 0x2aaae9e1a7e0 on device 0 sm 0 warp 1 lane 0, error=CUDBG_ERROR_INVALID_MEMORY_ACCESS(0x8).
(cuda-gdb) print Array[199998]
Error: Failed to read generic memory at address 0x2aaae9e1a7c0 on device 0 sm 0 warp 1 lane 0, error=CUDBG_ERROR_INVALID_MEMORY_ACCESS(0x8).
(cuda-gdb) print Array[199000]
$1 = {x = 199000.10000000001, y = 199000.20000000001, z = 199000.29999999999, w = 199000.39999999999}
(cuda-gdb) b test.cu:46
Breakpoint 2 at 0x403d0b: file test.cu, line 46.
(cuda-gdb) cont
Continuing.
Thread 1 "exe" hit Breakpoint 2, main (argc=1, argv=0x7fffffffda98) at test.cu:46
46 cudaDeviceSynchronize();
(cuda-gdb) print ((@global double4 *)dArray)[199999]
Error: Failed to read 32 bytes of global memory from 0x2aaae9e1a7e0, error=CUDBG_ERROR_INVALID_MEMORY_ACCESS(0x8).
(cuda-gdb) cont
Continuing.
Array[199999] = 1.999991e+05 1.999992e+05 1.999993e+05 1.999994e+05
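(To cross-check this outside the debugger, the tail of the array can also be copied back and printed from the host. A minimal sketch, not part of the original test.cu, that could be placed in main after the kernel launches:)

    // dArray is a double*, so double4 element i starts at dArray + 4*i.
    // Copy the last two double4 elements (indices 199998 and 199999) back to the host.
    double4 hTail[2];
    cudaMemcpy(hTail, dArray + 4*(size_t)(N - 2), sizeof(double4)*2, cudaMemcpyDeviceToHost);
    for (int i = 0; i < 2; i++)
        printf("host check: Array[%d] = %e %e %e %e\n", N - 2 + i,
               hTail[i].x, hTail[i].y, hTail[i].z, hTail[i].w);

If the fill kernel worked, this should print the same values that printFromDevice reports.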
To summarize:
- Memory corruption is not the issue: compute-sanitizer reports no errors.
- The crude printFromDevice kernel prints the correct values, so the double4 array of size 200,000 is properly allocated and filled in device memory.
- Up to some index, CUDA-GDB prints the correct values, but it fails to read values near the end of the array in both cases: i) at a breakpoint inside a __global__ function, and ii) at a breakpoint in host code.
Why is this?
My CUDA-GDB version is as follows:
NVIDIA (R) CUDA Debugger
11.5 release
Portions Copyright (C) 2007-2021 NVIDIA Corporation
GNU gdb (GDB) 10.1
Copyright (C) 2020 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-pc-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see: