Solved: Sorry, it was my fault. I should use atomicAdd(times, 1);
instead of (*times)++
in the kernel function.
I call the kernel function like this
// Launch configuration: a cubic grid of blockSize^3 blocks,
// each block holding 8 * 8 * 8 = 512 threads.
dim3 Dg(blockSize, blockSize, blockSize);
dim3 Db(8, 8, 8);
voxelize << < Dg, Db >> > ();
// Block the host until the kernel has finished running.
cudaDeviceSynchronize();
But I found that my program only solved part of the problem, so I added printf()
to my __global__ function voxelize(),
as in the following code:
// Debug version of the kernel: every thread prints the coordinates of its block
// and then derives its global x/y/z coordinates and a flat index.
// Expects a 3D grid of 3D blocks.
__global__ void voxelize(){
// NOTE(review): this printf is executed by every thread of every block. Device-side
// printf output goes through a fixed-size buffer, so with this many threads earlier
// lines may be overwritten before they are flushed to the host, leaving only the
// most recent ones visible - confirm against the CUDA Programming Guide
// (cudaLimitPrintfFifoSize).
printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);
// Global thread coordinate along each axis (block offset + offset inside the block).
unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;
// NOTE(review): the strides used here are the extents of a single block
// (blockDim.x and blockDim.x*blockDim.y), while xIndex/yIndex range over the whole
// grid, so different threads can compute the same i. Presumably the strides should
// be gridDim.x*blockDim.x and gridDim.y*blockDim.y - confirm the intended layout.
// i is not used in the part of the kernel shown here.
unsigned int i = zIndex * blockDim.x*blockDim.y+ yIndex * blockDim.x+ xIndex;
}
The output showed that only the last part of each dimension ran (that is, blockIdx.x is always 5, and only some of the blockIdx.z values change from 0 to 5). But I don't understand why. Is there anything wrong with how I call this kernel function? My computer has a GTX 1050 Ti Max-Q and CUDA 10.
Afterwards, I passed a pointer to the kernel to count how many times it runs.
// Host side of the counting experiment: upload a zeroed counter, launch the
// kernel, then copy the counter back and print it.
//
// triangles.size() is 69664 in the run described here. Each block has
// 8 * 8 * 8 = 512 threads, so the cube root of (size / 512), rounded up, is used
// as the grid extent along each axis (presumably one thread per triangle).
int blockSize = ceil(pow(triangles.size() / 512.0, 1.0 / 3));
dim3 Dg(blockSize, blockSize, blockSize);
dim3 Db(8, 8, 8);
// Host-side counter, initialised to 0, and its device-side copy.
int* times = new int(0);
int* gpu_times;
cudaMalloc((void **)&gpu_times, sizeof(int));
cudaMemcpy(gpu_times, times, sizeof(int), cudaMemcpyHostToDevice);
voxelize << < Dg, Db >> > (gpu_times);
// A kernel launch returns no status itself; an invalid launch configuration is
// only reported through cudaGetLastError().
cudaError_t launchStatus = cudaGetLastError();
if (launchStatus != cudaSuccess)
    std::cout << "voxelize launch failed: " << cudaGetErrorString(launchStatus) << std::endl;
// Wait for the kernel to finish before reading the counter back.
cudaDeviceSynchronize();
cudaMemcpy(times, gpu_times, sizeof(int), cudaMemcpyDeviceToHost);
std::cout << *times << std::endl;
The kernel is modified as follows:
// Debug version of the kernel with an invocation counter: every thread increments
// *times, prints the coordinates of its block, and derives its global x/y/z
// coordinates and a flat index.
// times: pointer to a single int counter that all threads update.
__global__ void voxelize(int* times){
// Plain (non-atomic) read-modify-write on one int shared by every thread:
// concurrent increments race with each other and updates are lost, so the final
// value undercounts the number of threads. atomicAdd(times, 1) is the fix noted
// in this post.
(*times)++;
// NOTE(review): this printf is executed by every thread of every block. Device-side
// printf output goes through a fixed-size buffer, so with this many threads earlier
// lines may be overwritten before they are flushed to the host - confirm against
// the CUDA Programming Guide (cudaLimitPrintfFifoSize).
printf("the thread blockIdx.x %d, blockIdx.y %d blockIdx.z %d\n", blockIdx.x, blockIdx.y, blockIdx.z);
// Global thread coordinate along each axis (block offset + offset inside the block).
unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;
// NOTE(review): the strides used here are the extents of a single block, while
// xIndex/yIndex range over the whole grid, so different threads can compute the
// same i. Presumably the strides should be gridDim.x*blockDim.x and
// gridDim.y*blockDim.y - confirm the intended layout.
// i is not used in the part of the kernel shown here.
unsigned int i = zIndex * blockDim.x*blockDim.y+ yIndex * blockDim.x+ xIndex;
}
The output is 141:
it shows the kernel ran only 141 times, but in fact the count should be far more than 69664.
Sorry, it was my fault: I should use atomicAdd(times, 1);
instead of (*times)++
in the kernel.
But why does printf()
only output part of the indices, as I described before?