I created a CUDA function for calculating the sum of an image using its histogram.
I'm trying to compile the kernel and the wrapper function for multiple compute capabilities.
Kernel:
__global__ void calc_hist(unsigned char* pSrc, int* hist, int width, int height, int pitch)
{
int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
#if __CUDA_ARCH__ > 110 //Shared Memory For Devices Above Compute 1.1
__shared__ int shared_hist[256];
#endif
int global_tid = yIndex * pitch + xIndex;
int block_tid = threadIdx.y * blockDim.x + threadIdx.x;
if(xIndex>=width || yIndex>=height) return;
#if __CUDA_ARCH__ == 110 //Calculate Histogram In Global Memory For Compute 1.1
atomicAdd(&hist[pSrc[global_tid]],1); /*< Atomic Add In Global Memory */
#elif __CUDA_ARCH__ > 110 //Calculate Histogram In Shared Memory For Compute Above 1.1
shared_hist[block_tid] = 0; /*< Clear Shared Memory */
__syncthreads();
atomicAdd(&shared_hist[pSrc[global_tid]],1); /*< Atomic Add In Shared Memory */
__syncthreads();
if(shared_hist[block_tid] > 0) /* Only Write Non Zero Bins Into Global Memory */
atomicAdd(&(hist[block_tid]),shared_hist[block_tid]);
#else
return; //Do Nothing For Devices Of Compute Capabilty 1.0
#endif
}
Wrapper Function:
int sum_8u_c1(unsigned char* pSrc, double* sum, int width, int height, int pitch, cudaStream_t stream = NULL)
{
#if __CUDA_ARCH__ == 100
printf("Compute Capability Not Supported\n");
return 0;
#else
int *hHist,*dHist;
cudaMalloc(&dHist,256*sizeof(int));
cudaHostAlloc(&hHist,256 * sizeof(int),cudaHostAllocDefault);
cudaMemsetAsync(dHist,0,256 * sizeof(int),stream);
dim3 Block(16,16);
dim3 Grid;
Grid.x = (width + Block.x - 1)/Block.x;
Grid.y = (height + Block.y - 1)/Block.y;
calc_hist<<<Grid,Block,0,stream>>>(pSrc,dHist,width,height,pitch);
cudaMemcpyAsync(hHist,dHist,256 * sizeof(int),cudaMemcpyDeviceToHost,stream);
cudaStreamSynchronize(stream);
(*sum) = 0.0;
for(int i=1; i<256; i++)
(*sum) += (hHist[i] * i);
printf("sum = %f\n",(*sum));
cudaFree(dHist);
cudaFreeHost(hHist);
return 1;
#endif
}
Question 1:
When compiling for sm_10
, the wrapper and the kernel shouldn't execute. But that is not what happens. The whole wrapper function executes. The output shows sum = 0.0
.
I expected the output to be Compute Capability Not Supported
as I have added the printf
statement in the start of the wrapper function.
How can I prevent the wrapper function from executing on sm_10
? I don't want to add any run-time checks like if statements etc. Can it be achieved through template meta programming?
Question 2:
When compiling for greater than sm_10
, the program executes correctly only if I add cudaStreamSynchronize
after the kernel call. But if I do not synchronize, the output is sum = 0.0
. Why is it happening? I want the function to be asynchronous w.r.t the host as much as possible. Is it possible to shift the only loop inside the kernel?
I am using GTX460M, CUDA 5.0, Visual Studio 2008 on Windows 8.