I am having an issue with the following code. The following code takes an input image and it should save the grayscale of it. Unfortunately, it seems to perform the expected behavior but it is processing just a part of the image and not the whole. It seems that the problems occurs in the cudamemcpy from device to host.
i believe that probably I got some issue while I am allocating memory in Cuda.
__global__ void rgb2grayCudaKernel(unsigned char *inputImage, unsigned char *grayImage, const int width, const int height)
{
int ty = (blockIdx.x * blockDim.x) + threadIdx.x;
//int tx = (blockIdx.x * blockDim.x) + threadIdx.x;
int tx = (blockIdx.y * blockDim.y) + threadIdx.y;
if( (ty < height && tx<width) )
{
float grayPix = 0.0f;
float r = static_cast< float >(inputImage[(ty * width) + tx]);
float g = static_cast< float >(inputImage[(width * height) + (ty * width) + tx]);
float b = static_cast< float >(inputImage[(2 * width * height) + (ty * width) + tx]);
grayPix = (0.3f * r) + (0.59f * g) + (0.11f * b);
grayImage[(ty * width) + tx] = static_cast< unsigned char >(grayPix);
}
}
//***************************************rgb2gray function, call of kernel in here *************************************
void rgb2grayCuda(unsigned char *inputImage, unsigned char *grayImage, const int width, const int height)
{
unsigned char *inputImage_c, *grayImage_c;
const int sizee= (width*height);
// **********memory allocation for pointers and cuda******************
cudaMalloc((void **) &inputImage_c, sizee);
checkCudaError("im not alloc!");
cudaMalloc((void **) &grayImage_c, sizee);
checkCudaError("gray not alloc !");
//***********copy to device*************************
cudaMemcpy(inputImage_c, inputImage, sizee*sizeof(unsigned char), cudaMemcpyHostToDevice);
checkCudaError("im not send !");
cudaMemcpy(grayImage_c, grayImage, sizee*sizeof(unsigned char), cudaMemcpyHostToDevice);
checkCudaError("gray not send !");
dim3 thrb(32,32);
dim3 numb (ceil(width*height/1024));
//**************Execute Kernel (Timer in here)**************************
NSTimer kernelTime = NSTimer("kernelTime", false, false);
kernelTime.start();
rgb2grayCudaKernel<<<numb,1024>>> (inputImage_c, grayImage_c, width, height);
checkCudaError("kernel!");
kernelTime.stop();
//**************copy back to host*************************
printf("/c");
cudaMemcpy(grayImage, grayImage_c, sizee*sizeof(unsigned char), cudaMemcpyDeviceToHost);
checkCudaError("Receiving data from CPU failed!");
//*********************free memory***************************
cudaFree(inputImage_c);
cudaFree(grayImage_c);
//**********************print time****************
cout << fixed << setprecision(6);
cout << "rgb2gray (cpu): \t\t" << kernelTime.getElapsed() << " seconds." << endl;
}