There appear to be 2 categories of problems with your code.
First, you have various problems with data sizes. Not sure where the disconnect is here, since some of it you have right, so I'll just point out the things I see.
This construct is correct:
auto err = cudaMalloc((void**)&gpuImg1, dataSize1 * dataSize1 * sizeof(unsigned char));
You should have done the same thing here:
err = cudaMalloc((void**)&gpuImg2, dataSize2);
These are not correct. `cudaMemcpy`, like `memcpy` and like `cudaMalloc`, takes a size parameter in bytes:
err = cudaMemcpy(gpuImg1, img1Data, dataSize1, cudaMemcpyHostToDevice);
err = cudaMemcpy(gpuImg2, img2data, dataSize2, cudaMemcpyHostToDevice);
                                    ^^^^^^^^^
You got it almost correct on the subsequent copy from device to host, except that your `sizeof` should be for the correct type (the result buffer holds `Npp32f` values, not `Npp8u`):
err = cudaMemcpy(resData, gpuDest, resSize * resSize * sizeof(Npp8u), cudaMemcpyDeviceToHost);
                                                              ^^^^^
Second, you are using the normalized version of the cross correlation. If you study the documentation, I believe you will find that it is possible for the denominator to be calculated as the square root of zero when large portions of your image are zero-valued. Anyway, when I convert the "background" from 0 to 1, I get sensible results. Another option would be to switch to the non-normalized version of the function (`nppiCrossCorrValid_8u32f_C1R`), which also yields non-NaN results, even with large areas of zero "background".
Here is a corrected version; I think it will give you non-NaN results:
# cat t14.cu
#include <npp.h>
#include <cstdlib>
#include <cstring>
#include <iostream>
int main(int argc, char* argv[])
{
Npp8u* gpuImg1, * gpuImg2;
Npp32f *gpuDest;
// cudaDeviceInit(argc, (const char**)argv);
long dataSize1 = 128;
auto err = cudaMalloc((void**)&gpuImg1, dataSize1 * dataSize1 * sizeof(unsigned char));
unsigned char *img1Data = static_cast<unsigned char*>(malloc(dataSize1 * dataSize1 * sizeof(unsigned char)));
memset(img1Data, 1, dataSize1 * dataSize1);
for(auto y = 40; y < 60; y++)
{
for(auto x = 20; x < 40; x++)
{
img1Data[y * dataSize1 + x] = 0xff;
}
}
long dataSize2 = 64;
err = cudaMalloc((void**)&gpuImg2, dataSize2*dataSize2 *sizeof(unsigned char));
unsigned char *img2data = static_cast<unsigned char*>(malloc(dataSize2 * dataSize2 * sizeof(unsigned char)));
memset(img2data, 1, dataSize2 * dataSize2);
for (auto y = 10; y < 30; y++)
{
for (auto x = 20; x < 40; x++)
{
img2data[y * dataSize2 + x] = 0xff;
}
}
auto resSize = (dataSize1 - dataSize2) + 1;
err = cudaMalloc((void**)&gpuDest, resSize * resSize * sizeof(Npp32f));
auto resData = static_cast<Npp32f*>(malloc(resSize * resSize * sizeof(Npp32f)));
NppiSize nppiSize1;
nppiSize1.height = dataSize1;
nppiSize1.width = dataSize1;
NppiSize nppiSize2;
nppiSize2.height = dataSize2;
nppiSize2.width = dataSize2;
err = cudaMemcpy(gpuImg1, img1Data, dataSize1*dataSize1*sizeof(unsigned char), cudaMemcpyHostToDevice);
err = cudaMemcpy(gpuImg2, img2data, dataSize2*dataSize2*sizeof(unsigned char), cudaMemcpyHostToDevice);
auto status = nppiCrossCorrValid_Norm_8u32f_C1R(gpuImg1, dataSize1, nppiSize1, gpuImg2, dataSize2, nppiSize2, gpuDest, resSize * sizeof(Npp32f));
err = cudaMemcpy(resData, gpuDest, resSize * resSize * sizeof(Npp32f), cudaMemcpyDeviceToHost);
for (int i = 0; i < resSize*2; i++)
std::cout << resData[i] << ",";
std::cout << std::endl;
}
# nvcc -std=c++11 -o t14 t14.cu -lnppc -lnppist
# cuda-memcheck ./t14
========= CUDA-MEMCHECK
0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796924,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00797587,0.00798853,0.00800826,0.00803633,0.00807432,0.00812423,0.00818861,0.00827071,0.00837505,0.00850754,0.00867648,0.00889385,0.00917761,0.00955609,0.0100771,0.0108291,0.0119988,0.0140744,0.0190166,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796925,0.00796926,0.00796926,0.00796926,0.00796926,0.00797588,0.00798854,0.00800827,0.00803634,0.00807434,0.00812425,0.00818863,0.00827071,0.00837505,0.00850754,0.00867648,0.00889385,0.00917761,0.00955609,0.0100771,0.0108291,0.0119988,0.0140744,0.0190166,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,0.323817,
========= ERROR SUMMARY: 0 errors
#