nVidia Cuda nppiResize_32f_C1R works OK on grayscale 1 x 32f, but nppiResize_32f_C3R returns garbage. Clearly, a work around would be to call this routine 3 times by first de-interleaving the data as planar R, G, B, but I was expecting to be able to run it through in a single pass. nVidia has a lot of example code for single plane image processing, but a scant amount for interleaved color planes, so I'm turning to stackoverflow for help. I don't know how the stride is computed, but I understand that the stride is the image width times the number of bytes per column index. So in my case - with no padded lines - it should be width 32f x 3 for RGB.
Tried different strides/pitches in cudaMemcpy2D(). Can't get a workable solution for the color RGB code. Compiles & runs OK, no errors. The first section is for Grayscale (works OK). The second section is RGB (garbage).
// nppiResize using 2D aligned allocations
#include <Exceptions.h>
#include <cuda_runtime.h>
#include <npp.h>
#include <nppi.h>
#include <nppdefs.h>
#define CUDA_CALL(call) do { cudaError_t cuda_error = call; if(cuda_error != cudaSuccess) { std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_error) << ", " << __FILE__ << ", line " << __LINE__ << std::endl; return(NULL);} } while(0)
float* decimate_cuda(float* readbuff, uint32_t nSrcH, uint32_t nSrcW, uint32_t nDstH, uint32_t nDstW, uint8_t byteperpixel)
{
if (byteperpixel == 1){ // source : Grayscale, 1 x 32f
size_t srcStep;
size_t dstStep;
NppiSize oSrcSize = {nSrcW, nSrcH};
NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
float *devSrc;
CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, nSrcW * sizeof(float), nSrcH));
CUDA_CALL(cudaMemcpy2D((void**)devSrc, srcStep,(void**)readbuff, nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));
NppiSize oDstSize = {nDstW, nDstH};
NppiRect oDstROI = {0, 0, nDstW, nDstH};
float *devDst;
CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, nDstW * sizeof(float), nDstH));
NppStatus result = nppiResize_32f_C1R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
if (result != NPP_SUCCESS) {
std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
}
Npp64s writesize;
Npp32f *hostDst;
writesize = (Npp64s) nDstW * nDstH; // Y
if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
exit(1);
}
CUDA_CALL(cudaMemcpy2D(hostDst, nDstW * sizeof(Npp32f),(void**)devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
CUDA_CALL(cudaFree(devSrc));
CUDA_CALL(cudaFree(devDst));
return(hostDst);
} // source : Grayscale 1 x 32f, YYYY...
else if (byteperpixel == 3){ // source : 3 x 32f interleaved RGBRGBRGB...
size_t srcStep;
size_t dstStep;
// rows = height; columns = width
NppiSize oSrcSize = {nSrcW, nSrcH};
NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
float *devSrc;
CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, 3 * nSrcW * sizeof(float), nSrcH));
CUDA_CALL(cudaMemcpy2D((void**)devSrc, srcStep, (void**)readbuff, 3 * nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));
NppiSize oDstSize = {nDstW, nDstH};
NppiRect oDstROI = {0, 0, nDstW, nDstH};
float *devDst;
CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, 3 * nDstW * sizeof(float), nDstH));
NppStatus result = nppiResize_32f_C3R((devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
if (result != NPP_SUCCESS) {
std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
}
Npp64s writesize;
Npp32f *hostDst;
writesize = (Npp64s) nDstW * nDstH * 3; // RGB
if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
exit(1);
}
CUDA_CALL(cudaMemcpy2D((void**)hostDst, nDstW * sizeof(Npp32f), (void**)devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
CUDA_CALL(cudaFree(devSrc));
CUDA_CALL(cudaFree(devDst));
return(hostDst);
} // source - 3 x 32f, interleaved RGBRGBRGB...
return(0);
}