0

nVidia Cuda nppiResize_32f_C1R works OK on grayscale 1 x 32f, but nppiResize_32f_C3R returns garbage. Clearly, a work around would be to call this routine 3 times by first de-interleaving the data as planar R, G, B, but I was expecting to be able to run it through in a single pass. nVidia has a lot of example code for single plane image processing, but a scant amount for interleaved color planes, so I'm turning to stackoverflow for help. I don't know how the stride is computed, but I understand that the stride is the image width times the number of bytes per column index. So in my case - with no padded lines - it should be width 32f x 3 for RGB.

Tried different strides/pitches in cudaMemcpy2D(). Can't get a workable solution for the color RGB code. Compiles & runs OK, no errors. The first section is for Grayscale (works OK). The second section is RGB (garbage).

    // nppiResize using 2D aligned allocations

    #include <Exceptions.h>
    #include <cuda_runtime.h>
    #include <npp.h>
    #include <nppi.h>
    #include <nppdefs.h>

    #define CUDA_CALL(call) do { cudaError_t cuda_error = call; if(cuda_error != cudaSuccess) { std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_error) << ", " << __FILE__ << ", line " << __LINE__ << std::endl; return(NULL);} } while(0)

    float* decimate_cuda(float* readbuff, uint32_t nSrcH, uint32_t nSrcW, uint32_t nDstH, uint32_t nDstW, uint8_t byteperpixel)
    {
        if (byteperpixel == 1){ // source : Grayscale, 1 x 32f
            size_t  srcStep; 
            size_t  dstStep;

            NppiSize oSrcSize = {nSrcW, nSrcH};
            NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
            float *devSrc;
            CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, nSrcW * sizeof(float), nSrcH));
            CUDA_CALL(cudaMemcpy2D((void**)devSrc, srcStep,(void**)readbuff, nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));

            NppiSize oDstSize = {nDstW, nDstH};     
            NppiRect oDstROI = {0, 0, nDstW, nDstH}; 
            float *devDst;
            CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, nDstW * sizeof(float), nDstH));

NppStatus result = nppiResize_32f_C1R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
            if (result != NPP_SUCCESS) {
                std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
            }

            Npp64s                 writesize;
            Npp32f                 *hostDst;
            writesize = (Npp64s)   nDstW * nDstH;         // Y
            if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
                printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
                exit(1);
            }

            CUDA_CALL(cudaMemcpy2D(hostDst, nDstW * sizeof(Npp32f),(void**)devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
            CUDA_CALL(cudaFree(devSrc));
            CUDA_CALL(cudaFree(devDst));
            return(hostDst);
        }                            // source : Grayscale 1 x 32f, YYYY...
        else if (byteperpixel == 3){ // source : 3 x 32f interleaved RGBRGBRGB...
            size_t  srcStep; 
            size_t  dstStep;
            // rows = height; columns = width

            NppiSize oSrcSize = {nSrcW, nSrcH};
            NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
            float *devSrc;
CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, 3 * nSrcW * sizeof(float), nSrcH));
CUDA_CALL(cudaMemcpy2D((void**)devSrc, srcStep, (void**)readbuff, 3 * nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));

            NppiSize oDstSize = {nDstW, nDstH}; 
            NppiRect oDstROI = {0, 0, nDstW, nDstH};
            float *devDst;
CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, 3 * nDstW * sizeof(float), nDstH));

NppStatus result = nppiResize_32f_C3R((devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
if (result != NPP_SUCCESS) {
                std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
            }

            Npp64s                 writesize;
            Npp32f                 *hostDst;
            writesize = (Npp64s)   nDstW * nDstH * 3;          // RGB
            if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
                printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
                exit(1);
            }

            CUDA_CALL(cudaMemcpy2D((void**)hostDst, nDstW * sizeof(Npp32f), (void**)devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));

            CUDA_CALL(cudaFree(devSrc));
            CUDA_CALL(cudaFree(devDst));
            return(hostDst);
        }        // source - 3 x 32f, interleaved RGBRGBRGB...

        return(0);
    }
sgarizvi
  • 16,623
  • 9
  • 64
  • 98
  • 2
    your final `cudaMemcpy2D` looks wrong. how could `nDstW * sizeof(Npp32f)` possibly be correct for a 3 channel image? – Robert Crovella Sep 30 '19 at 01:37
  • Thank you very much. In this case you're correct on speed differences between pitched & linear memory. I reverted to cudaMallocPitch thinking I would overcome the same issue I was having using nppiMalloc_32f_C3 , and today saw dst I was using nppiMalloc_32f_C1 (for RGB), whereas src was correctly allocated with nppiMalloc_32f_C3, notwithstanding the cudaMemcpy2D mistakes... desperation leads to recklessness. I'm new to CUDA. Thanks for the main(), and for making me aware its required. It will come in handy when debugging in the future. – plettplett Sep 30 '19 at 22:01
  • Hi, yes you're correct. It was a case of confusion.Thanks for taking a look & commenting. Would you know why NPPI_INTER_CUBIC, NPPI_INTER_LANCZOS, NPPI_INTER_CUBIC2P_CATMULLROM, NPPI_INTER_CUBIC2P_B05C03, NPPI_INTER_LANCZOS3_ADVANCED, NPPI_INTER_CUBIC2P_BSPLINE, NPPI_SMOOTH_EDGE all look the same when downsizing by a factor of around 4, height & width. The only filter that looks like its not "bi-linear" is NPPI_INTER_SUPER. I'm curious to see the downsizing results with LANCZOS. Thank you. – plettplett Sep 30 '19 at 22:10

1 Answers1

1

You had various errors in your calls to cudaMemcpy2D (both of them, in the 3 channel code). This code seems to work for me:

$ cat t1521.cu
    #include <cuda_runtime.h>
    #include <npp.h>
    #include <nppi.h>
    #include <nppdefs.h>
    #include <iostream>
    #include <stdint.h>
    #include <stdio.h>
    #define CUDA_CALL(call) do { cudaError_t cuda_error = call; if(cuda_error != cudaSuccess) { std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_error) << ", " << __FILE__ << ", line " << __LINE__ << std::endl; return(NULL);} } while(0)
    using namespace std;
    float* decimate_cuda(float* readbuff, uint32_t nSrcH, uint32_t nSrcW, uint32_t nDstH, uint32_t nDstW, uint8_t byteperpixel)
    {
        if (byteperpixel == 1){ // source : Grayscale, 1 x 32f
            size_t  srcStep;
            size_t  dstStep;

            NppiSize oSrcSize = {nSrcW, nSrcH};
            NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
            float *devSrc;
            CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, nSrcW * sizeof(float), nSrcH));
            CUDA_CALL(cudaMemcpy2D(devSrc, srcStep,readbuff, nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));

            NppiSize oDstSize = {nDstW, nDstH};
            NppiRect oDstROI = {0, 0, nDstW, nDstH};
            float *devDst;
            CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, nDstW * sizeof(float), nDstH));

            NppStatus result = nppiResize_32f_C1R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
            if (result != NPP_SUCCESS) {
                std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
            }

            Npp64s                 writesize;
            Npp32f                 *hostDst;
            writesize = (Npp64s)   nDstW * nDstH;         // Y
            if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
                printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
                exit(1);
            }

            CUDA_CALL(cudaMemcpy2D(hostDst, nDstW * sizeof(Npp32f),devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
            CUDA_CALL(cudaFree(devSrc));
            CUDA_CALL(cudaFree(devDst));
            return(hostDst);
        }                            // source : Grayscale 1 x 32f, YYYY...
        else if (byteperpixel == 3){ // source : 3 x 32f interleaved RGBRGBRGB...
            size_t  srcStep;
            size_t  dstStep;
            // rows = height; columns = width

            NppiSize oSrcSize = {nSrcW, nSrcH};
            NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
            float *devSrc;
            CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, 3 * nSrcW * sizeof(float), nSrcH));
            CUDA_CALL(cudaMemcpy2D(devSrc, srcStep,readbuff, 3 * nSrcW * sizeof(Npp32f), 3*nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));

            NppiSize oDstSize = {nDstW, nDstH};
            NppiRect oDstROI = {0, 0, nDstW, nDstH};
            float *devDst;
            CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, 3 * nDstW * sizeof(float), nDstH));

            NppStatus result = nppiResize_32f_C3R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
            if (result != NPP_SUCCESS) {
                std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
            }

            Npp64s                 writesize;
            Npp32f                 *hostDst;
            writesize = (Npp64s)   nDstW * nDstH * 3;          // RGB
            if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
                printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
                exit(1);
            }

            CUDA_CALL(cudaMemcpy2D(hostDst, nDstW*3 * sizeof(Npp32f), devDst, dstStep, nDstW*3 * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));

            CUDA_CALL(cudaFree(devSrc));
            CUDA_CALL(cudaFree(devDst));
            return(hostDst);
        }        // source - 3 x 32f, interleaved RGBRGBRGB...

        return(0);
    }

int main(){
    uint32_t nSrcH = 480;
    uint32_t nSrcW = 640;
    uint8_t byteperpixel = 3;
    float *readbuff = (float *)malloc(nSrcW*nSrcH*byteperpixel*sizeof(float));
    for (int i = 0; i < nSrcH*nSrcW; i++){
      readbuff [i*3+0] = 1.0f;
      readbuff [i*3+1] = 2.0f;
      readbuff [i*3+2] = 3.0f;}
    uint32_t nDstW = nSrcW/2;
    uint32_t nDstH = nSrcH/2;
    float *res =  decimate_cuda(readbuff, nSrcH, nSrcW, nDstH, nDstW, byteperpixel);
    for (int i = 0; i < nDstH*nDstW*byteperpixel; i++) if (res[i] != ((i%3)+1.0f)) {std::cout << "error at: " << i << std::endl; return 0;}
    return 0;
}
$ nvcc -o t1521 t1521.cu -lnppig
$ cuda-memcheck ./t1521
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

In the future, its convenient if you provide a complete code, just as I have done in my answer. In fact SO requires this, see item 1 here.

By the way, the use of pitched allocations on the device, here, which introduce complexity that you were not able to work your way through, should really be unnecessary both for correctness and performance, using any modern GPU and CUDA version. Ordinary linear/flat allocations, where pitch==width, should be just fine.

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257