Cuda NPP Filter Median functions

Question

Was wondering if any one more versed in how to call this NPP cuda function could tell me where the mistake is occurring?


#define gpuErrchk(ans) gpuAssert((ans), __FILE__, __LINE__)
inline int gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        return 1;
    }
    return 0;
}

int gpuMedfilt2(const float* pSrc, float* pDst, int M, int N, int winSize)
{

    NppStatus status;
    Npp32f* d_in, *d_out;
    Npp32s nSrcStep = N * sizeof(float), nDstStep = N * sizeof(float);
    NppiSize oSizeROI = {N, M};
    NppiSize oMaskSize = {winSize, winSize};
    NppiPoint oAnchor = {oMaskSize.width / 2, oMaskSize.height / 2};
    Npp8u* pBuffer;
    Npp32u pBufferSize;
    size_t d_in_pitch, d_out_pitch;


    if (gpuErrchk(cudaMallocPitch((void**)&d_in, &d_in_pitch, N * sizeof(float), M)))
        return 0;

    if (gpuErrchk(cudaMallocPitch((void**)&d_out, &d_out_pitch, N * sizeof(float), M)))
    {
        cudaFree((void*)d_in);
        return 0;
    }
    if (gpuErrchk(cudaMemcpy2D((void*)d_in, d_in_pitch, (const void*)pSrc, nSrcStep, N * sizeof(float), M, cudaMemcpyHostToDevice)))
    {
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    if ((status = nppiFilterMedianGetBufferSize_32f_C1R(oSizeROI, oMaskSize, &pBufferSize)) != NPP_SUCCESS)
    {
        fprintf(stderr, "NPP Error: Failed to calculate buffer space for median filter operation\n");
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    if (gpuErrchk(cudaMalloc((void**)&pBuffer, pBufferSize)))
    {
        fprintf(stderr, "NPP Error: Failed to allocate buffer space for median filter operation\n");
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    if ((status = nppiFilterMedian_32f_C1R(d_in, d_in_pitch, d_out, d_out_pitch, oSizeROI, oMaskSize, oAnchor, pBuffer)) != NPP_SUCCESS)
    {
        fprintf(stderr, "NPP Error: Failed to call nppiFilterMedian_32f_C1R function\n");
        cudaFree((void*)pBuffer);
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    if (gpuErrchk(cudaMemcpy2D((void*)pDst, nDstStep, (const void*)d_out, d_out_pitch, sizeof(float) * N, M, cudaMemcpyDeviceToHost)))
    {
        cudaFree((void*)pBuffer);
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    return 1;
}

compute-sanitizer shows many errors.

My main looks like this,

float* in = malloc(sizeof(float) * M * N);
float* out = malloc(sizeof(float) * M * N);
gpuMedfilt2(in, out, M, N, 5);

Thank you for any insight.

I feel like something is wrong with the cudaMallocPitch and the cudaMemcpy2D. I am not getting the memory sizes right?

Here is output from compute-sanitizer, this occurs around ~100+ times.

========= Invalid __global__ read of size 4 bytes
=========     at 0x2b8 in void FilterMedianKernelSortingNetworkShared::RunKernel5x5<float, (int)1, (int)1, (int)25>(Pixel<T1, T2> *, int, NppiSize, NppiSize, const Pixel<T1, T2> *, int, int)
=========     by thread (1,0,0) in block (0,2,0)
=========     Address 0x701d1fffc is out of bounds
=========     and is 4 bytes before the nearest allocation at 0x701d20000 of size 25,600 bytes
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame: [0x30b442]
=========                in /lib/x86_64-linux-gnu/libcuda.so.1
=========     Host Frame: [0x393adb]
=========                in /home/rctodd/cuda11.7/lib/libnppif.so.11
=========     Host Frame: [0x3ef278]
=========                in /home/rctodd/cuda11.7/lib/libnppif.so.11
=========     Host Frame: [0x140046]
=========                in /home/rctodd/cuda11.7/lib/libnppif.so.11
=========     Host Frame: [0x1401cb]
=========                in /home/rctodd/cuda11.7/lib/libnppif.so.11
=========     Host Frame:nppiFilterMedian_32f_C1R [0x12266f]
=========                in /home/rctodd/cuda11.7/lib/libnppif.so.11
=========     Host Frame:gpuMedfilt2 [0x1601]
=========                in /home/rctodd/code/cuda/cuMedfilt2/libgpuMedfilt2.so
=========     Host Frame:main [0x127e]
=========                in /home/rctodd/code/cuda/cuMedfilt2/./app
=========     Host Frame:../csu/libc-start.c:342:__libc_start_main [0x24083]
=========                in /lib/x86_64-linux-gnu/libc.so.6
=========     Host Frame:_start [0x110e]
=========                in /home/rctodd/code/cuda/cuMedfilt2/./app
=========

ldd of my shared library I compile against

ldd libgpuMedfilt2.so 
    linux-vdso.so.1 (0x00007fff6533d000)
    libcudart.so.11.0 => /home/rctodd/cuda11.7/lib/libcudart.so.11.0 (0x00007f1bb2911000)
    libnppif.so.11 => /home/rctodd/cuda11.7/lib/libnppif.so.11 (0x00007f1badfbc000)
    libstdc++.so.6 => /lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007f1baddc8000)
    libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f1badbd6000)
    /lib64/ld-linux-x86-64.so.2 (0x00007f1bb2bbd000)
    libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f1badbd0000)
    libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f1badbad000)
    librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007f1badba1000)
    libnppc.so.11 => /home/rctodd/cuda11.7/lib/libnppc.so.11 (0x00007f1bad813000)
    libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f1bad6c4000)
    libgcc_s.so.1 => /home/rctodd/cuda11.7/lib/libgcc_s.so.1 (0x00007f1bad6ab000)

Here is my compile commands

# Shared library
/./home/rctodd/cuda11.7/bin/nvcc -o libgpuMedfilt2.so -shared gpuMedfilt2.cu --compiler-options '-fPIC' -Xlinker -L/home/rctodd/cuda11.7/lib -Xlinker -rpath=/home/rctodd/cuda11.7/lib -lcudart -lnppif -arch=sm_50
# Application
gcc -o app main.c -L$(pwd) -Wl,-rpath=$(pwd) -lgpuMedfilt2

Robert Crovella · Answer 1 · 2023-05-26T21:09:35.360

The "ordinary" filtering functions provided by NPP expect that any placement of the mask/filter kernel will land on properly defined pixels in the image. The ramification of this is that you cannot filter an input image edge-to-edge this way. You must leave an unfiltered boundary, the size of which will depend on your mask/filter kernel dimensions. (Some NPP filter functions provide a Boundary variant which will have "automatic" handling of boundary pixels, i.e. pixels needed for calculation but which fall outside of the defined image, but median filter is not one of those. )

Your code violates this expectation, so its not surprising that compute-sanitizer reports illegal, out-of-bounds accesses.

A typical method to address this expectation is to restrict the filter to a region that "fits within" the original image, leaving enough border area of defined pixels so that the placement of the filter within the filter region always selects defined pixels (from the original image) within the filter kernel area.

The choice of filter kernel anchor pixel will affect this, but you have chosen a "typical" anchor at the center of the filter kernel.

Therefore, in your case we can filter a "central region", leaving a boundary of 2 pixels unfiltered, at the top, bottom, left, and right of the original image, resulting in a filtered image that is 4 pixels less than the original dimensions for horizontal and vertical

Here is a worked example. The median filter is interesting for several reasons. One of its capabilities is to leave image edges "intact" while still offering something like a "low-pass" filter effect for "noise" having certain properties. The following example demonstrates that:

$ cat t19.cu
#include <npp.h>
#include <nppi.h>
#include <cstdio>
#include <iostream>

#define gpuErrchk(ans) gpuAssert((ans), __FILE__, __LINE__)
inline int gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        return 1;
    }
    return 0;
}

int gpuMedfilt2(const float* pSrc, float* pDst, int ih, int iw, int winSize)
{

    NppStatus status;
    Npp32f* d_in, *d_out;
    Npp32s nSrcStep = iw * sizeof(float), nDstStep = iw * sizeof(float);
    NppiSize oMaskSize = {winSize, winSize};
    NppiPoint oAnchor = {oMaskSize.width / 2, oMaskSize.height / 2};
    NppiSize oSizeROI = {iw-2*oAnchor.x, ih-2*oAnchor.y};
    Npp8u* pBuffer;
    Npp32u pBufferSize;
    size_t d_in_pitch, d_out_pitch;

    if (gpuErrchk(cudaMallocPitch((void**)&d_in, &d_in_pitch, iw * sizeof(float), ih)))
        return 0;

    if (gpuErrchk(cudaMallocPitch((void**)&d_out, &d_out_pitch, iw * sizeof(float), ih)))
    {
        cudaFree((void*)d_in);
        return 0;
    }
    if (gpuErrchk(cudaMemcpy2D((void*)(d_in), d_in_pitch, (const void*)pSrc, nSrcStep, iw * sizeof(float), ih, cudaMemcpyHostToDevice)))
    {
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    if ((status = nppiFilterMedianGetBufferSize_32f_C1R(oSizeROI, oMaskSize, &pBufferSize)) != NPP_SUCCESS)
    {
        fprintf(stderr, "NPP Error: Failed to calculate buffer space for median filter operation\n");
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    if (gpuErrchk(cudaMalloc((void**)&pBuffer, pBufferSize)))
    {
        fprintf(stderr, "NPP Error: Failed to allocate buffer space for median filter operation\n");
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }
    cudaMemset(d_out, 0, ih*d_out_pitch); // filter will not touch every output pixel
    if ((status = nppiFilterMedian_32f_C1R((float *)((unsigned char *)d_in+oAnchor.y*d_in_pitch)+oAnchor.x, d_in_pitch, (float *)((unsigned char *)d_out+oAnchor.y*d_out_pitch)+oAnchor.x, d_out_pitch, oSizeROI, oMaskSize, oAnchor, pBuffer)) != NPP_SUCCESS)
    {
        fprintf(stderr, "NPP Error: Failed to call nppiFilterMedian_32f_C1R function\n");
        cudaFree((void*)pBuffer);
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    if (gpuErrchk(cudaMemcpy2D((void*)pDst, nDstStep, (const void*)(d_out), d_out_pitch, sizeof(float) * iw, ih, cudaMemcpyDeviceToHost)))
    {
        cudaFree((void*)pBuffer);
        cudaFree((void*)d_in);
        cudaFree((void*)d_out);
        return 0;
    }

    return (int)status;
}

int main(){
    const int sz = 36;
    const int iw = sz;
    const int ih = sz;
    const int ms = 5;
    float* in = (float *)malloc(sizeof(float) * ih * iw);
    float* out = (float *)malloc(sizeof(float) * ih * iw);
    for (int i = 0; i < sz; i++)
      for (int j = 0; j < sz; j++) {
        float pix_val = (j>(sz/2))?1.0f:0;  // create image with edge
        if ((j%ms==0)&&(i%ms==0)) pix_val += 0.1f; // additive noise
        in[i*sz+j] = pix_val;}
    gpuMedfilt2(in, out, ih, iw, ms);
    for (int i = 0; i < sz; i++) {
      for (int j = 0; j < sz; j++)
        std::cout << out[i*sz+j] << " ";
      std::cout << std::endl;
      }
}
$ nvcc -o t19 t19.cu -lnpps -lnppif
$ compute-sanitizer ./t19
========= COMPUTE-SANITIZER
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
========= ERROR SUMMARY: 0 errors
$

We note that the additive noise is gone, and the vertical edge is "intact" (i.e. unchanged) at the center of the image. We also note that there appears to be a boundary of 2 pixels all the way around, which are unfiltered (and set to 0 by the cudaMemset operation). Another possible method to handle the output border region (rather than setting it to zero) would be to copy the input image to the output image, before the filtering operation, in place of the cudaMemset operation, effectively setting output pixels equal to input pixels, in the border region.

Ahh, so because I was giving the base pointer, where the allocation started, and my anchor point was at the middle of the mask, the median filter function was actually grabbing memory at the -winSize/2 row and -winSize/2 column to get info for the median value at the [0,0] index? I thought that might have been happening, but then again I figured it wouldn't try and grab any memory addresses outside of the region of interest. — Batmanslow23, May 27 '23 at 23:02
I am trying to speed up Matlabs medfilt2 function for float/single inputs. But with the memory copying, there isn't any noticeable speed up. — Batmanslow23, May 27 '23 at 23:05

Cuda NPP Filter Median functions

1 Answers1