Was wondering if any one more versed in how to call this NPP cuda function could tell me where the mistake is occurring?
#define gpuErrchk(ans) gpuAssert((ans), __FILE__, __LINE__)
inline int gpuAssert(cudaError_t code, const char *file, int line)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
return 1;
}
return 0;
}
int gpuMedfilt2(const float* pSrc, float* pDst, int M, int N, int winSize)
{
NppStatus status;
Npp32f* d_in, *d_out;
Npp32s nSrcStep = N * sizeof(float), nDstStep = N * sizeof(float);
NppiSize oSizeROI = {N, M};
NppiSize oMaskSize = {winSize, winSize};
NppiPoint oAnchor = {oMaskSize.width / 2, oMaskSize.height / 2};
Npp8u* pBuffer;
Npp32u pBufferSize;
size_t d_in_pitch, d_out_pitch;
if (gpuErrchk(cudaMallocPitch((void**)&d_in, &d_in_pitch, N * sizeof(float), M)))
return 0;
if (gpuErrchk(cudaMallocPitch((void**)&d_out, &d_out_pitch, N * sizeof(float), M)))
{
cudaFree((void*)d_in);
return 0;
}
if (gpuErrchk(cudaMemcpy2D((void*)d_in, d_in_pitch, (const void*)pSrc, nSrcStep, N * sizeof(float), M, cudaMemcpyHostToDevice)))
{
cudaFree((void*)d_in);
cudaFree((void*)d_out);
return 0;
}
if ((status = nppiFilterMedianGetBufferSize_32f_C1R(oSizeROI, oMaskSize, &pBufferSize)) != NPP_SUCCESS)
{
fprintf(stderr, "NPP Error: Failed to calculate buffer space for median filter operation\n");
cudaFree((void*)d_in);
cudaFree((void*)d_out);
return 0;
}
if (gpuErrchk(cudaMalloc((void**)&pBuffer, pBufferSize)))
{
fprintf(stderr, "NPP Error: Failed to allocate buffer space for median filter operation\n");
cudaFree((void*)d_in);
cudaFree((void*)d_out);
return 0;
}
if ((status = nppiFilterMedian_32f_C1R(d_in, d_in_pitch, d_out, d_out_pitch, oSizeROI, oMaskSize, oAnchor, pBuffer)) != NPP_SUCCESS)
{
fprintf(stderr, "NPP Error: Failed to call nppiFilterMedian_32f_C1R function\n");
cudaFree((void*)pBuffer);
cudaFree((void*)d_in);
cudaFree((void*)d_out);
return 0;
}
if (gpuErrchk(cudaMemcpy2D((void*)pDst, nDstStep, (const void*)d_out, d_out_pitch, sizeof(float) * N, M, cudaMemcpyDeviceToHost)))
{
cudaFree((void*)pBuffer);
cudaFree((void*)d_in);
cudaFree((void*)d_out);
return 0;
}
return 1;
}
compute-sanitizer shows many errors.
My main looks like this,
float* in = malloc(sizeof(float) * M * N);
float* out = malloc(sizeof(float) * M * N);
gpuMedfilt2(in, out, M, N, 5);
Thank you for any insight.
I feel like something is wrong with the cudaMallocPitch and the cudaMemcpy2D. I am not getting the memory sizes right?
Here is output from compute-sanitizer, this occurs around ~100+ times.
========= Invalid __global__ read of size 4 bytes
========= at 0x2b8 in void FilterMedianKernelSortingNetworkShared::RunKernel5x5<float, (int)1, (int)1, (int)25>(Pixel<T1, T2> *, int, NppiSize, NppiSize, const Pixel<T1, T2> *, int, int)
========= by thread (1,0,0) in block (0,2,0)
========= Address 0x701d1fffc is out of bounds
========= and is 4 bytes before the nearest allocation at 0x701d20000 of size 25,600 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x30b442]
========= in /lib/x86_64-linux-gnu/libcuda.so.1
========= Host Frame: [0x393adb]
========= in /home/rctodd/cuda11.7/lib/libnppif.so.11
========= Host Frame: [0x3ef278]
========= in /home/rctodd/cuda11.7/lib/libnppif.so.11
========= Host Frame: [0x140046]
========= in /home/rctodd/cuda11.7/lib/libnppif.so.11
========= Host Frame: [0x1401cb]
========= in /home/rctodd/cuda11.7/lib/libnppif.so.11
========= Host Frame:nppiFilterMedian_32f_C1R [0x12266f]
========= in /home/rctodd/cuda11.7/lib/libnppif.so.11
========= Host Frame:gpuMedfilt2 [0x1601]
========= in /home/rctodd/code/cuda/cuMedfilt2/libgpuMedfilt2.so
========= Host Frame:main [0x127e]
========= in /home/rctodd/code/cuda/cuMedfilt2/./app
========= Host Frame:../csu/libc-start.c:342:__libc_start_main [0x24083]
========= in /lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:_start [0x110e]
========= in /home/rctodd/code/cuda/cuMedfilt2/./app
=========
ldd of my shared library I compile against
ldd libgpuMedfilt2.so
linux-vdso.so.1 (0x00007fff6533d000)
libcudart.so.11.0 => /home/rctodd/cuda11.7/lib/libcudart.so.11.0 (0x00007f1bb2911000)
libnppif.so.11 => /home/rctodd/cuda11.7/lib/libnppif.so.11 (0x00007f1badfbc000)
libstdc++.so.6 => /lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007f1baddc8000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f1badbd6000)
/lib64/ld-linux-x86-64.so.2 (0x00007f1bb2bbd000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f1badbd0000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f1badbad000)
librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007f1badba1000)
libnppc.so.11 => /home/rctodd/cuda11.7/lib/libnppc.so.11 (0x00007f1bad813000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f1bad6c4000)
libgcc_s.so.1 => /home/rctodd/cuda11.7/lib/libgcc_s.so.1 (0x00007f1bad6ab000)
Here is my compile commands
# Shared library
/./home/rctodd/cuda11.7/bin/nvcc -o libgpuMedfilt2.so -shared gpuMedfilt2.cu --compiler-options '-fPIC' -Xlinker -L/home/rctodd/cuda11.7/lib -Xlinker -rpath=/home/rctodd/cuda11.7/lib -lcudart -lnppif -arch=sm_50
# Application
gcc -o app main.c -L$(pwd) -Wl,-rpath=$(pwd) -lgpuMedfilt2