Was wondering if any one more versed in how to call this NPP cuda function could tell me where the mistake is occurring?
#define gpuErrchk(ans) gpuAssert((ans), __FILE__, __LINE__)
inline int gpuAssert(cudaError_t code, const char *file, int line)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
return 1;
}
return 0;
}
int gpuMedfilt2(const float* pSrc, float* pDst, int M, int N, int winSize)
{
NppStatus status;
Npp32f* d_in, *d_out;
Npp32s nSrcStep = N * sizeof(float), nDstStep = N * sizeof(float);
NppiSize oSizeROI = {N, M};
NppiSize oMaskSize = {winSize, winSize};
NppiPoint oAnchor = {oMaskSize.width / 2, oMaskSize.height / 2};
Npp8u* pBuffer;
Npp32u pBufferSize;
size_t d_in_pitch, d_out_pitch;
if (gpuErrchk(cudaMallocPitch((void**)&d_in, &d_in_pitch, N * sizeof(float), M)))
return 0;
if (gpuErrchk(cudaMallocPitch((void**)&d_out, &d_out_pitch, N * sizeof(float), M)))
{
cudaFree((void*)d_in);
return 0;
}
if (gpuErrchk(cudaMemcpy2D((void*)d_in, d_in_pitch, (const void*)pSrc, nSrcStep, N * sizeof(float), M, cudaMemcpyHostToDevice)))
{
cudaFree((void*)d_in);
cudaFree((void*)d_out);
return 0;
}
if ((status = nppiFilterMedianGetBufferSize_32f_C1R(oSizeROI, oMaskSize, &pBufferSize)) != NPP_SUCCESS)
{
fprintf(stderr, "NPP Error: Failed to calculate buffer space for median filter operation\n");
cudaFree((void*)d_in);
cudaFree((void*)d_out);
return 0;
}
if (gpuErrchk(cudaMalloc((void**)&pBuffer, pBufferSize)))
{
fprintf(stderr, "NPP Error: Failed to allocate buffer space for median filter operation\n");
cudaFree((void*)d_in);
cudaFree((void*)d_out);
return 0;
}
if ((status = nppiFilterMedian_32f_C1R(d_in, d_in_pitch, d_out, d_out_pitch, oSizeROI, oMaskSize, oAnchor, pBuffer)) != NPP_SUCCESS)
{
fprintf(stderr, "NPP Error: Failed to call nppiFilterMedian_32f_C1R function\n");
cudaFree((void*)pBuffer);
cudaFree((void*)d_in);
cudaFree((void*)d_out);
return 0;
}
if (gpuErrchk(cudaMemcpy2D((void*)pDst, nDstStep, (const void*)d_out, d_out_pitch, sizeof(float) * N, M, cudaMemcpyDeviceToHost)))
{
cudaFree((void*)pBuffer);
cudaFree((void*)d_in);
cudaFree((void*)d_out);
return 0;
}
return 1;
}
compute-sanitizer shows many errors.
My main looks like this,
float* in = malloc(sizeof(float) * M * N);
float* out = malloc(sizeof(float) * M * N);
gpuMedfilt2(in, out, M, N, 5);
Thank you for any insight.
I feel like something is wrong with the cudaMallocPitch and the cudaMemcpy2D. I am not getting the memory sizes right?
Here is output from compute-sanitizer, this occurs around ~100+ times.
========= Invalid __global__ read of size 4 bytes
========= at 0x2b8 in void FilterMedianKernelSortingNetworkShared::RunKernel5x5<float, (int)1, (int)1, (int)25>(Pixel<T1, T2> *, int, NppiSize, NppiSize, const Pixel<T1, T2> *, int, int)
========= by thread (1,0,0) in block (0,2,0)
========= Address 0x701d1fffc is out of bounds
========= and is 4 bytes before the nearest allocation at 0x701d20000 of size 25,600 bytes
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame: [0x30b442]
========= in /lib/x86_64-linux-gnu/libcuda.so.1
========= Host Frame: [0x393adb]
========= in /home/rctodd/cuda11.7/lib/libnppif.so.11
========= Host Frame: [0x3ef278]
========= in /home/rctodd/cuda11.7/lib/libnppif.so.11
========= Host Frame: [0x140046]
========= in /home/rctodd/cuda11.7/lib/libnppif.so.11
========= Host Frame: [0x1401cb]
========= in /home/rctodd/cuda11.7/lib/libnppif.so.11
========= Host Frame:nppiFilterMedian_32f_C1R [0x12266f]
========= in /home/rctodd/cuda11.7/lib/libnppif.so.11
========= Host Frame:gpuMedfilt2 [0x1601]
========= in /home/rctodd/code/cuda/cuMedfilt2/libgpuMedfilt2.so
========= Host Frame:main [0x127e]
========= in /home/rctodd/code/cuda/cuMedfilt2/./app
========= Host Frame:../csu/libc-start.c:342:__libc_start_main [0x24083]
========= in /lib/x86_64-linux-gnu/libc.so.6
========= Host Frame:_start [0x110e]
========= in /home/rctodd/code/cuda/cuMedfilt2/./app
=========
ldd of my shared library I compile against
ldd libgpuMedfilt2.so
linux-vdso.so.1 (0x00007fff6533d000)
libcudart.so.11.0 => /home/rctodd/cuda11.7/lib/libcudart.so.11.0 (0x00007f1bb2911000)
libnppif.so.11 => /home/rctodd/cuda11.7/lib/libnppif.so.11 (0x00007f1badfbc000)
libstdc++.so.6 => /lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007f1baddc8000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f1badbd6000)
/lib64/ld-linux-x86-64.so.2 (0x00007f1bb2bbd000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f1badbd0000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f1badbad000)
librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007f1badba1000)
libnppc.so.11 => /home/rctodd/cuda11.7/lib/libnppc.so.11 (0x00007f1bad813000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f1bad6c4000)
libgcc_s.so.1 => /home/rctodd/cuda11.7/lib/libgcc_s.so.1 (0x00007f1bad6ab000)
Here is my compile commands
# Shared library
/./home/rctodd/cuda11.7/bin/nvcc -o libgpuMedfilt2.so -shared gpuMedfilt2.cu --compiler-options '-fPIC' -Xlinker -L/home/rctodd/cuda11.7/lib -Xlinker -rpath=/home/rctodd/cuda11.7/lib -lcudart -lnppif -arch=sm_50
# Application
gcc -o app main.c -L$(pwd) -Wl,-rpath=$(pwd) -lgpuMedfilt2
The "ordinary" filtering functions provided by NPP expect that any placement of the mask/filter kernel will land on properly defined pixels in the image. The ramification of this is that you cannot filter an input image edge-to-edge this way. You must leave an unfiltered boundary, the size of which will depend on your mask/filter kernel dimensions. (Some NPP filter functions provide a
Boundaryvariant which will have "automatic" handling of boundary pixels, i.e. pixels needed for calculation but which fall outside of the defined image, but median filter is not one of those. )Your code violates this expectation, so its not surprising that
compute-sanitizerreports illegal, out-of-bounds accesses.A typical method to address this expectation is to restrict the filter to a region that "fits within" the original image, leaving enough border area of defined pixels so that the placement of the filter within the filter region always selects defined pixels (from the original image) within the filter kernel area.
The choice of filter kernel anchor pixel will affect this, but you have chosen a "typical" anchor at the center of the filter kernel.
Therefore, in your case we can filter a "central region", leaving a boundary of 2 pixels unfiltered, at the top, bottom, left, and right of the original image, resulting in a filtered image that is 4 pixels less than the original dimensions for horizontal and vertical
Here is a worked example. The median filter is interesting for several reasons. One of its capabilities is to leave image edges "intact" while still offering something like a "low-pass" filter effect for "noise" having certain properties. The following example demonstrates that:
We note that the additive noise is gone, and the vertical edge is "intact" (i.e. unchanged) at the center of the image. We also note that there appears to be a boundary of 2 pixels all the way around, which are unfiltered (and set to 0 by the
cudaMemsetoperation). Another possible method to handle the output border region (rather than setting it to zero) would be to copy the input image to the output image, before the filtering operation, in place of thecudaMemsetoperation, effectively setting output pixels equal to input pixels, in the border region.