cuda nppiResize() for RGB images

1.5k Views Asked by At

nVidia Cuda nppiResize_32f_C1R works OK on grayscale 1 x 32f, but nppiResize_32f_C3R returns garbage. Clearly, a work around would be to call this routine 3 times by first de-interleaving the data as planar R, G, B, but I was expecting to be able to run it through in a single pass. nVidia has a lot of example code for single plane image processing, but a scant amount for interleaved color planes, so I'm turning to stackoverflow for help. I don't know how the stride is computed, but I understand that the stride is the image width times the number of bytes per column index. So in my case - with no padded lines - it should be width 32f x 3 for RGB.

Tried different strides/pitches in cudaMemcpy2D(). Can't get a workable solution for the color RGB code. Compiles & runs OK, no errors. The first section is for Grayscale (works OK). The second section is RGB (garbage).

    // nppiResize using 2D aligned allocations

    #include <Exceptions.h>
    #include <cuda_runtime.h>
    #include <npp.h>
    #include <nppi.h>
    #include <nppdefs.h>

    #define CUDA_CALL(call) do { cudaError_t cuda_error = call; if(cuda_error != cudaSuccess) { std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_error) << ", " << __FILE__ << ", line " << __LINE__ << std::endl; return(NULL);} } while(0)

    float* decimate_cuda(float* readbuff, uint32_t nSrcH, uint32_t nSrcW, uint32_t nDstH, uint32_t nDstW, uint8_t byteperpixel)
    {
        if (byteperpixel == 1){ // source : Grayscale, 1 x 32f
            size_t  srcStep; 
            size_t  dstStep;

            NppiSize oSrcSize = {nSrcW, nSrcH};
            NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
            float *devSrc;
            CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, nSrcW * sizeof(float), nSrcH));
            CUDA_CALL(cudaMemcpy2D((void**)devSrc, srcStep,(void**)readbuff, nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));

            NppiSize oDstSize = {nDstW, nDstH};     
            NppiRect oDstROI = {0, 0, nDstW, nDstH}; 
            float *devDst;
            CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, nDstW * sizeof(float), nDstH));

NppStatus result = nppiResize_32f_C1R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
            if (result != NPP_SUCCESS) {
                std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
            }

            Npp64s                 writesize;
            Npp32f                 *hostDst;
            writesize = (Npp64s)   nDstW * nDstH;         // Y
            if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
                printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
                exit(1);
            }

            CUDA_CALL(cudaMemcpy2D(hostDst, nDstW * sizeof(Npp32f),(void**)devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
            CUDA_CALL(cudaFree(devSrc));
            CUDA_CALL(cudaFree(devDst));
            return(hostDst);
        }                            // source : Grayscale 1 x 32f, YYYY...
        else if (byteperpixel == 3){ // source : 3 x 32f interleaved RGBRGBRGB...
            size_t  srcStep; 
            size_t  dstStep;
            // rows = height; columns = width

            NppiSize oSrcSize = {nSrcW, nSrcH};
            NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
            float *devSrc;
CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, 3 * nSrcW * sizeof(float), nSrcH));
CUDA_CALL(cudaMemcpy2D((void**)devSrc, srcStep, (void**)readbuff, 3 * nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));

            NppiSize oDstSize = {nDstW, nDstH}; 
            NppiRect oDstROI = {0, 0, nDstW, nDstH};
            float *devDst;
CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, 3 * nDstW * sizeof(float), nDstH));

NppStatus result = nppiResize_32f_C3R((devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
if (result != NPP_SUCCESS) {
                std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
            }

            Npp64s                 writesize;
            Npp32f                 *hostDst;
            writesize = (Npp64s)   nDstW * nDstH * 3;          // RGB
            if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
                printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
                exit(1);
            }

            CUDA_CALL(cudaMemcpy2D((void**)hostDst, nDstW * sizeof(Npp32f), (void**)devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));

            CUDA_CALL(cudaFree(devSrc));
            CUDA_CALL(cudaFree(devDst));
            return(hostDst);
        }        // source - 3 x 32f, interleaved RGBRGBRGB...

        return(0);
    }
1

There are 1 best solutions below

0
Robert Crovella On

You had various errors in your calls to cudaMemcpy2D (both of them, in the 3 channel code). This code seems to work for me:

$ cat t1521.cu
    #include <cuda_runtime.h>
    #include <npp.h>
    #include <nppi.h>
    #include <nppdefs.h>
    #include <iostream>
    #include <stdint.h>
    #include <stdio.h>
    #define CUDA_CALL(call) do { cudaError_t cuda_error = call; if(cuda_error != cudaSuccess) { std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_error) << ", " << __FILE__ << ", line " << __LINE__ << std::endl; return(NULL);} } while(0)
    using namespace std;
    float* decimate_cuda(float* readbuff, uint32_t nSrcH, uint32_t nSrcW, uint32_t nDstH, uint32_t nDstW, uint8_t byteperpixel)
    {
        if (byteperpixel == 1){ // source : Grayscale, 1 x 32f
            size_t  srcStep;
            size_t  dstStep;

            NppiSize oSrcSize = {nSrcW, nSrcH};
            NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
            float *devSrc;
            CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, nSrcW * sizeof(float), nSrcH));
            CUDA_CALL(cudaMemcpy2D(devSrc, srcStep,readbuff, nSrcW * sizeof(Npp32f), nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));

            NppiSize oDstSize = {nDstW, nDstH};
            NppiRect oDstROI = {0, 0, nDstW, nDstH};
            float *devDst;
            CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, nDstW * sizeof(float), nDstH));

            NppStatus result = nppiResize_32f_C1R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
            if (result != NPP_SUCCESS) {
                std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
            }

            Npp64s                 writesize;
            Npp32f                 *hostDst;
            writesize = (Npp64s)   nDstW * nDstH;         // Y
            if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
                printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
                exit(1);
            }

            CUDA_CALL(cudaMemcpy2D(hostDst, nDstW * sizeof(Npp32f),devDst, dstStep, nDstW * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));
            CUDA_CALL(cudaFree(devSrc));
            CUDA_CALL(cudaFree(devDst));
            return(hostDst);
        }                            // source : Grayscale 1 x 32f, YYYY...
        else if (byteperpixel == 3){ // source : 3 x 32f interleaved RGBRGBRGB...
            size_t  srcStep;
            size_t  dstStep;
            // rows = height; columns = width

            NppiSize oSrcSize = {nSrcW, nSrcH};
            NppiRect oSrcROI = {0, 0, nSrcW, nSrcH};
            float *devSrc;
            CUDA_CALL(cudaMallocPitch((void**)&devSrc, &srcStep, 3 * nSrcW * sizeof(float), nSrcH));
            CUDA_CALL(cudaMemcpy2D(devSrc, srcStep,readbuff, 3 * nSrcW * sizeof(Npp32f), 3*nSrcW * sizeof(Npp32f), nSrcH, cudaMemcpyHostToDevice));

            NppiSize oDstSize = {nDstW, nDstH};
            NppiRect oDstROI = {0, 0, nDstW, nDstH};
            float *devDst;
            CUDA_CALL(cudaMallocPitch((void**)&devDst, &dstStep, 3 * nDstW * sizeof(float), nDstH));

            NppStatus result = nppiResize_32f_C3R(devSrc,srcStep,oSrcSize,oSrcROI,devDst,dstStep,oDstSize,oDstROI,NPPI_INTER_SUPER);
            if (result != NPP_SUCCESS) {
                std::cerr << "Unable to run decimate_cuda, error " << result << std::endl;
            }

            Npp64s                 writesize;
            Npp32f                 *hostDst;
            writesize = (Npp64s)   nDstW * nDstH * 3;          // RGB
            if(NULL == (hostDst = (Npp32f *)malloc(writesize * sizeof(Npp32f)))){
                printf("Error : Unable to alloctae hostDst in decimate_cuda, exiting...\n");
                exit(1);
            }

            CUDA_CALL(cudaMemcpy2D(hostDst, nDstW*3 * sizeof(Npp32f), devDst, dstStep, nDstW*3 * sizeof(Npp32f),nDstH, cudaMemcpyDeviceToHost));

            CUDA_CALL(cudaFree(devSrc));
            CUDA_CALL(cudaFree(devDst));
            return(hostDst);
        }        // source - 3 x 32f, interleaved RGBRGBRGB...

        return(0);
    }

int main(){
    uint32_t nSrcH = 480;
    uint32_t nSrcW = 640;
    uint8_t byteperpixel = 3;
    float *readbuff = (float *)malloc(nSrcW*nSrcH*byteperpixel*sizeof(float));
    for (int i = 0; i < nSrcH*nSrcW; i++){
      readbuff [i*3+0] = 1.0f;
      readbuff [i*3+1] = 2.0f;
      readbuff [i*3+2] = 3.0f;}
    uint32_t nDstW = nSrcW/2;
    uint32_t nDstH = nSrcH/2;
    float *res =  decimate_cuda(readbuff, nSrcH, nSrcW, nDstH, nDstW, byteperpixel);
    for (int i = 0; i < nDstH*nDstW*byteperpixel; i++) if (res[i] != ((i%3)+1.0f)) {std::cout << "error at: " << i << std::endl; return 0;}
    return 0;
}
$ nvcc -o t1521 t1521.cu -lnppig
$ cuda-memcheck ./t1521
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

In the future, its convenient if you provide a complete code, just as I have done in my answer. In fact SO requires this, see item 1 here.

By the way, the use of pitched allocations on the device, here, which introduce complexity that you were not able to work your way through, should really be unnecessary both for correctness and performance, using any modern GPU and CUDA version. Ordinary linear/flat allocations, where pitch==width, should be just fine.