1

I am new to CUDA and its image/signal processing library, NPP. I am trying to convert YUV420 to BGR using this function:

NppStatus nppiYUV420ToBGR_8u_P3C3R(const Npp8u * const pSrc[3], int rSrcStep[3], Npp8u * pDst, int nDstStep, NppiSize oSizeROI);

but I can't work out what to pass for rSrcStep. I know it is the row size (in bytes) of each component (Y, U, V), but I am not sure I really understand it. The original image size is 1920x1080 (w x h), and I use an OpenCV Mat to hold the YUV image:

cv::Mat(cv::Size(1920,1080*3/2), CV_8UC1, (void*)data)

Then, for the rSrcStep parameter of the first function, I tried rSrcStep = {1920, 1920/2, 1920/2}, but it returns NPP_STEP_ERROR.

P.S.: for nDstStep, I use the function below to allocate the destination buffer and get the step at the same time:

Npp8u  * 

    nppiMalloc_8u_C3(int nWidthPixels, int nHeightPixels, int * pStepBytes);

The Mat height is 1080*3/2 because a YUV420 image occupies w*h*3/2 bytes when the original RGB image is w*h pixels.

wangz
  • 45
  • 9

1 Answers1

1

Set rSrcStep and nDstStep to the following values:

int rSrcStep[3] = { COLS, COLS / 2, COLS / 2 };
int nDstStep = COLS * 3;

Where COLS = 1920.


In YUV420 planar format (I420), the resolution of Y channel is full resolution, and the resolution of U and V channels is half resolution in each axis.

Example:
Y:
enter image description here

U:
enter image description here

V:
enter image description here

Assuming the data of each plane is contiguous in memory (no row padding), the step (row stride in bytes) of Y equals the image width, and the step of U and of V equals width/2.


Testing:
The testing code uses FFmpeg for building the input image in raw I420 format, and uses FFmpeg for converting the raw BGR output to PNG image.

#include <stdint.h>
#include <stdio.h>
#include "nppi.h"

// Image dimensions in pixels. They match the 192x108 size used by the FFmpeg
// commands quoted in main()'s comments.
#define COLS 192
#define ROWS 108

// Host-side planes of the planar YUV420 (I420) input image, 8 bits per sample.
// Y holds one sample per pixel; U and V each hold a quarter as many samples
// (half the resolution along each axis).
uint8_t Y[COLS * ROWS];       //Y color channel in host memory
uint8_t U[COLS * ROWS / 4];   //U color channel in host memory
uint8_t V[COLS * ROWS / 4];   //V color channel in host memory

// Host-side packed BGR output image: 3 bytes per pixel.
uint8_t BGR[COLS * ROWS * 3];   //BGR output image in host memory

int main()
{
    //Read Y, U, V planes to host memory buffers.
    //Build input sample using FFmpeg first:
    //ffmpeg -y -f lavfi -i testsrc=size=192x108:rate=1:duration=1 -pix_fmt yuvj420p -f rawvideo in.yuv420p
    ////////////////////////////////////////////////////////////////////////////
    FILE* f = fopen("in.yuv420p", "rb");
    fread(Y, 1, COLS * ROWS, f);
    fread(U, 1, COLS * ROWS / 4, f);
    fread(V, 1, COLS * ROWS / 4, f);
    fclose(f);
    ////////////////////////////////////////////////////////////////////////////

    //Allocate device memory, and copy Y,U,V from host to device.
    ////////////////////////////////////////////////////////////////////////////
    Npp8u* gpuY, * gpuU, * gpuV, * gpuBGR;
    cudaMalloc(&gpuY, COLS * ROWS);
    cudaMalloc(&gpuU, COLS * ROWS / 4);
    cudaMalloc(&gpuV, COLS * ROWS / 4);
    cudaMalloc(&gpuBGR, COLS * ROWS * 3);
    cudaMemcpy(gpuY, Y, COLS * ROWS, cudaMemcpyHostToDevice);
    cudaMemcpy(gpuU, U, COLS * ROWS / 4, cudaMemcpyHostToDevice);
    cudaMemcpy(gpuV, V, COLS * ROWS / 4, cudaMemcpyHostToDevice);
    ////////////////////////////////////////////////////////////////////////////


    //Execute nppiYUV420ToBGR_8u_P3C3R
    ////////////////////////////////////////////////////////////////////////////
    const Npp8u* const pSrc[3] = { gpuY, gpuU, gpuV };
    int rSrcStep[3] = { COLS, COLS / 2, COLS / 2 };
    int nDstStep = COLS * 3;
    NppiSize oSizeROI = { COLS, ROWS };

    NppStatus sts = nppiYUV420ToBGR_8u_P3C3R(pSrc,      //const Npp8u* const pSrc[3], 
                                             rSrcStep,  //int rSrcStep[3], 
                                             gpuBGR,    //Npp8u *pDst, 
                                             nDstStep,  //int nDstStep, 
                                             oSizeROI); //NppiSize oSizeROI);

    if (sts != NPP_SUCCESS) 
    {
        printf("Error: nppiResize_8u_C3R status = %d\n", (int)sts);
    }
    ////////////////////////////////////////////////////////////////////////////


    // Copy BGR output from device to host, and save BGR output to binary file
    // After saving, use FFmpeg to convert the output image from binary to PNG:
    // ffmpeg -y -f rawvideo -video_size 192x108 -pixel_format bgr24 -i out.bgr out.png
    ////////////////////////////////////////////////////////////////////////////
    cudaMemcpy(BGR, gpuBGR, COLS * ROWS * 3, cudaMemcpyDeviceToHost);
    f = fopen("out.bgr", "wb");
    fwrite(BGR, 1, COLS * ROWS * 3, f);
    fclose(f);
    ////////////////////////////////////////////////////////////////////////////


    cudaFree(&gpuY);
    cudaFree(&gpuU);
    cudaFree(&gpuV);
    cudaFree(&gpuBGR);

    return 0;
}

Output (out.png):
enter image description here

Rotem
  • 30,366
  • 4
  • 32
  • 65
  • hi ,thanks ,but I use the code above , with dstStep as cols*3 , the nppi function still returns -14 , NPP_STEP_ERROR , the only difference is I read data from opencv::Mat ,not from a local file ,and the col and rows is 1920 1080 not 192x108 – wangz Sep 02 '22 at 01:06
  • Did you test my code sample as is? Did you test my code sample with size 1920x1080? The steps are correct... In case you are using `nppiMalloc_8u_C3`, use it as follows: `int pStepBytes[1] = { COLS * 3 }; gpuBGR = nppiMalloc_8u_C3(COLS, ROWS, pStepBytes);`. In case you still need help, please post a reproducible code sample (a sample that we can build and execute). – Rotem Sep 02 '22 at 08:12