Set rSrcStep and nDstStep to the following values:
int rSrcStep[3] = { COLS, COLS / 2, COLS / 2 };
int nDstStep = COLS * 3;
where COLS is the image width in pixels (for example, COLS = 1920; the test code below uses a smaller 192x108 image).
In YUV420 planar format (I420), the Y channel is stored at full resolution, while the U and V channels are stored at half resolution along each axis.
Example:
Y:

U:

V:

Assuming the data is contiguous in memory, the step (row stride in bytes) of Y equals the image width, and the step of U and V equals width/2.
Testing:
The test code uses FFmpeg to build the input image in raw I420 format, and to convert the raw BGR output to a PNG image.
#include <stdint.h>
#include <stdio.h>
#include "nppi.h"
// Frame width (COLS) and height (ROWS) in pixels. These must match the 192x108
// size passed to the FFmpeg commands quoted in main().
#define COLS 192
#define ROWS 108
// Host-side staging buffers, one byte per sample. The U and V planes each hold
// a quarter as many samples as the Y plane; the BGR buffer holds three bytes
// per pixel.
uint8_t Y[COLS * ROWS]; //Y color channel in host memory
uint8_t U[COLS * ROWS / 4]; //U color channel in host memory
uint8_t V[COLS * ROWS / 4]; //V color channel in host memory
uint8_t BGR[COLS * ROWS * 3]; //BGR output image in host memory
int main()
{
//Read Y, U, V planes to host memory buffers.
//Build input sample using FFmpeg first:
//ffmpeg -y -f lavfi -i testsrc=size=192x108:rate=1:duration=1 -pix_fmt yuvj420p -f rawvideo in.yuv420p
////////////////////////////////////////////////////////////////////////////
FILE* f = fopen("in.yuv420p", "rb");
fread(Y, 1, COLS * ROWS, f);
fread(U, 1, COLS * ROWS / 4, f);
fread(V, 1, COLS * ROWS / 4, f);
fclose(f);
////////////////////////////////////////////////////////////////////////////
//Allocate device memory, and copy Y,U,V from host to device.
////////////////////////////////////////////////////////////////////////////
Npp8u* gpuY, * gpuU, * gpuV, * gpuBGR;
cudaMalloc(&gpuY, COLS * ROWS);
cudaMalloc(&gpuU, COLS * ROWS / 4);
cudaMalloc(&gpuV, COLS * ROWS / 4);
cudaMalloc(&gpuBGR, COLS * ROWS * 3);
cudaMemcpy(gpuY, Y, COLS * ROWS, cudaMemcpyHostToDevice);
cudaMemcpy(gpuU, U, COLS * ROWS / 4, cudaMemcpyHostToDevice);
cudaMemcpy(gpuV, V, COLS * ROWS / 4, cudaMemcpyHostToDevice);
////////////////////////////////////////////////////////////////////////////
//Execute nppiYUV420ToBGR_8u_P3C3R
////////////////////////////////////////////////////////////////////////////
const Npp8u* const pSrc[3] = { gpuY, gpuU, gpuV };
int rSrcStep[3] = { COLS, COLS / 2, COLS / 2 };
int nDstStep = COLS * 3;
NppiSize oSizeROI = { COLS, ROWS };
NppStatus sts = nppiYUV420ToBGR_8u_P3C3R(pSrc, //const Npp8u* const pSrc[3],
rSrcStep, //int rSrcStep[3],
gpuBGR, //Npp8u *pDst,
nDstStep, //int nDstStep,
oSizeROI); //NppiSize oSizeROI);
if (sts != NPP_SUCCESS)
{
printf("Error: nppiResize_8u_C3R status = %d\n", (int)sts);
}
////////////////////////////////////////////////////////////////////////////
// Copy BGR output from device to host, and save BGR output to binary file
// After saving, use FFmpeg to convert the output image from binary to PNG:
// ffmpeg -y -f rawvideo -video_size 192x108 -pixel_format bgr24 -i out.bgr out.png
////////////////////////////////////////////////////////////////////////////
cudaMemcpy(BGR, gpuBGR, COLS * ROWS * 3, cudaMemcpyDeviceToHost);
f = fopen("out.bgr", "wb");
fwrite(BGR, 1, COLS * ROWS * 3, f);
fclose(f);
////////////////////////////////////////////////////////////////////////////
cudaFree(&gpuY);
cudaFree(&gpuU);
cudaFree(&gpuV);
cudaFree(&gpuBGR);
return 0;
}
Output (out.png):
