I've just tried to optimize an RGB to YUV420 converter. Using a lookup table yielded a speed increase, as did using fixed-point arithmetic. However, I was expecting the real gains to come from using SSE instructions. My first attempt resulted in slower code, and after chaining all the operations it runs at approximately the same speed as the original code. Is there something wrong with my implementation, or are SSE instructions just not suited to the task at hand?
A section of the original code follows:
/// RGB -> YUV conversion matrix coefficients (double-precision literals).
/// Naming: RRGB24YUVCI2_<row><col>, where the column selects the r, g, b
/// input (0, 1, 2) that the coefficient multiplies.
/// Row 0: weights applied to r, g, b to form the luminance (Y) sample.
#define RRGB24YUVCI2_00 0.299
#define RRGB24YUVCI2_01 0.587
#define RRGB24YUVCI2_02 0.114
/// Row 1: weights applied to r, g, b to form the U chrominance contribution.
#define RRGB24YUVCI2_10 -0.147
#define RRGB24YUVCI2_11 -0.289
#define RRGB24YUVCI2_12 0.436
/// Row 2: weights applied to r, g, b to form the V chrominance contribution.
#define RRGB24YUVCI2_20 0.615
#define RRGB24YUVCI2_21 -0.515
#define RRGB24YUVCI2_22 -0.100
void RealRGB24toYUV420Converter::Convert(void* pRgb, void* pY, void* pU, void* pV)
{
yuvType* py = (yuvType *)pY;
yuvType* pu = (yuvType *)pU;
yuvType* pv = (yuvType *)pV;
unsigned char* src = (unsigned char *)pRgb;
/// Y have range 0..255, U & V have range -128..127.
double u,v;
double r,g,b;
/// Step in 2x2 pel blocks. (4 pels per block).
int xBlks = _width >> 1;
int yBlks = _height >> 1;
for(int yb = 0; yb < yBlks; yb++)
for(int xb = 0; xb < xBlks; xb++)
{
int chrOff = yb*xBlks + xb;
int lumOff = (yb*_width + xb) << 1;
unsigned char* t = src + lumOff*3;
/// Top left pel.
b = (double)(*t++);
g = (double)(*t++);
r = (double)(*t++);
py[lumOff] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((int)(0.5 + RRGB24YUVCI2_00*r + RRGB24YUVCI2_01*g + RRGB24YUVCI2_02*b));
u = RRGB24YUVCI2_10*r + RRGB24YUVCI2_11*g + RRGB24YUVCI2_12*b;
v = RRGB24YUVCI2_20*r + RRGB24YUVCI2_21*g + RRGB24YUVCI2_22*b;
/// Top right pel.
b = (double)(*t++);
g = (double)(*t++);
r = (double)(*t++);
py[lumOff+1] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((int)(0.5 + RRGB24YUVCI2_00*r + RRGB24YUVCI2_01*g + RRGB24YUVCI2_02*b));
u += RRGB24YUVCI2_10*r + RRGB24YUVCI2_11*g + RRGB24YUVCI2_12*b;
v += RRGB24YUVCI2_20*r + RRGB24YUVCI2_21*g + RRGB24YUVCI2_22*b;
lumOff += _width;
t = t + _width*3 - 6;
/// Bottom left pel.
b = (double)(*t++);
g = (double)(*t++);
r = (double)(*t++);
py[lumOff] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((int)(0.5 + RRGB24YUVCI2_00*r + RRGB24YUVCI2_01*g + RRGB24YUVCI2_02*b));
u += RRGB24YUVCI2_10*r + RRGB24YUVCI2_11*g + RRGB24YUVCI2_12*b;
v += RRGB24YUVCI2_20*r + RRGB24YUVCI2_21*g + RRGB24YUVCI2_22*b;
/// Bottom right pel.
b = (double)(*t++);
g = (double)(*t++);
r = (double)(*t++);
py[lumOff+1] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((int)(0.5 + RRGB24YUVCI2_00*r + RRGB24YUVCI2_01*g + RRGB24YUVCI2_02*b));
u += RRGB24YUVCI2_10*r + RRGB24YUVCI2_11*g + RRGB24YUVCI2_12*b;
v += RRGB24YUVCI2_20*r + RRGB24YUVCI2_21*g + RRGB24YUVCI2_22*b;
/// Average the 4 chr values.
int iu = (int)u;
int iv = (int)v;
if(iu < 0) ///< Rounding.
iu -= 2;
else
iu += 2;
if(iv < 0) ///< Rounding.
iv -= 2;
else
iv += 2;
pu[chrOff] = (yuvType)( _chrOff + RRGB24YUVCI2_RANGECHECK_N128TO127(iu/4) );
pv[chrOff] = (yuvType)( _chrOff + RRGB24YUVCI2_RANGECHECK_N128TO127(iv/4) );
}//end for xb & yb...
}//end Convert.
And here is the version using SSE:
/// Single-precision RGB -> YUV conversion matrix coefficients.
/// Naming: fRRGB24YUVCI2_<row><col>; row 0 holds the Y weights, row 1 the U
/// weights and row 2 the V weights, each ordered r, g, b.
/// The literals carry an 'f' suffix so they are float constants rather than
/// double literals implicitly narrowed to float.
const float fRRGB24YUVCI2_00 = 0.299f;
const float fRRGB24YUVCI2_01 = 0.587f;
const float fRRGB24YUVCI2_02 = 0.114f;
const float fRRGB24YUVCI2_10 = -0.147f;
const float fRRGB24YUVCI2_11 = -0.289f;
const float fRRGB24YUVCI2_12 = 0.436f;
const float fRRGB24YUVCI2_20 = 0.615f;
const float fRRGB24YUVCI2_21 = -0.515f;
const float fRRGB24YUVCI2_22 = -0.100f;
/// Adds lanes 0, 1 and 2 of an SSE register; lane 3 is ignored.
/// Goes through _mm_storeu_ps so it does not rely on the compiler-specific
/// m128_f32 member of __m128.
static inline float RRGB24YUVCI2_SumRgbLanes(__m128 v)
{
  float lanes[4];
  _mm_storeu_ps(lanes, v);
  return lanes[0] + lanes[1] + lanes[2];
}

/// SSE variant: converts a packed 24-bit image (bytes read in b, g, r order
/// per pel) into separate Y, U and V planes with 2x2 chrominance subsampling.
/// @param pRgb Source pels, 3 bytes per pel, _width pels per row.
/// @param pY   Destination luminance plane, one sample per pel.
/// @param pU   Destination U plane, one sample per 2x2 pel block.
/// @param pV   Destination V plane, one sample per 2x2 pel block.
void RealRGB24toYUV420Converter::Convert(void* pRgb, void* pY, void* pU, void* pV)
{
  /// One coefficient register per output component.
  /// NOTE(review): fCOEFF_0..2 are not defined in this section; presumably
  /// each holds one matrix row as { r, g, b, unused } - confirm.
  __m128 xmm_y = _mm_loadu_ps(fCOEFF_0);
  __m128 xmm_u = _mm_loadu_ps(fCOEFF_1);
  __m128 xmm_v = _mm_loadu_ps(fCOEFF_2);

  yuvType* py = (yuvType *)pY;
  yuvType* pu = (yuvType *)pU;
  yuvType* pv = (yuvType *)pV;
  unsigned char* src = (unsigned char *)pRgb;

  /// Staging buffers for the 4 pels of a block, stored as { r, g, b, 0 }.
  /// The padding lane is written once here and never touched again.
  float bgr1[4];
  bgr1[3] = 0.0f;
  float bgr2[4];
  bgr2[3] = 0.0f;
  float bgr3[4];
  bgr3[3] = 0.0f;
  float bgr4[4];
  bgr4[3] = 0.0f;

  /// Step in 2x2 pel blocks. (4 pels per block).
  int xBlks = _width >> 1;
  int yBlks = _height >> 1;
  for(int yb = 0; yb < yBlks; yb++)
    for(int xb = 0; xb < xBlks; xb++)
    {
      int chrOff = yb*xBlks + xb;
      int lumOff = (yb*_width + xb) << 1;
      unsigned char* t = src + lumOff*3;

      /// Top left and top right pels. Source bytes arrive as b, g, r and
      /// are stored reversed so lane 0 holds r.
      bgr1[2] = (float)*t++;
      bgr1[1] = (float)*t++;
      bgr1[0] = (float)*t++;
      bgr2[2] = (float)*t++;
      bgr2[1] = (float)*t++;
      bgr2[0] = (float)*t++;

      /// Move to the start of the same block on the next image row.
      t = t + _width*3 - 6;

      /// Bottom left and bottom right pels.
      bgr3[2] = (float)*t++;
      bgr3[1] = (float)*t++;
      bgr3[0] = (float)*t++;
      bgr4[2] = (float)*t++;
      bgr4[1] = (float)*t++;
      bgr4[0] = (float)*t++;

      __m128 xmm1 = _mm_loadu_ps(bgr1);
      __m128 xmm2 = _mm_loadu_ps(bgr2);
      __m128 xmm3 = _mm_loadu_ps(bgr3);
      __m128 xmm4 = _mm_loadu_ps(bgr4);

      /// Y: weighted sum per pel, rounded to nearest (+0.5 then truncate)
      /// exactly as the scalar converter does, then limited to 0..255.
      /// Each value is computed into a local first so the range-check macro
      /// receives a plain int operand.
      int y1 = (int)(0.5f + RRGB24YUVCI2_SumRgbLanes(_mm_mul_ps(xmm1, xmm_y)));
      int y2 = (int)(0.5f + RRGB24YUVCI2_SumRgbLanes(_mm_mul_ps(xmm2, xmm_y)));
      int y3 = (int)(0.5f + RRGB24YUVCI2_SumRgbLanes(_mm_mul_ps(xmm3, xmm_y)));
      int y4 = (int)(0.5f + RRGB24YUVCI2_SumRgbLanes(_mm_mul_ps(xmm4, xmm_y)));
      py[lumOff] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255(y1);
      py[lumOff + 1] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255(y2);
      lumOff += _width;
      py[lumOff] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255(y3);
      py[lumOff + 1] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255(y4);

      /// U: per-lane products of all 4 pels are added first, then the
      /// three colour lanes are summed once.
      __m128 xmm_res = _mm_add_ps(
        _mm_add_ps(_mm_mul_ps(xmm1, xmm_u), _mm_mul_ps(xmm2, xmm_u)),
        _mm_add_ps(_mm_mul_ps(xmm3, xmm_u), _mm_mul_ps(xmm4, xmm_u))
      );
      float fU = RRGB24YUVCI2_SumRgbLanes(xmm_res);

      /// V: same scheme with the V coefficient row.
      xmm_res = _mm_add_ps(
        _mm_add_ps(_mm_mul_ps(xmm1, xmm_v), _mm_mul_ps(xmm2, xmm_v)),
        _mm_add_ps(_mm_mul_ps(xmm3, xmm_v), _mm_mul_ps(xmm4, xmm_v))
      );
      float fV = RRGB24YUVCI2_SumRgbLanes(xmm_res);

      /// Average the 4 chr values.
      int iu = (int)fU;
      int iv = (int)fV;
      if(iu < 0) ///< Rounding.
        iu -= 2;
      else
        iu += 2;
      if(iv < 0) ///< Rounding.
        iv -= 2;
      else
        iv += 2;

      /// Divide rather than shift: the +/-2 offset above assumes division
      /// that truncates toward zero. An arithmetic right shift rounds
      /// negative values toward minus infinity and would bias them.
      pu[chrOff] = (yuvType)( _chrOff + RRGB24YUVCI2_RANGECHECK_N128TO127(iu / 4) );
      pv[chrOff] = (yuvType)( _chrOff + RRGB24YUVCI2_RANGECHECK_N128TO127(iv / 4) );
    }//end for xb & yb...
}
This is one of my first attempts at SSE2, so perhaps I'm missing something. FYI, I am working on the Windows platform using Visual Studio 2008.