
I'm working on a SIMD optimization of BGR-to-grayscale conversion, equivalent to OpenCV's cvtColor() function. There is an Intel SSE version of this function that I'm referring to. (What I'm doing is basically converting the SSE code to NEON code.)

I've almost finished writing the code, and it compiles with g++, but I can't get the proper output. Does anyone have any idea what the error could be?

What I'm getting (incorrect):

[image: my output]

What I should be getting:

[image: properly converted output]

Here's my code:

#include <opencv/cv.hpp>
#include <opencv/highgui.h>
#include <arm_neon.h>
//#include <iostream>

using namespace std;
//using namespace cv;

#define int8x16_to_8x8x2(v) ((int8x8x2_t) { vget_low_s8(v), vget_high_s8(v) })
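// (the macro above reinterprets a q-register as the two d-register halves that vtbl2_s8 expects)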

void cvtBGR2GrayNEON(cv::Mat& src, cv::Mat& dest)
{
  const int size = src.size().area()*src.channels();
  uchar* s = src.ptr<uchar>(0);
  uchar* d = dest.ptr<uchar>(0);

  const int8x16_t mask1 = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
  const int8x16_t smask1 = {6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15};
  const int8x16_t ssmask1 = {11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10};

  const int8x16_t mask2 = {0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13};
  const int8x16_t ssmask2 = {0,1,2,3,4,11,12,13,14,15,5,6,7,8,9,10};

  const int8x16_t bmask1 = {255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0};
  const int8x16_t bmask2 = {255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0};
  const int8x16_t bmask3 = {255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0};
  const int8x16_t bmask4 = {255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0};

  const int shift = 8;
  const int amp = 1<<shift;

  const int16_t _R_ = (int16_t)(amp*0.299);
  const int16_t _G_ = (int16_t)(amp*0.587);
  const int16_t _B_ = (int16_t)(amp*0.114);
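  // e.g. with amp = 256 these truncate to _R_ = 76, _G_ = 150, _B_ = 29 (sum 255)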
  const int16x8_t R = vdupq_n_s16(_R_);
  const int16x8_t G = vdupq_n_s16(_G_);
  const int16x8_t B = vdupq_n_s16(_B_);
  const int8x16_t zero = vdupq_n_s8(0);

  for(int i = 0; i < size; i += 48)
    {
      int8x16_t a = vld1q_s8((int8_t *) s + i);
      int8x16_t b = vld1q_s8((int8_t *) s + i + 16);
      int8x16_t c = vld1q_s8((int8_t *) s + i + 32);

      a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a), vget_low_s8(mask1)), vtbl2_s8(int8x16_to_8x8x2(a), vget_high_s8(mask1)));
      b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(mask2)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(mask2)));
      c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(mask2)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(mask2)));

      //BBBBBB
      const int8x16_t aaaa = vbslq_s8(c, vbslq_s8(b, a, bmask1), bmask2);

      a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(a), vget_high_s8(smask1)));
      b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(smask1)));
      c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(smask1)));

      //GGGGGG
      const int8x16_t bbbb = vbslq_s8(c, vbslq_s8(b, a, bmask3), bmask2);

      a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a), vget_low_s8(ssmask1)), vtbl2_s8(int8x16_to_8x8x2(a), vget_high_s8(ssmask1)));
      c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(ssmask1)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(ssmask1)));
      b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(ssmask2)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(ssmask2)));

      //RRRRRR
      const int8x16_t cccc = vbslq_s8(c, vbslq_s8(b, a, bmask3), bmask4);

      /*
      int8x8x2_t a1 = vzip_s8(vget_high_s8(aaaa), vget_high_s8(zero));
      int8x8x2_t a2 = vzip_s8(vget_low_s8(aaaa), vget_low_s8(zero));
      */

      int8x16_t a1 = aaaa;
      int8x16_t a2 = zero;
      int8x16x2_t temp1 =  vzipq_s8(a1, a2);
      a1 = temp1.val[0];
      a2 = temp1.val[1];
      int16x8_t aa1 = vmulq_s16((int16x8_t)a2, B);
      int16x8_t aa2 = vmulq_s16((int16x8_t)a1, B);

      int8x16_t b1 = bbbb;
      int8x16_t b2 = zero;
      int8x16x2_t temp2 =  vzipq_s8(b1, b2);
      b1 = temp2.val[0];
      b2 = temp2.val[1];
      int16x8_t bb1 = vmulq_s16((int16x8_t)b2, G);
      int16x8_t bb2 = vmulq_s16((int16x8_t)b1, G);

      int8x16_t c1 = cccc;
      int8x16_t c2 = zero;
      int8x16x2_t temp3 =  vzipq_s8(c1, c2);
      c1 = temp3.val[0];
      c2 = temp3.val[1];
      int16x8_t cc1 = vmulq_s16((int16x8_t)c2, R);
      int16x8_t cc2 = vmulq_s16((int16x8_t)c1, R);

      aa1 = vaddq_s16(aa1, bb1);
      aa1 = vaddq_s16(aa1, cc1);
      aa2 = vaddq_s16(aa2, bb2);
      aa2 = vaddq_s16(aa2, cc2);

      const int shift1 = 8;
      aa1 = vshrq_n_s16(aa1, shift1);
      aa2 = vshrq_n_s16(aa2, shift1);

      uint8x8_t aaa1 = vqmovun_s16(aa1);
      uint8x8_t aaa2 = vqmovun_s16(aa2);

      uint8x16_t result = vcombine_u8(aaa1, aaa2);

      vst1q_u8((uint8_t *)(d), result);

      d+=16;
    }    
}

int main() 
{
  cv::Mat src = cv::imread("Lenna.bmp");
  cv::Mat dest(src.rows, src.cols, CV_8UC1);

  cvtBGR2GrayNEON(src, dest);

  cv::imwrite("grey.jpg", dest);

  return 0;
}

Here is the equivalent SSE code (from here):

void cvtBGR2GraySSEShort(Mat& src, Mat& dest)
{
    const int size = src.size().area()*src.channels();
    uchar* s = src.ptr<uchar>(0);
    uchar* d = dest.ptr<uchar>(0);

    //data structure
    //BGR BGR BGR BGR BGR B
    //GR BGR BGR BGR BGR BG
    //R BGR BGR BGR BGR BGR
    //shuffle to BBBBBBGGGGGRRRRR
    const __m128i mask1 = _mm_setr_epi8(0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14);
    const __m128i smask1 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15);
    const __m128i ssmask1 = _mm_setr_epi8(11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10);

    //shuffle to GGGGGGBBBBBRRRRR
    const __m128i mask2 = _mm_setr_epi8(0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13);
    //const __m128i smask2 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15);same as smask1
    const __m128i ssmask2 = _mm_setr_epi8(0,1,2,3,4,11,12,13,14,15,5,6,7,8,9,10);

    //shuffle to RRRRRRGGGGGBBBBB
    //__m128i mask3 = _mm_setr_epi8(0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13);//same as mask2
    //const __m128i smask3 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,6,7,8,9,10);//same as smask1
    //const __m128i ssmask3 = _mm_setr_epi8(11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10);//same as ssmask1

    //blend mask
    const __m128i bmask1 = _mm_setr_epi8
        (255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0);

    const __m128i bmask2 = _mm_setr_epi8
        (255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0);

    const __m128i bmask3 = _mm_setr_epi8
        (255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0);

    const __m128i bmask4 = _mm_setr_epi8
        (255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0);  

    const int shift = 8;
    const int amp = 1<<shift;
    const int _R_=(int)(amp*0.299);
    const int _G_=(int)(amp*0.587);
    const int _B_=(int)(amp*0.114);
    const __m128i R = _mm_set1_epi16(_R_);
    const __m128i G = _mm_set1_epi16(_G_);
    const __m128i B = _mm_set1_epi16(_B_);
    const __m128i zero = _mm_setzero_si128();   

    for(int i=0;i<size;i+=48)
    {
        __m128i a = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i)),mask1);
        __m128i b = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i+16)),mask2);
        __m128i c = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i+32)),mask2);
        const __m128i aaaa = _mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask1),bmask2);

        a = _mm_shuffle_epi8(a,smask1);
        b = _mm_shuffle_epi8(b,smask1);
        c = _mm_shuffle_epi8(c,smask1);
        const __m128i bbbb =_mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask3),bmask2);

        a = _mm_shuffle_epi8(a,ssmask1);
        c = _mm_shuffle_epi8(c,ssmask1);
        b = _mm_shuffle_epi8(b,ssmask2);
        const __m128i cccc =_mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask3),bmask4);

        __m128i a1 = _mm_unpackhi_epi8(aaaa,zero);
        __m128i a2 = _mm_unpacklo_epi8(aaaa,zero);
        a1 = _mm_mullo_epi16(a1,B);
        a2 = _mm_mullo_epi16(a2,B);
        __m128i b1 = _mm_unpackhi_epi8(bbbb,zero);
        __m128i b2 = _mm_unpacklo_epi8(bbbb,zero);
        b1 = _mm_mullo_epi16(b1,G);
        b2 = _mm_mullo_epi16(b2,G);

        __m128i c1 = _mm_unpackhi_epi8(cccc,zero);
        __m128i c2 = _mm_unpacklo_epi8(cccc,zero);
        c1 = _mm_mullo_epi16(c1,R);
        c2 = _mm_mullo_epi16(c2,R);

        a1 = _mm_add_epi16(a1,b1);
        a1 = _mm_add_epi16(a1,c1);
        a2 = _mm_add_epi16(a2,b2);
        a2 = _mm_add_epi16(a2,c2);

        a1 = _mm_srli_epi16(a1,8);
        a2 = _mm_srli_epi16(a2,8);

        a = _mm_packus_epi16(a1,a2);

        _mm_stream_si128((__m128i*)(d),a);
        d+=16;
    } 
}
  • Can I ask why you don't use the ARM NEON intrinsics that map to the `VLD3` instruction? That spares you all of the shuffling, both simplifying and speeding up the code. The Intel SSE implementation requires shuffles because it lacks 2/3/4-way deinterleaving load instructions, but you shouldn't pass on them when they are available. – Iwillnotexist Idonotexist Jul 27 '14 at 02:24
  • Thanks for telling me that. I didn't come up with that solution because I don't fully understand NEON instructions yet. `VLD3` would reduce my code size and speed up the code. I'll try it. – S.Sato Jul 27 '14 at 02:39
  • If you want to know more, the place to look is the _ARM® Architecture Reference Manual, ARM®v7-A and ARM®v7-R edition_, Section A8.6 _Alphabetical list of instructions_. Have a look at all the instructions whose names start with V. – Iwillnotexist Idonotexist Jul 27 '14 at 02:43
  • @S.Sato - I'm in the same position you are in... We have SSE code and we are moving it to NEON. The comment piqued my interest too, because we carried the loads and shuffles into NEON. This should help: [Coding for NEON - Part 1: Load and Stores](https://community.arm.com/groups/processors/blog/2010/03/17/coding-for-neon--part-1-load-and-stores). Also see [NEON, SSE and interleaving loads vs shuffles](http://stackoverflow.com/q/37106500) on Stack Overflow. – jww May 09 '16 at 01:10
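For illustration, here is a minimal sketch of the `VLD3`-based approach the comments above suggest. It is not the asker's code: the function name, the assumption that the pixel count is a multiple of 8, and the rounded weights (29 + 150 + 77 = 256) are additions here. vld3_u8 de-interleaves 8 BGR pixels into three separate registers, so none of the shuffle or blend masks are needed:

#include <arm_neon.h>
#include <stdint.h>

// hypothetical helper: numPixels is assumed to be a multiple of 8
void cvtBGR2GrayNEON_vld3(const uint8_t* src, uint8_t* dst, int numPixels)
{
  // fixed-point weights on the same 8-bit scale as the question (amp = 256)
  const uint8x8_t wB = vdup_n_u8(29);   // round(0.114 * 256)
  const uint8x8_t wG = vdup_n_u8(150);  // round(0.587 * 256)
  const uint8x8_t wR = vdup_n_u8(77);   // round(0.299 * 256)

  for (int i = 0; i < numPixels; i += 8)
  {
    // de-interleaving load: val[0] = B, val[1] = G, val[2] = R of 8 pixels
    uint8x8x3_t bgr = vld3_u8(src + 3 * i);

    // widening multiply-accumulate into 16-bit lanes
    uint16x8_t y = vmull_u8(bgr.val[0], wB);
    y = vmlal_u8(y, bgr.val[1], wG);
    y = vmlal_u8(y, bgr.val[2], wR);

    // round, shift right by 8, and narrow back to 8 bits
    vst1_u8(dst + i, vrshrn_n_u16(y, 8));
  }
}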

1 Answer

Ok, below is a FULLY OPTIMIZED version of that function I just wrote. (Beware that this function simply returns if size is smaller than 32.)

/*
 *  Created on: 2014. 7. 27.
 *      Author: Jake Lee
 *      Project FANIC - Fastest ARM NEON Implementation Challenge
 */

// void fanicCvtBGR2GrayNEON(void *pDst, void *pSrc, unsigned int size);
// Y = 0.114*B + 0.587*G + 0.299*R
    .text
    .arm
    .global fanicCvtBGR2GrayNEON

    pDst    .req    r0
    pSrc    .req    r1
    size    .req    r2

    .align 5
    .func
fanicCvtBGR2GrayNEON:
    pld     [pSrc]
    subs    size, size, #32
    pld     [pSrc, #64]
    bxmi    lr
    pld     [pSrc, #64*2]
    vmov.i8     d0, #29
    vmov.i8     d1, #150
    vmov.i8     d2, #77
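    @ d0..d2 = 8-bit grayscale weights, rounded: B 0.114*256 -> 29, G 0.587*256 -> 150, R 0.299*256 -> 77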

    .align 5
1:
    vld3.8      {d20, d21, d22}, [pSrc]!
    vld3.8      {d23, d24, d25}, [pSrc]!
    vld3.8      {d26, d27, d28}, [pSrc]!
    vld3.8      {d29, d30, d31}, [pSrc]!

    vmull.u8    q8, d20, d0
    vmlal.u8    q8, d21, d1
    vmlal.u8    q8, d22, d2
    vmull.u8    q9, d23, d0
    vmlal.u8    q9, d24, d1
    vmlal.u8    q9, d25, d2
    vmull.u8    q10, d26, d0
    vmlal.u8    q10, d27, d1
    vmlal.u8    q10, d28, d2
    vmull.u8    q11, d29, d0
    vmlal.u8    q11, d30, d1
    vmlal.u8    q11, d31, d2

    vrshrn.u16  d24, q8, #8
    vrshrn.u16  d25, q9, #8
    vrshrn.u16  d26, q10, #8
    vrshrn.u16  d27, q11, #8

    subs    size, size, #32
    pld     [pSrc, #64*3]
    pld     [pSrc, #64*4]

    vst1.8      {q12, q13}, [pDst]!
    bpl     1b

    cmp     size, #-32
    add     pSrc, pSrc, size
    bxle    lr
    add     pSrc, pSrc, size, lsl #1
    add     pDst, pDst, size
    b       1b

    .endfunc
    .end

As you can see, it's much easier and shorter to write NEON code in assembly than with intrinsics, despite the heavy unrolling.

Have fun.
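
For anyone wanting to drop this into the question's test program, here is a hedged sketch of the call site. The extern "C" prototype is the one given in the comment at the top of the listing; the sketch assumes src holds continuous BGR data and that size is the pixel count:

#include <opencv/cv.hpp>
#include <opencv/highgui.h>

extern "C" void fanicCvtBGR2GrayNEON(void *pDst, void *pSrc, unsigned int size);

int main()
{
  cv::Mat src = cv::imread("Lenna.bmp");      // 3-channel BGR input
  cv::Mat dest(src.rows, src.cols, CV_8UC1);

  // size is the pixel count; the routine returns immediately if it is < 32
  fanicCvtBGR2GrayNEON(dest.data, src.data, (unsigned int)src.total());

  cv::imwrite("grey.jpg", dest);
  return 0;
}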

  • This takes the cake. It uses exactly what I'd have used – VLD3/VST1 with post-increment and fused multiply-accumulates. About the only way to improve on this would be potential instruction scheduling tricks. – Iwillnotexist Idonotexist Jul 27 '14 at 07:15
  • @IwillnotexistIdonotexist: The only way of removing the potential interlock at the end would be unrolling the loop further, to 64 pixels per iteration. However, the number of instructions within the loop would then exceed 32, so the loop wouldn't run from the core's loop buffer anymore, resulting in potentially lower performance. – Jake 'Alquimista' LEE Jul 27 '14 at 09:14
  • The only thing I would change is the cache preloading. Using PLD at the current read address won't get you anything. Depending on the speed of the CPU, it should be trying to preload at least 128 bytes ahead of where it's currently reading. – BitBank Jul 28 '14 at 06:36
  • @Jake'Alquimista'LEE - Man, you provide some awesome NEON code... Where did you pick up the skills? – jww May 09 '16 at 02:13
  • Did you happen to benchmark this against the built-in cvtColor? I'd be curious to know what kind of speedup one can expect. – nbubis Aug 11 '21 at 07:23
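
To the last comment: a rough timing harness one could use for that comparison (a sketch, not from the thread; the iteration count and the use of OpenCV's tick counter are choices made here):

#include <opencv2/opencv.hpp>
#include <cstdio>

extern "C" void fanicCvtBGR2GrayNEON(void *pDst, void *pSrc, unsigned int size);

int main()
{
  cv::Mat src = cv::imread("Lenna.bmp");
  cv::Mat ref, out(src.rows, src.cols, CV_8UC1);
  const int iters = 100;

  double t0 = (double)cv::getTickCount();
  for (int i = 0; i < iters; ++i)
    cv::cvtColor(src, ref, cv::COLOR_BGR2GRAY);
  double t1 = (double)cv::getTickCount();
  for (int i = 0; i < iters; ++i)
    fanicCvtBGR2GrayNEON(out.data, src.data, (unsigned int)src.total());
  double t2 = (double)cv::getTickCount();

  // convert tick deltas to milliseconds per iteration
  double msPerTick = 1000.0 / cv::getTickFrequency();
  std::printf("cvtColor: %.3f ms/iter, NEON asm: %.3f ms/iter\n",
              (t1 - t0) * msPerTick / iters, (t2 - t1) * msPerTick / iters);
  return 0;
}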