
I am writing a program that should be perfect to parallelize with multimedia extensions. The program transforms an image: I walk over a matrix and modify each pixel in it. To walk over it faster, I use multimedia extensions.
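
For reference, the scalar per-pixel conversion being vectorized is, as far as the comments in the code below show, the following sketch (R, G, B are one pixel's channels):

        float Y = floorf(0.299f * R + 0.587f * G + 0.114f * B);
        float U = 0.492f * (B - Y) + 128.0f;
        float V = 0.877f * (R - Y) + 128.0f;  // V is then clamped to [0, 255]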

At first I used the SSE3 extensions and achieved a 2.5× speedup. Next, I extended the SSE algorithm to use the AVX extensions (double-width vectors), but I get no gain over SSE3; the program takes more or less the same time to run with AVX as with SSE. Here is a summary of the code, for SSE and AVX respectively:

        for(i=0; i<lim; i+=12) { // lim is n*n, or n*n-12, depending on whether n*n is a multiple of 12. The stride is 12 because 3 components * 4 pixels (vector size 4)
            vectorR = _mm_set_ps(matrix[i+9], matrix[i+6], matrix[i+3], matrix[i]);
            vectorG = _mm_set_ps(matrix[i+10], matrix[i+7], matrix[i+4], matrix[i+1]);
            vectorB = _mm_set_ps(matrix[i+11], matrix[i+8], matrix[i+5], matrix[i+2]);

            calcular_coeficientes_sse3(Ycoef0, Ycoef1, Ycoef2, Ucoef0, Vcoef0, vectorR, vectorG, vectorB,
                                    values0, values255, values128, &vectorY, &vectorU, &vectorV);

            _mm_store_ps(&y_aux[0], vectorY);
            _mm_store_ps(&u_aux[0], vectorU);
            _mm_store_ps(&v_aux[0], vectorV);

            // Put the data back into the matrix
            //PIXEL 1
            matrix[i] = y_aux[0];
            matrix[i+1]=u_aux[0];
            matrix[i+2]=v_aux[0];
            //PIXEL 2
            matrix[i+3]=y_aux[1];
            matrix[i+4]=u_aux[1];
            matrix[i+5]=v_aux[1];
            //PIXEL 3
            matrix[i+6] = y_aux[2];
            matrix[i+7]=u_aux[2];
            matrix[i+8]=v_aux[2];
            //PIXEL 4
            matrix[i+9]=y_aux[3];
            matrix[i+10]=u_aux[3];
            matrix[i+11]=v_aux[3];
    }

 for(i=0; i<lim; i+=24) { // We advance 8 pixels at a time
            vectorR = _mm256_set_ps(matrix[i+21], matrix[i+18], matrix[i+15] ,matrix[i+12],matrix[i+9], matrix[i+6], matrix[i+3], matrix[i]);
            vectorG = _mm256_set_ps(matrix[i+22], matrix[i+19], matrix[i+16], matrix[i+13], matrix[i+10], matrix[i+7], matrix[i+4], matrix[i+1]);
            vectorB = _mm256_set_ps(matrix[i+23], matrix[i+20], matrix[i+17], matrix[i+14], matrix[i+11], matrix[i+8], matrix[i+5], matrix[i+2]);

            calcular_coeficientes_avx(Ycoef0, Ycoef1, Ycoef2, Ucoef0, Vcoef0, vectorR, vectorG, vectorB,
                                    values0, values255, values128, &vectorY, &vectorU, &vectorV);

            _mm256_store_ps(&y_aux[0], vectorY);
            _mm256_store_ps(&u_aux[0], vectorU);
            _mm256_store_ps(&v_aux[0], vectorV);

            // Put the data back into the matrix
            //PIXEL 1
            matrix[i] = y_aux[0];
            matrix[i+1]=u_aux[0];
            matrix[i+2]=v_aux[0];
            //PIXEL 2
            matrix[i+3]=y_aux[1];
            matrix[i+4]=u_aux[1];
            matrix[i+5]=v_aux[1];
            //PIXEL 3
            matrix[i+6] = y_aux[2];
            matrix[i+7]=u_aux[2];
            matrix[i+8]=v_aux[2];
            //PIXEL 4
            matrix[i+9]=y_aux[3];
            matrix[i+10]=u_aux[3];
            matrix[i+11]=v_aux[3];
            //PIXEL 5
            matrix[i+12]=y_aux[4];
            matrix[i+13]=u_aux[4];
            matrix[i+14]=v_aux[4];
            //PIXEL 6
            matrix[i+15]=y_aux[5];
            matrix[i+16]=u_aux[5];
            matrix[i+17]=v_aux[5];
            //PIXEL 7
            matrix[i+18]=y_aux[6];
            matrix[i+19]=u_aux[6];
            matrix[i+20]=v_aux[6];
            //PIXEL 8
            matrix[i+21]=y_aux[7];
            matrix[i+22]=u_aux[7];
            matrix[i+23]=v_aux[7];
    }

void calcular_coeficientes_sse3(__m128 Ycoef0, __m128 Ycoef1, __m128 Ycoef2, __m128 Ucoef0, __m128 Vcoef0, __m128 vectorR,
                            __m128 vectorG, __m128 vectorB, __m128 values0, __m128 values255, __m128 values128,
                                     __m128 *vectorY, __m128 *vectorU, __m128 *vectorV) {

    // COMPUTE Y3, Y2, Y1, Y0 (four consecutive pixels)
                                                    // FIRST PASS
    __m128 valores1 = _mm_mul_ps(Ycoef0, vectorR);  // valores1 = (0.299*R[3], 0.299*R[2], 0.299*R[1], 0.299*R[0])
    __m128 valores2 = _mm_mul_ps(Ycoef1, vectorG);  // valores2 = (0.587*G[3], 0.587*G[2], 0.587*G[1], 0.587*G[0])
    __m128 valores3 = _mm_mul_ps(Ycoef2, vectorB);  // valores3 = (0.114*B[3], 0.114*B[2], 0.114*B[1], 0.114*B[0]);
    valores1 = _mm_add_ps(valores1, valores2);      // valores1 = (0.299*R[3] + 0.587*G[3], 0.299*R[2] + 0.587*G[2], 0.299*R[1] + 0.587*G[1], ...)
    *vectorY = _mm_add_ps(valores1, valores3);      // vectorY = (Y[3], Y[2], Y[1], Y[0])
    *vectorY = _mm_floor_ps(*vectorY);
    // Compute U3, U2, U1, U0
    //B-Y
    valores1 = _mm_sub_ps(vectorB, *vectorY);      // valores1 = (B[3]-Y[3], B[2]-Y[2], B[1]-Y[1], B[0]-Y[0])
    valores1 = _mm_mul_ps(Ucoef0, valores1);       // valores1 = (0.492*(B[3]-Y[3]), 0.492*(B[2]-Y[2]), 0.492*(B[1]-Y[1]), 0.492*(...)) 
    *vectorU = _mm_add_ps(valores1, values128);    // vectorU  = (U[3], U[2], U[1], U[0])
    // COMPUTE V3, V2, V1, V0
    // R-Y
    valores1 = _mm_sub_ps(vectorR, *vectorY);      // valores1 = (R[3]-Y[3], R[2]-Y[2], R[1]-Y[1], R[0]-Y[0])
    valores1 = _mm_mul_ps(Vcoef0, valores1);       // valores1 = (0.877*(R[3]-Y[3]), 0.877*(R[2]-Y[2]), 0.877*(R[1]-Y[1]), 0.877*(...))
    valores1 = _mm_add_ps(valores1, values128);    // valores1 = (0.877*(R[3]-Y[3]) + 128, 0.877*(R[2]-Y[2]) + 128, ..., ...)
    // valores1 may need saturation.

    // SATURATE AT 0
    // To avoid explicit comparisons we take the larger of 0 and the value V[i]:
                                                            // If V[i] > 0 it keeps V[i], since that is greater than 0.
                                                            // If V[i] < 0 it keeps 0, since 0 is greater than any negative number.
    valores1 = _mm_max_ps(valores1, values0);      // valores1 = (max(0.877*(R[3]-Y[3]) + 128,0), ..., ..., ...)

    // SATURATE AT 255
    // To avoid a comparison we take the smaller of 255 and the value V[i]:
                                                            // If V[i] < 255 it keeps the smaller value, V[i].
                                                            // If V[i] > 255 it keeps the smaller value, 255.
    *vectorV = _mm_min_ps(valores1, values255);  //vectorV = (V[3], V[2], V[1], V[0])

    // NOTE: since max and min are implemented in hardware, each takes 1 cycle,
    // so we check the saturation of 4 V values in just two cycles.

    return; // The procedure finishes by returning vectorY, vectorU and vectorV

}

void calcular_coeficientes_avx(__m256 Ycoef0, __m256 Ycoef1, __m256 Ycoef2, __m256 Ucoef0, __m256 Vcoef0, __m256 vectorR,
                            __m256 vectorG, __m256 vectorB, __m256 values0, __m256 values255, __m256 values128,
                                     __m256 *vectorY, __m256 *vectorU, __m256 *vectorV) {

    // COMPUTE Y7, Y6, Y5, Y4, Y3, Y2, Y1, Y0 (eight consecutive pixels)

    __m256 valores1 = _mm256_mul_ps(Ycoef0, vectorR);
    __m256 valores2 = _mm256_mul_ps(Ycoef1, vectorG);
    __m256 valores3 = _mm256_mul_ps(Ycoef2, vectorB);
    valores1 = _mm256_add_ps(valores1, valores2);
    *vectorY = _mm256_add_ps(valores1, valores3);
    *vectorY = _mm256_floor_ps(*vectorY);
    // Compute U7, U6, U5, U4, U3, U2, U1, U0
    valores1 = _mm256_sub_ps(vectorB, *vectorY);
    valores1 = _mm256_mul_ps(Ucoef0, valores1);
    *vectorU = _mm256_add_ps(valores1, values128);
    // COMPUTE V7, V6, V5, V4, V3, V2, V1, V0
    // R-Y
    valores1 = _mm256_sub_ps(vectorR, *vectorY);
    valores1 = _mm256_mul_ps(Vcoef0, valores1);
    valores1 = _mm256_add_ps(valores1, values128);
    // valores1 may need saturation.

    // SATURATE AT 0
    // To avoid explicit comparisons we take the larger of 0 and the value V[i]:


    valores1 = _mm256_max_ps(valores1, values0);

    // SATURATE AT 255
    // To avoid a comparison we take the smaller of 255 and the value V[i]:
                                                            // If V[i] < 255 it keeps the smaller value, V[i].
                                                            // If V[i] > 255 it keeps the smaller value, 255.
    *vectorV = _mm256_min_ps(valores1, values255);  // vectorV = (V[7], V[6], ..., V[0])

    // NOTE: since max and min are implemented in hardware, each takes 1 cycle,
    // so we check the saturation of 8 V values in just two cycles.

    return; // The procedure finishes by returning vectorY, vectorU and vectorV

}

As you can see, the SSE and AVX versions are the same algorithm, the latter just extended to the wider vectors. Why is the execution time the same?

NOTE: I have tried it on two different computers with AVX support (obviously), and I see the same problem.

Thank you very much.

  • Besides bottlenecking on the stores, the work is done in `calcular_coeficientes_avx` and `calcular_coeficientes_sse` yet those functions are missing. Also, the AVX512 tag may be misleading. – Margaret Bloom Nov 17 '16 at 15:37
  • `_mm256_set_ps(matrix[i+21], matrix[i+18], ...` please don't do this. Not in SSE either. Every time you write something like that, a puppy dies. Do a normal load, then shuffle (see the sketch after these comments). – harold Nov 17 '16 at 16:02
  • *I have tried it on two different computers*: Different how? Were they both Intel SnB-family, or both AMD? Even *if* your code was well-written, AMD CPUs often don't get much benefit from using 256b AVX. See http://agner.org/optimize/, and other links in the [x86 tag wiki](http://stackoverflow.com/tags/x86/info). – Peter Cordes Nov 18 '16 at 08:12
  • @harold I've always wondered whether it would've been better if Intel had designed those multi-element set intrinsics to require compile-time constants. I've never had a legitimate use for the 4+ setters. And even for 2, there's only been one case involving a 2 x 128 -> 256 on a merged load+transpose. – Mysticial Nov 18 '16 at 18:51
  • @PeterCordes Both tests were on an Intel CPU, more exactly the Core i family, so that is not the problem. Thanks for your reply. – Paco Muñoz Martinez Nov 18 '16 at 22:02
  • @MargaretBloom You are right. I have included the functions in the post. Please review it. – Paco Muñoz Martinez Nov 18 '16 at 22:03
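
A minimal sketch of the load-then-shuffle approach harold's comment suggests, for the question's float SSE loop (hypothetical code, not from the original thread): three vector loads plus five shuffles replace the twelve scalar loads hidden in each trio of `_mm_set_ps` calls.

    __m128 m0 = _mm_loadu_ps(&matrix[i]);      // R0 G0 B0 R1
    __m128 m1 = _mm_loadu_ps(&matrix[i + 4]);  // G1 B1 R2 G2
    __m128 m2 = _mm_loadu_ps(&matrix[i + 8]);  // B2 R3 G3 B3

    // classic 3-channel float deinterleave
    __m128 r2g2r3g3 = _mm_shuffle_ps(m1, m2, _MM_SHUFFLE(2, 1, 3, 2));             // R2 G2 R3 G3
    __m128 g0b0g1b1 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 0, 2, 1));             // G0 B0 G1 B1
    __m128 vectorR  = _mm_shuffle_ps(m0, r2g2r3g3, _MM_SHUFFLE(2, 0, 3, 0));       // R0 R1 R2 R3
    __m128 vectorG  = _mm_shuffle_ps(g0b0g1b1, r2g2r3g3, _MM_SHUFFLE(3, 1, 2, 0)); // G0 G1 G2 G3
    __m128 vectorB  = _mm_shuffle_ps(g0b0g1b1, m2, _MM_SHUFFLE(3, 0, 3, 1));       // B0 B1 B2 B3

(`_mm_load_ps` would do if `matrix` is 16-byte aligned.)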

1 Answer


Using floating-point arithmetic for RGB to YUV is massive overkill, so I'll use fixed-point arithmetic here. The output format is very annoying; there is probably a smarter way to shuffle everything into place than what I used, but no matter what you do it will be annoying again when you want to do anything with the YUV data. It would make more sense to use a planar format such as [YYYYYYYY UUUUUUUU VVVVVVVV].

While there are many RGB to YUV converters already available, I gave it another try anyway. It can't hurt to have more alternatives available, and most of them don't quite match this one - for example they do chroma sub-sampling and usually use a different output format.

I haven't tested it (other than checking that it compiles), so any comments about correctness or performance would be appreciated.

void toYUV(uint8_t* pixels, int size)
{
  // r g b r  g b r g  b r g b  r g b r | g b r g  b r g b

  short rw = 9798; // (short)round(0.299 * 2^15)
  short gw = 19234;// 0.587 * 2^15 rounded down, so the three weights sum to exactly 2^15
  short bw = 3736; // (short)round(0.114 * 2^15)
  char _ = -1;

  __m128i z = _mm_setzero_si128();

  for (int i = 0; i < size; i += 24)
  {
    __m128i p0 = _mm_loadu_si128((__m128i*)(pixels + i));
    __m128i p1 = _mm_loadl_epi64((__m128i*)(pixels + i + 16));

    // widen
    // w0 = r0 g0 b0 r1  g1 b1 r2 g2
    __m128i w0 = _mm_unpacklo_epi8(p0, z);
    // w1 = b2 r3 g3 b3  r4 g4 b4 r5
    __m128i w1 = _mm_unpackhi_epi8(p0, z);
    // w2 = g5 b5 r6 g6  b6 r7 g7 b7
    __m128i w2 = _mm_unpacklo_epi8(p1, z);

    __m128i zRGBRGBz = _mm_setr_epi16(0, rw, gw, bw, rw, gw, bw, 0);
    // calculate Y
    // 0+r0 g0+b0  r1+g1 b1+0
    __m128i s0 = _mm_madd_epi16(_mm_bslli_si128(w0, 2), zRGBRGBz);
    // 0+r2 g2+b2  r3+g3 b3+0
    __m128i s1 = _mm_madd_epi16(_mm_alignr_epi8(w1, w0, 10), zRGBRGBz);
    // 0+r4 g4+b4  r5+g5 b5+0
    __m128i s2 = _mm_madd_epi16(_mm_alignr_epi8(w2, w1, 6), zRGBRGBz);
    // 0+r6 g6+b6  r7+g7 b7+0
    __m128i s3 = _mm_madd_epi16(_mm_bsrli_si128(w2, 2), zRGBRGBz);
    // the math works out to make the Y's in [0 .. 254] after scaling back, so
    // y0 = y0 0 y1 0 y2 0 y3 0
    // y1 = y4 0 y5 0 y6 0 y7 0
    __m128i y0 = _mm_srli_epi32(_mm_hadd_epi32(s0, s1), 15);
    __m128i y1 = _mm_srli_epi32(_mm_hadd_epi32(s2, s3), 15);
    __m128i y = _mm_packus_epi32(y0, y1);
    y = _mm_packus_epi16(y, y);

    // calculate U
    // todo: do smarter shuffles?
    // b0 = 0 b0 0 b1 0 b2 0 b3
    __m128i b0 = _mm_or_si128(
      _mm_shuffle_epi8(w0, _mm_setr_epi8(_, _, 4, 5, _, _, 10, 11, _, _, _, _, _, _, _, _)),
      _mm_shuffle_epi8(w1, _mm_setr_epi8(_, _, _, _, _, _,  _,  _, _, _, 0, 1, _, _, 6, 7)));
    // b1 = 0 b4 0 b5 0 b6 0 b7
    __m128i b1 = _mm_or_si128(
      _mm_shuffle_epi8(w1, _mm_setr_epi8(_, _, 12, 13, _, _, _, _, _, _, _, _, _, _,  _, _)), // b4 is word 6 of w1, bytes 12-13
      _mm_shuffle_epi8(w2, _mm_setr_epi8(_, _,  _,  _, _, _, 2, 3, _, _, 8, 9, _, _, 14, 15)));
    // b - y in [-225 .. 226]
    // u = 18492 * (b - y) >> 15 -> (18492 * b + -18492 * y) >> 15
    // .. so u in [-128 .. 127]
    short us = 18492;
    __m128i u0 = _mm_madd_epi16(_mm_or_si128(b0, y0), _mm_setr_epi16(-us, us, -us, us, -us, us, -us, us));
    u0 = _mm_srai_epi32(u0, 15);
    __m128i u1 = _mm_madd_epi16(_mm_or_si128(b1, y1), _mm_setr_epi16(-us, us, -us, us, -us, us, -us, us));
    u1 = _mm_srai_epi32(u1, 15);
    // pack to sbytes
    __m128i u = _mm_packs_epi32(u0, u1);
    u = _mm_packs_epi16(u, u);

    // calculate V
    // todo: do smarter shuffles?
    // r0 = 0 r0 0 r1 0 r2 0 r3
    __m128i r0 = _mm_or_si128(
      _mm_shuffle_epi8(w0, _mm_setr_epi8(_, _, 0, 1, _, _, 6, 7, _, _, 12, 13, _, _, _, _)),
      _mm_shuffle_epi8(w1, _mm_setr_epi8(_, _, _, _, _, _, _, _, _, _,  _,  _, _, _, 2, 3)));
    // r1 = 0 r4 0 r5 0 r6 0 r7
    __m128i r1 = _mm_or_si128(
      _mm_shuffle_epi8(w1, _mm_setr_epi8(_, _, 8, 9, _, _, 14, 15, _, _, _, _, _, _,  _, _)),
      _mm_shuffle_epi8(w2, _mm_setr_epi8(_, _, _, _, _, _,  _,  _, _, _, 4, 5, _, _, 10, 11)));
    // r - y in [-178 .. 179]
    // v = 23372 * (r - y) >> 15 -> (23372 * r + -23372 * y) >> 15
    // .. so v in [-128 .. 127]
    short vs = 23372;
    __m128i v0 = _mm_madd_epi16(_mm_or_si128(r0, y0), _mm_setr_epi16(-vs, vs, -vs, vs, -vs, vs, -vs, vs));
    v0 = _mm_srai_epi32(v0, 15);
    __m128i v1 = _mm_madd_epi16(_mm_or_si128(r1, y1), _mm_setr_epi16(-vs, vs, -vs, vs, -vs, vs, -vs, vs));
    v1 = _mm_srai_epi32(v1, 15);
    // pack to sbytes
    __m128i v = _mm_packs_epi32(v0, v1);
    v = _mm_packs_epi16(v, v);

    // interleave :(
    // y0 u0 v0 y1  u1 v1 y2 u2  v2 y3 u3 v3  y4 u4 v4 y5
    // u5 v5 y6 u6  v6 y7 u7 v7
    __m128i shuf0 = _mm_setr_epi8(0, _, _, 1, _, _, 2, _, _, 3, _, _, 4, _, _, 5);
    p0 = _mm_or_si128(_mm_or_si128(
      _mm_shuffle_epi8(y, shuf0),
      _mm_shuffle_epi8(u, _mm_bslli_si128(shuf0, 1))),
      _mm_shuffle_epi8(v, _mm_bslli_si128(shuf0, 2)));
    __m128i shuf1 = _mm_setr_epi8(_, 5, _, _, 6, _, _, 7, _, _, _, _, _, _, _, _);
    p1 = _mm_or_si128(_mm_or_si128(
      _mm_shuffle_epi8(y, _mm_srli_si128(shuf1, 2)),
      _mm_shuffle_epi8(u, _mm_srli_si128(shuf1, 1))),
      _mm_shuffle_epi8(v, shuf1));

    _mm_storeu_si128((__m128i*)(pixels + i), p0);
    _mm_storel_epi64((__m128i*)(pixels + i + 16), p1);
  }
}

This isn't great; it's far too p5-heavy with all those shuffles and (un)packs (on Intel, they all compete for the shuffle port, port 5). On Ivy Bridge that's not such a problem. Using a less interleaved output format would help a lot there, saving 6 shuffles. Of course, since this is fixed point, you can't really convert it to AVX, but you could try AVX2.
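
As a hypothetical sketch of what an AVX2 attempt's core arithmetic could look like on 256-bit vectors (borrowing the pmulhrsw idiom from the "more vertical" version below, and assuming `r`, `g` and `b` already hold sixteen 16-bit channel values each; the byte (de)interleaving is the hard part and is not shown, since `_mm256_shuffle_epi8` only shuffles within 128-bit lanes, so none of the masks here carry over directly):

    __m256i scaledr = _mm256_mulhrs_epi16(r, _mm256_set1_epi16(9798));   // 0.299 * 2^15
    __m256i scaledg = _mm256_mulhrs_epi16(g, _mm256_set1_epi16(19235));  // 0.587 * 2^15
    __m256i scaledb = _mm256_mulhrs_epi16(b, _mm256_set1_epi16(3736));   // 0.114 * 2^15
    __m256i y = _mm256_add_epi16(_mm256_add_epi16(scaledr, scaledg), scaledb);
    __m256i u = _mm256_mulhrs_epi16(_mm256_sub_epi16(b, y), _mm256_set1_epi16(18492));
    __m256i v = _mm256_mulhrs_epi16(_mm256_sub_epi16(r, y), _mm256_set1_epi16(23372));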

Here's a "more vertical" version, based on pmulhrsw instead of pmaddwd, also not tested (it's probably broken). The asm looks better, but I haven't measured its performance either.

void toYUV(uint8_t* pixels, int size)
{
  // r g b r  g b r g  b r g b  r g b r | g b r g  b r g b
  char _ = -1;
  __m128i rshuf0 = _mm_setr_epi8(0, _, 3, _, 6, _,  9, _, 12, _, 15, _, _, _, _, _);
  __m128i rshuf1 = _mm_setr_epi8(_, _, _, _, _, _,  _, _,  _, _,  _, _, 2, _, 5, _);
  __m128i gshuf0 = _mm_setr_epi8(1, _, 4, _, 7, _, 10, _, 13, _,  _, _, _, _, _, _);
  __m128i gshuf1 = _mm_setr_epi8(_, _, _, _, _, _,  _, _,  _, _,  0, _, 3, _, 6, _);
  __m128i bshuf0 = _mm_setr_epi8(2, _, 5, _, 8, _, 11, _, 14, _,  _, _, _, _, _, _);
  __m128i bshuf1 = _mm_setr_epi8(_, _, _, _, _, _,  _, _,  _, _,  1, _, 4, _, 7, _); // b5, b6, b7 sit at bytes 1, 4, 7 of p1

  __m128i ryweight = _mm_set1_epi16(9798); // (short)round(0.299 * 2^15)
  __m128i gyweight = _mm_set1_epi16(19235);// (short)round(0.587 * 2^15)
  __m128i byweight = _mm_set1_epi16(3736); // (short)round(0.114 * 2^15)
  __m128i uscale  = _mm_set1_epi16(18492); // (short)floor(2^15 / (2^15 - byweight) * 0.5 * 2^15)
  __m128i uofs = _mm_set1_epi16(128);      // +128 bias; note: not applied below, u stays signed as in the first version
  __m128i vscale  = _mm_set1_epi16(23372); // (short)floor(2^15 / (2^15 - ryweight) * 0.5 * 2^15)
  __m128i vofs = _mm_set1_epi16(128);      // +128 bias; note: not applied below, v stays signed as in the first version

  __m128i yshuf0 = _mm_setr_epi8(0, _, _, 2, _, _, 4, _, _, 6, _, _, 8, _, _, 10);  // y0..y5 into p0
  __m128i yshuf1 = _mm_setr_epi8(_, _, 12, _, _, 14, _, _, _, _, _, _, _, _, _, _); // y6, y7 into bytes 2 and 5 of p1
  __m128i ushuf0 = _mm_setr_epi8(_, 0, _, _, 1, _, _, 2, _, _, 3, _, _, 4, _, _);
  __m128i ushuf1 = _mm_setr_epi8(5, _, _, 6, _, _, 7, _, _, _, _, _, _, _, _, _);

  for (int i = 0; i < size; i += 24)
  {
    __m128i p0 = _mm_loadu_si128((__m128i*)(pixels + i));
    __m128i p1 = _mm_loadl_epi64((__m128i*)(pixels + i + 16));

    __m128i r = _mm_or_si128(_mm_shuffle_epi8(p0, rshuf0), _mm_shuffle_epi8(p1, rshuf1));
    __m128i g = _mm_or_si128(_mm_shuffle_epi8(p0, gshuf0), _mm_shuffle_epi8(p1, gshuf1));
    __m128i b = _mm_or_si128(_mm_shuffle_epi8(p0, bshuf0), _mm_shuffle_epi8(p1, bshuf1));

    __m128i scaledr = _mm_mulhrs_epi16(r, ryweight);
    __m128i scaledg = _mm_mulhrs_epi16(g, gyweight);
    __m128i scaledb = _mm_mulhrs_epi16(b, byweight);
    __m128i y = _mm_add_epi16(_mm_add_epi16(scaledr, scaledg), scaledb);
    __m128i u = _mm_mulhrs_epi16(_mm_sub_epi16(b, y), uscale);
    __m128i v = _mm_mulhrs_epi16(_mm_sub_epi16(r, y), vscale);
    // pack back to bytes and shuffle
    // words in y are guaranteed to be in [0 - 255] so just shuffle them into place
    // swords in u and v may be slightly wonky so pack with saturation first
    u = _mm_packs_epi16(u, u);
    v = _mm_packs_epi16(v, v);
    p0 = _mm_or_si128(_mm_or_si128(
      _mm_shuffle_epi8(y, yshuf0),
      _mm_shuffle_epi8(u, ushuf0)),
      _mm_shuffle_epi8(v, _mm_bslli_si128(ushuf0, 1)));
    p1 = _mm_or_si128(_mm_or_si128(
      _mm_shuffle_epi8(y, yshuf1),
      _mm_shuffle_epi8(u, ushuf1)),
      _mm_shuffle_epi8(v, _mm_bslli_si128(ushuf1, 1)));

    _mm_storeu_si128((__m128i*)(pixels + i), p0);
    _mm_storel_epi64((__m128i*)(pixels + i + 16), p1);
  }
}
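
For anyone who wants to sanity-check either version, a throwaway harness along these lines should do (hypothetical code, not from the answer; append it to a file containing one of the two toYUV definitions). It compares against the scalar floating-point formulas, with a small tolerance for the fixed-point rounding, and reads u and v back as signed bytes:

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
      enum { N = 24 * 1000 };  // must be a multiple of 24 (8 pixels per iteration)
      static uint8_t px[N], ref[N];
      for (int i = 0; i < N; i++) ref[i] = px[i] = (uint8_t)rand();

      toYUV(px, N);

      for (int i = 0; i < N; i += 3)
      {
        double r = ref[i], g = ref[i + 1], b = ref[i + 2];
        double y = 0.299 * r + 0.587 * g + 0.114 * b;
        double u = (18492.0 / 32768) * (b - y);  // same scale as uscale above
        double v = (23372.0 / 32768) * (r - y);  // same scale as vscale above
        if (fabs(px[i] - y) > 3 ||
            fabs((int8_t)px[i + 1] - u) > 3 ||
            fabs((int8_t)px[i + 2] - v) > 3)
          printf("mismatch at pixel %d\n", i / 3);
      }
      return 0;
    }
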
harold