
In my program I have a big array of 32-bit integers. I have to do the following operation on it:

sum = array[i] + array[i+1] + array[i+2]
array[i] = sum
array[i+1] = sum
array[i+2] = sum
i+=3
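
In C, the whole loop looks roughly like this (a sketch with made-up names; n, the number of elements, is assumed to be a multiple of 3):

#include <stddef.h>
#include <stdint.h>

/* For every group of three elements, replace each element by the sum of the group. */
void sum_groups_of_3(int32_t *array, size_t n)
{
    for (size_t i = 0; i < n; i += 3) {
        int32_t sum = array[i] + array[i + 1] + array[i + 2];
        array[i]     = sum;
        array[i + 1] = sum;
        array[i + 2] = sum;
    }
}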

Or, as I've written it in assembly:

loop: ;R12 - address of the array, R11 - end offset in bytes (loop bound), R10 - byte offset (iterator)

mov eax, [R12 + R10]
add eax, [R12 + R10 + 4]
add eax, [R12 + R10 + 8]

mov [R12 + R10], eax
mov [R12 + R10 + 4], eax
mov [R12 + R10 + 8], eax

mov rax, 0
mov rdx, 0

add R10, 12
cmp R10, R11
jb loop

Is it possible to do that using vector instructions? If so, how?

Matjag
  • Quite easy to do with either SSE or AVX2. AVX has only floating point arithmetic instructions though. I suggest you use intrinsics rather than assembly (quicker, easier, less error-prone, more portable). – Paul R Jan 21 '19 at 10:31
  • Note that [a decent compiler will vectorize this for you](https://godbolt.org/z/aTAtGj), using either [SSE](https://godbolt.org/z/aTAtGj) or [AVX2](https://godbolt.org/z/CV3gDb). – Paul R Jan 21 '19 at 11:15
  • @PaulR: For integer, AVX1 adds the efficiency of 3-operand non-destructive operations, e.g. `vpalignr xmm0, xmm1, xmm2, 4` instead of `movdqa xmm0, xmm1` / `palignr xmm0, xmm2, 4`, and unaligned memory source operands without having to use a separate `movdqu` into a tmp register. Both helpful if you're bottlenecked on front-end throughput. – Peter Cordes Jan 22 '19 at 02:12
  • @matjag: you're not using RDX at all, and `mov rax, 0` is totally pointless because the first load of every iteration (`mov eax, [r12 + r10]`) breaks any dependency on the old value of RAX. – Peter Cordes Jan 22 '19 at 02:39

2 Answers


The compiler can do the vectorization for you, but vectorizing manually with intrinsics may lead to more efficient code. Function sum3neighb below sums each group of 3 neighbouring elements of a 12-element integer array and writes the sum back to all three positions. Instead of using many shuffles, it uses overlapping loads to get the data at the right positions.

/*  gcc -O3 -Wall -march=sandybridge -m64 neighb3.c                */
#include <stdio.h>
#include <immintrin.h>

inline __m128i _mm_shufps_epi32(__m128i a, __m128i b,int imm){
    return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a),_mm_castsi128_ps(b),imm));
}

/* For an integer array of 12 elements, sum every 3 neighbouring elements */
void sum3neighb(int * a){
    __m128i a_3210 = _mm_loadu_si128((__m128i*)&a[0]);
    __m128i a_9876 = _mm_loadu_si128((__m128i*)&a[6]);
    __m128i a_9630 = _mm_shufps_epi32(a_3210, a_9876, 0b11001100);

    __m128i a_4321 = _mm_loadu_si128((__m128i*)&a[1]);
    __m128i a_A987 = _mm_loadu_si128((__m128i*)&a[7]);
    __m128i a_A741 = _mm_shufps_epi32(a_4321, a_A987, 0b11001100);

    __m128i a_5432 = _mm_loadu_si128((__m128i*)&a[2]);
    __m128i a_BA98 = _mm_loadu_si128((__m128i*)&a[8]);
    __m128i a_B852 = _mm_shufps_epi32(a_5432, a_BA98, 0b11001100);

    __m128i sum = _mm_add_epi32(a_9630, a_A741);
            sum = _mm_add_epi32(sum, a_B852);    /* B+A+9, 8+7+6, 5+4+3, 2+1+0 */

    __m128i sum_3210 = _mm_shuffle_epi32(sum, 0b01000000);
    __m128i sum_7654 = _mm_shuffle_epi32(sum, 0b10100101);
    __m128i sum_BA98 = _mm_shuffle_epi32(sum, 0b11111110);

            _mm_storeu_si128((__m128i*)&a[0], sum_3210);
            _mm_storeu_si128((__m128i*)&a[4], sum_7654);
            _mm_storeu_si128((__m128i*)&a[8], sum_BA98);
}


int main(){
    int i;
    int a[24];  
    for (i = 0; i < 24; i++) a[i] = i + 4;  /* example input */
    for (i = 0; i < 24; i++){ printf("%3i  ",a[i]);}
    printf("\n");
    for (i = 0; i < 24; i = i + 12){
       sum3neighb(&a[i]);
    }
    for (i = 0; i < 24; i++){ printf("%3i  ",a[i]);}
    printf("\n");
    return 0;
}

This compiles to the following assembly (with gcc 8.2):

sum3neighb:
  vmovups xmm4, XMMWORD PTR [rdi+4]
  vshufps xmm2, xmm4, XMMWORD PTR [rdi+28], 204
  vmovups xmm3, XMMWORD PTR [rdi]
  vshufps xmm0, xmm3, XMMWORD PTR [rdi+24], 204
  vpaddd xmm0, xmm0, xmm2
  vmovups xmm5, XMMWORD PTR [rdi+8]
  vshufps xmm1, xmm5, XMMWORD PTR [rdi+32], 204
  vpaddd xmm0, xmm0, xmm1
  vpshufd xmm2, xmm0, 64
  vpshufd xmm1, xmm0, 165
  vmovups XMMWORD PTR [rdi], xmm2
  vpshufd xmm0, xmm0, 254
  vmovups XMMWORD PTR [rdi+16], xmm1
  vmovups XMMWORD PTR [rdi+32], xmm0
  ret

The output of the example program is shown below. The first row is the input, the second row the output; both rows are truncated. Each output value is the sum of a group of three inputs, e.g. 15 = 4 + 5 + 6 and 24 = 7 + 8 + 9.

 4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   ...   
15   15   15   24   24   24   33   33   33   42   42   42   51   51   51   60   ...

clang doesn't accept the _mm_shufps_epi32 function, see Peter's comment. There are two alternatives: a template function (see chtz's comment and the Godbolt link there):

template<int imm>
inline  __m128i _mm_shufps_epi32(__m128i a, __m128i b){
    return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a),_mm_castsi128_ps(b),imm));
}

Or a macro:

#define _mm_shufps_epi32(a,b,i) _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a),_mm_castsi128_ps(b),i)) 

On newer Intel architectures (Haswell and later), integer vector addition instructions have a higher throughput than shuffle instructions, see Agner Fog's instruction tables. In that case the following code might be slightly more efficient: it takes two more additions, but two fewer shuffles:

void sum3neighb_v3(int * a){
    __m128i a_3210 = _mm_loadu_si128((__m128i*)&a[0]);
    __m128i a_4321 = _mm_loadu_si128((__m128i*)&a[1]);
    __m128i a_5432 = _mm_loadu_si128((__m128i*)&a[2]);
    __m128i sum53_20 = _mm_add_epi32(a_3210, a_5432);
    __m128i sum543_210 = _mm_add_epi32(sum53_20, a_4321);

    __m128i a_9876 = _mm_loadu_si128((__m128i*)&a[6]);
    __m128i a_A987 = _mm_loadu_si128((__m128i*)&a[7]);
    __m128i a_BA98 = _mm_loadu_si128((__m128i*)&a[8]);
    __m128i sumB9_86 = _mm_add_epi32(a_9876, a_BA98);
    __m128i sumBA9_876 = _mm_add_epi32(sumB9_86, a_A987);
    __m128i sum = _mm_shufps_epi32(sum543_210, sumBA9_876, 0b11001100);

    __m128i sum_3210 = _mm_shuffle_epi32(sum, 0b01000000);
    __m128i sum_7654 = _mm_shuffle_epi32(sum, 0b10100101);
    __m128i sum_BA98 = _mm_shuffle_epi32(sum, 0b11111110);

            _mm_storeu_si128((__m128i*)&a[0], sum_3210);
            _mm_storeu_si128((__m128i*)&a[4], sum_7654);
            _mm_storeu_si128((__m128i*)&a[8], sum_BA98);
}

AVX2 version

The AVX2 version, see the code below, uses lane-crossing shuffles and is therefore less suitable for AMD processors; see also chtz's answer.

void sum3neighb_avx2(int * a){
    __m256i a_0  = _mm256_loadu_si256((__m256i*)&a[0]);
    __m256i a_1  = _mm256_loadu_si256((__m256i*)&a[1]);
    __m256i a_2  = _mm256_loadu_si256((__m256i*)&a[2]);

    __m256i a_8  = _mm256_loadu_si256((__m256i*)&a[8]);
    __m256i a_9  = _mm256_loadu_si256((__m256i*)&a[9]);
    __m256i a_10 = _mm256_loadu_si256((__m256i*)&a[10]);

    __m256i a_16 = _mm256_loadu_si256((__m256i*)&a[16]);
    __m256i a_17 = _mm256_loadu_si256((__m256i*)&a[17]);
    __m256i a_18 = _mm256_loadu_si256((__m256i*)&a[18]);

    __m256i sum_0  = _mm256_add_epi32(_mm256_add_epi32(a_0,  a_1),  a_2);
    __m256i sum_8  = _mm256_add_epi32(_mm256_add_epi32(a_8,  a_9),  a_10);
    __m256i sum_16 = _mm256_add_epi32(_mm256_add_epi32(a_16, a_17), a_18);

    __m256i sum_8_0 = _mm256_blend_epi32(sum_0,  sum_8, 0b10010010);
    __m256i sum     = _mm256_blend_epi32(sum_8_0, sum_16, 0b00100100);

    __m256i sum_7_0   = _mm256_permutevar8x32_epi32(sum, _mm256_set_epi32(6,6,3,3,3,0,0,0));
    __m256i sum_15_8  = _mm256_permutevar8x32_epi32(sum, _mm256_set_epi32(7,4,4,4,1,1,1,6));
    __m256i sum_23_16 = _mm256_permutevar8x32_epi32(sum, _mm256_set_epi32(5,5,5,2,2,2,7,7));

            _mm256_storeu_si256((__m256i*)&a[0],  sum_7_0  );
            _mm256_storeu_si256((__m256i*)&a[8],  sum_15_8 );
            _mm256_storeu_si256((__m256i*)&a[16], sum_23_16);
}
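
For a large array, the AVX2 function can be applied in blocks of 24 elements. A minimal driver sketch (not from the original answer; the name sum3neighb_big is made up, and n is assumed to be a multiple of 24):

/* Hypothetical driver: apply sum3neighb_avx2 to a large array.
   Assumes n is a multiple of 24; any remainder would need a scalar loop. */
void sum3neighb_big(int * a, long n){
    long i;
    for (i = 0; i < n; i = i + 24){
       sum3neighb_avx2(&a[i]);
    }
}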
wim
  • unfortunately clang doesn't support writing wrapper functions for `shufps` because it checks for compile-time-constness before inlining, even with optimization enabled. You can make it a `#define` macro, though. Anyway, clang's strategy for auto-vectorization is manual gathers with `vpinsrd` to create the vectors it will need for the output :/ Interestingly different from gcc. https://godbolt.org/z/Gox1P3 It compiles your manually-vectorized version more or less as you wrote it, so its shuffle optimizer didn't find anything. – Peter Cordes Jan 22 '19 at 02:30
  • I wonder if overlapping stores would be a good idea, to save some of the shuffling between vectors. (i.e. store a vector of `{out0,out1,out2, x}`, then overlap it and rewrite the `x`.) 2x load + `palignr` could feed 2x paddd + movups. Hmm, maybe not better than what you're doing. If targeting a non-redundant output format (each result once instead of repeated 3 times), `phaddd` might be useful, but even then I'm not sure for a multiple of 3 grouping. – Peter Cordes Jan 22 '19 at 02:36
  • @PeterCordes Instead of a macro, I'd generally prefer a template function https://godbolt.org/z/pmqlEr. This does lead to a nonuniform syntax, though (unless you wrap all other shuffles as well). – chtz Jan 22 '19 at 10:04
  • @PeterCordes: Remarkably both clang and gcc use 13 shuffle operations (vpalignr, vpshufd etc.). It is possible to do some additions immediately after the overlapping loads. That reduces the number of shuffles, but increases the number of adds, which is a good trade on Haswell and newer. In this particular case (overlapping) broadcasting stores would be nice. Obviously they don't exist. – wim Jan 22 '19 at 12:20

In case someone is looking for an AVX2 variant, here is a version based on https://stackoverflow.com/a/45025712 (which itself is based on an article by Intel):

#include <immintrin.h> 

template<int imm>
inline __m256i _mm256_shufps_epi32(__m256i a, __m256i b){
    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b),imm));
}

void sum3neighb24(int * a){
    __m256i a_FEDC_3210 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&a[0])),_mm_loadu_si128((__m128i*)&a[12]),1) ;
    __m256i a_JIHG_7654 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&a[4])),_mm_loadu_si128((__m128i*)&a[16]),1) ;
    __m256i a_NMLK_BA98 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&a[8])),_mm_loadu_si128((__m128i*)&a[20]),1) ;


    __m256i a_MLJI_A976 = _mm256_shufps_epi32<_MM_SHUFFLE( 2,1, 3,2)>(a_JIHG_7654,a_NMLK_BA98);
    __m256i a_HGED_5421 = _mm256_shufps_epi32<_MM_SHUFFLE( 1,0, 2,1)>(a_FEDC_3210,a_JIHG_7654);

    __m256i a_LIFC_9630 = _mm256_shufps_epi32<_MM_SHUFFLE( 2,0, 3,0)>(a_FEDC_3210,a_MLJI_A976);
    __m256i a_MJGD_A741 = _mm256_shufps_epi32<_MM_SHUFFLE( 3,1, 2,0)>(a_HGED_5421,a_MLJI_A976);
    __m256i a_NKHE_B852 = _mm256_shufps_epi32<_MM_SHUFFLE( 3,0, 3,1)>(a_HGED_5421,a_NMLK_BA98);

    __m256i sum = _mm256_add_epi32(a_LIFC_9630, a_MJGD_A741);
            sum = _mm256_add_epi32(sum, a_NKHE_B852);    /* per 128-bit lane: B+A+9, 8+7+6, 5+4+3, 2+1+0 (low lane); N+M+L, K+J+I, H+G+F, E+D+C (high lane) */

    __m256i sum_FEDC_3210 = _mm256_shuffle_epi32(sum, 0b01000000);
    __m256i sum_JIHG_7654 = _mm256_shuffle_epi32(sum, 0b10100101);
    __m256i sum_NMLK_BA98 = _mm256_shuffle_epi32(sum, 0b11111110);

    _mm_storeu_si128((__m128i*)&a[0], _mm256_castsi256_si128(sum_FEDC_3210));
    _mm_storeu_si128((__m128i*)&a[4], _mm256_castsi256_si128(sum_JIHG_7654));
    _mm_storeu_si128((__m128i*)&a[8], _mm256_castsi256_si128(sum_NMLK_BA98));
    _mm_storeu_si128((__m128i*)&a[12], _mm256_extractf128_si256 (sum_FEDC_3210,1));
    _mm_storeu_si128((__m128i*)&a[16], _mm256_extractf128_si256 (sum_JIHG_7654,1));
    _mm_storeu_si128((__m128i*)&a[20], _mm256_extractf128_si256 (sum_NMLK_BA98,1));
}

The back-shuffling is based on @wim's answer. It might actually be better to trade more loads for fewer shuffles at the beginning, as sketched below.
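
A possible sketch of that idea (not from the original answer, so the name sum3neighb24_v2 and the exact shuffle choice are my own assumptions): do overlapping 128-bit loads into both lanes, add first, and keep only one gathering shufps before the same back-shuffle and stores as in sum3neighb24 above.

void sum3neighb24_v2(int * a){   /* hypothetical variant: more loads, fewer shuffles */
    /* Overlapping loads: the low 128-bit lane covers a[0..11], the high lane covers a[12..23]. */
    __m256i v0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&a[0])),_mm_loadu_si128((__m128i*)&a[12]),1);
    __m256i v1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&a[1])),_mm_loadu_si128((__m128i*)&a[13]),1);
    __m256i v2 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&a[2])),_mm_loadu_si128((__m128i*)&a[14]),1);
    __m256i v6 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&a[6])),_mm_loadu_si128((__m128i*)&a[18]),1);
    __m256i v7 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&a[7])),_mm_loadu_si128((__m128i*)&a[19]),1);
    __m256i v8 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&a[8])),_mm_loadu_si128((__m128i*)&a[20]),1);

    /* Add first: per lane, element 0 of sum_lo is 2+1+0 and element 3 is 5+4+3;
       element 0 of sum_hi is 8+7+6 and element 3 is B+A+9 (hex indices as above). */
    __m256i sum_lo = _mm256_add_epi32(_mm256_add_epi32(v0, v1), v2);
    __m256i sum_hi = _mm256_add_epi32(_mm256_add_epi32(v6, v7), v8);

    /* One gathering shuffle: per lane {sum_lo[0], sum_lo[3], sum_hi[0], sum_hi[3]},
       i.e. the same layout as sum in sum3neighb24. */
    __m256i sum = _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(sum_lo), _mm256_castsi256_ps(sum_hi), 0b11001100));

    /* Back-shuffle and stores exactly as in sum3neighb24. */
    __m256i sum_FEDC_3210 = _mm256_shuffle_epi32(sum, 0b01000000);
    __m256i sum_JIHG_7654 = _mm256_shuffle_epi32(sum, 0b10100101);
    __m256i sum_NMLK_BA98 = _mm256_shuffle_epi32(sum, 0b11111110);

    _mm_storeu_si128((__m128i*)&a[0],  _mm256_castsi256_si128(sum_FEDC_3210));
    _mm_storeu_si128((__m128i*)&a[4],  _mm256_castsi256_si128(sum_JIHG_7654));
    _mm_storeu_si128((__m128i*)&a[8],  _mm256_castsi256_si128(sum_NMLK_BA98));
    _mm_storeu_si128((__m128i*)&a[12], _mm256_extractf128_si256(sum_FEDC_3210,1));
    _mm_storeu_si128((__m128i*)&a[16], _mm256_extractf128_si256(sum_JIHG_7654,1));
    _mm_storeu_si128((__m128i*)&a[20], _mm256_extractf128_si256(sum_NMLK_BA98,1));
}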

chtz
  • Could you provide a version that I can copy to https://godbolt.org/ and compile it to assembly? I'm not familiar with intrinsics. – Matjag Jan 29 '19 at 08:13
  • Just add a `#include <immintrin.h>` at the top (and make sure to compile with `-O2`): https://godbolt.org/z/V60PKI – chtz Jan 29 '19 at 08:45