The compiler can do the vectorization for you, but vectorization with intrinsics might lead to more efficient code. The function sum3neighb below sums every group of 3 neighbouring elements of a 12-element integer array and writes each sum back to all three positions of its group. Instead of using many shuffles, it uses overlapping loads to get the data into the right positions.
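For reference, a plain scalar version of the same operation might look like this (the name sum3neighb_scalar is just chosen for this sketch):

/* Scalar reference: for a 12-element array, replace each group of 3
   neighbouring elements by the sum of that group.                    */
void sum3neighb_scalar(int * a){
    int i, j;
    for (i = 0; i < 12; i = i + 3){
        int s = a[i] + a[i + 1] + a[i + 2];
        for (j = 0; j < 3; j++) a[i + j] = s;
    }
}

The intrinsics version: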
/* gcc -O3 -Wall -march=sandybridge -m64 neighb3.c */
#include <stdio.h>
#include <immintrin.h>

inline __m128i _mm_shufps_epi32(__m128i a, __m128i b, int imm){
    return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), imm));
}

/* For an integer array of 12 elements, sum every 3 neighbouring elements */
void sum3neighb(int * a){
    __m128i a_3210 = _mm_loadu_si128((__m128i*)&a[0]);
    __m128i a_9876 = _mm_loadu_si128((__m128i*)&a[6]);
    __m128i a_9630 = _mm_shufps_epi32(a_3210, a_9876, 0b11001100);
    __m128i a_4321 = _mm_loadu_si128((__m128i*)&a[1]);
    __m128i a_A987 = _mm_loadu_si128((__m128i*)&a[7]);
    __m128i a_A741 = _mm_shufps_epi32(a_4321, a_A987, 0b11001100);
    __m128i a_5432 = _mm_loadu_si128((__m128i*)&a[2]);
    __m128i a_BA98 = _mm_loadu_si128((__m128i*)&a[8]);
    __m128i a_B852 = _mm_shufps_epi32(a_5432, a_BA98, 0b11001100);
    __m128i sum = _mm_add_epi32(a_9630, a_A741);
    sum = _mm_add_epi32(sum, a_B852);                        /* B+A+9, 8+7+6, 5+4+3, 2+1+0 */
    __m128i sum_3210 = _mm_shuffle_epi32(sum, 0b01000000);   /* 5+4+3, 2+1+0, 2+1+0, 2+1+0 */
    __m128i sum_7654 = _mm_shuffle_epi32(sum, 0b10100101);   /* 8+7+6, 8+7+6, 5+4+3, 5+4+3 */
    __m128i sum_BA98 = _mm_shuffle_epi32(sum, 0b11111110);   /* B+A+9, B+A+9, B+A+9, 8+7+6 */
    _mm_storeu_si128((__m128i*)&a[0], sum_3210);
    _mm_storeu_si128((__m128i*)&a[4], sum_7654);
    _mm_storeu_si128((__m128i*)&a[8], sum_BA98);
}

int main(){
    int i;
    int a[24];
    for (i = 0; i < 24; i++) a[i] = i + 4;    /* example input */
    for (i = 0; i < 24; i++){ printf("%3i ", a[i]);}
    printf("\n");
    for (i = 0; i < 24; i = i + 12){
        sum3neighb(&a[i]);
    }
    for (i = 0; i < 24; i++){ printf("%3i ", a[i]);}
    printf("\n");
    return 0;
}
This compiles to the following assembly (with gcc 8.2):
sum3neighb:
        vmovups  xmm4, XMMWORD PTR [rdi+4]
        vshufps  xmm2, xmm4, XMMWORD PTR [rdi+28], 204
        vmovups  xmm3, XMMWORD PTR [rdi]
        vshufps  xmm0, xmm3, XMMWORD PTR [rdi+24], 204
        vpaddd   xmm0, xmm0, xmm2
        vmovups  xmm5, XMMWORD PTR [rdi+8]
        vshufps  xmm1, xmm5, XMMWORD PTR [rdi+32], 204
        vpaddd   xmm0, xmm0, xmm1
        vpshufd  xmm2, xmm0, 64
        vpshufd  xmm1, xmm0, 165
        vmovups  XMMWORD PTR [rdi], xmm2
        vpshufd  xmm0, xmm0, 254
        vmovups  XMMWORD PTR [rdi+16], xmm1
        vmovups  XMMWORD PTR [rdi+32], xmm0
        ret
The output of the example program is shown below (the first row is the input, the second row the output; both rows are truncated). Each group of three input elements is replaced by its sum: 4+5+6 = 15, 7+8+9 = 24, and so on.
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ...
15 15 15 24 24 24 33 33 33 42 42 42 51 51 51 60 ...
clang doesn't accept the _mm_shufps_epi32 wrapper function, because the shuffle immediate has to be a compile-time constant; see Peter's comment.
There are two alternatives: a template function (C++ only; see chtz's comment, Godbolt link):
template<int imm>
inline __m128i _mm_shufps_epi32(__m128i a, __m128i b){
    return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), imm));
}
Or a macro:
#define _mm_shufps_epi32(a,b,i) _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a),_mm_castsi128_ps(b),i))
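With the macro the call sites from sum3neighb stay unchanged; with the template version the immediate has to be passed as a template argument instead, e.g. _mm_shufps_epi32<0b11001100>(a_3210, a_9876). As a reminder of how the immediate is interpreted: _mm_shuffle_ps takes its two low elements from the first operand and its two high elements from the second, each selected by a 2-bit field of the immediate. The first shuffle of sum3neighb, annotated:

/* imm = 0b11001100, 2-bit fields from low to high: 00, 11, 00, 11:
   elements 0 and 3 of the first operand (low half of the result),
   elements 0 and 3 of the second operand (high half)               */
__m128i a_9630 = _mm_shufps_epi32(a_3210, a_9876, 0b11001100);
/* a_9630 = { a0, a3, a6, a9 }: the first element of each group of 3 */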
On newer Intel architectures (Haswell and later), integer vector additions have a higher throughput than shuffles, see Agner Fog's instruction tables. In that case the following code might be slightly more efficient: it needs two more additions, but two fewer shuffles:
void sum3neighb_v3(int * a){
    __m128i a_3210 = _mm_loadu_si128((__m128i*)&a[0]);
    __m128i a_4321 = _mm_loadu_si128((__m128i*)&a[1]);
    __m128i a_5432 = _mm_loadu_si128((__m128i*)&a[2]);
    __m128i sum53_20 = _mm_add_epi32(a_3210, a_5432);
    __m128i sum543_210 = _mm_add_epi32(sum53_20, a_4321);
    __m128i a_9876 = _mm_loadu_si128((__m128i*)&a[6]);
    __m128i a_A987 = _mm_loadu_si128((__m128i*)&a[7]);
    __m128i a_BA98 = _mm_loadu_si128((__m128i*)&a[8]);
    __m128i sumB9_86 = _mm_add_epi32(a_9876, a_BA98);
    __m128i sumBA9_876 = _mm_add_epi32(sumB9_86, a_A987);
    __m128i sum = _mm_shufps_epi32(sum543_210, sumBA9_876, 0b11001100);  /* B+A+9, 8+7+6, 5+4+3, 2+1+0 */
    __m128i sum_3210 = _mm_shuffle_epi32(sum, 0b01000000);   /* 5+4+3, 2+1+0, 2+1+0, 2+1+0 */
    __m128i sum_7654 = _mm_shuffle_epi32(sum, 0b10100101);   /* 8+7+6, 8+7+6, 5+4+3, 5+4+3 */
    __m128i sum_BA98 = _mm_shuffle_epi32(sum, 0b11111110);   /* B+A+9, B+A+9, B+A+9, 8+7+6 */
    _mm_storeu_si128((__m128i*)&a[0], sum_3210);
    _mm_storeu_si128((__m128i*)&a[4], sum_7654);
    _mm_storeu_si128((__m128i*)&a[8], sum_BA98);
}
AVX2 version
The AVX2 version, see the code below, uses lane-crossing shuffles and is therefore less suitable for AMD processors; see also chtz's answer.
void sum3neighb_avx2(int * a){
    /* Overlapping loads: a_k holds elements a[k..k+7] */
    __m256i a_0  = _mm256_loadu_si256((__m256i*)&a[0]);
    __m256i a_1  = _mm256_loadu_si256((__m256i*)&a[1]);
    __m256i a_2  = _mm256_loadu_si256((__m256i*)&a[2]);
    __m256i a_8  = _mm256_loadu_si256((__m256i*)&a[8]);
    __m256i a_9  = _mm256_loadu_si256((__m256i*)&a[9]);
    __m256i a_10 = _mm256_loadu_si256((__m256i*)&a[10]);
    __m256i a_16 = _mm256_loadu_si256((__m256i*)&a[16]);
    __m256i a_17 = _mm256_loadu_si256((__m256i*)&a[17]);
    __m256i a_18 = _mm256_loadu_si256((__m256i*)&a[18]);
    /* sum_k element j = a[k+j] + a[k+j+1] + a[k+j+2] */
    __m256i sum_0  = _mm256_add_epi32(_mm256_add_epi32(a_0, a_1), a_2);
    __m256i sum_8  = _mm256_add_epi32(_mm256_add_epi32(a_8, a_9), a_10);
    __m256i sum_16 = _mm256_add_epi32(_mm256_add_epi32(a_16, a_17), a_18);
    /* Collect the 8 group sums (groups start at 0, 3, 6, ..., 21) in one vector */
    __m256i sum_8_0 = _mm256_blend_epi32(sum_0, sum_8, 0b10010010);
    __m256i sum     = _mm256_blend_epi32(sum_8_0, sum_16, 0b00100100);
    /* Broadcast each group sum to its three output positions */
    __m256i sum_7_0   = _mm256_permutevar8x32_epi32(sum, _mm256_set_epi32(6,6,3,3,3,0,0,0));
    __m256i sum_15_8  = _mm256_permutevar8x32_epi32(sum, _mm256_set_epi32(7,4,4,4,1,1,1,6));
    __m256i sum_23_16 = _mm256_permutevar8x32_epi32(sum, _mm256_set_epi32(5,5,5,2,2,2,7,7));
    _mm256_storeu_si256((__m256i*)&a[0],  sum_7_0  );
    _mm256_storeu_si256((__m256i*)&a[8],  sum_15_8 );
    _mm256_storeu_si256((__m256i*)&a[16], sum_23_16);
}
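A possible driver for the AVX2 version, analogous to main above (just a sketch, assuming the same #includes as the first example and an AVX2-capable target such as -march=haswell). Note that the loads at &a[17] and &a[18] read up to a[25], so the sketch gives the array two extra readable elements:

int main(){
    int i;
    int a[24 + 2];                            /* 2 padding ints: the last loads read a[24] and a[25] */
    for (i = 0; i < 26; i++) a[i] = i + 4;    /* same example input as above, plus padding */
    sum3neighb_avx2(a);                       /* processes a[0..23] in one call */
    for (i = 0; i < 24; i++){ printf("%3i ", a[i]);}
    printf("\n");                             /* prints 15 15 15 24 24 24 ... 78 78 78 */
    return 0;
}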