The 8-bit integer case.
As already mentioned in the comments above, two-input shuffle instructions such as vshufps don't exist for 8-bit granularity. Hence, the 8-bit solution differs a bit from the 32-bit solution. Two different solutions are described below.
A straightforward approach is to group the 8-bit integers 'color by color (R G B)' with 6 vpblendvb-s, followed by a vpshufb permutation:
#include <stdio.h>
#include <x86intrin.h>
/* gcc -O3 -Wall -m64 -march=broadwell stride_3.c */
int __attribute__ ((noinline)) print_vec_char(__m256i x);
int main() {
    char *m;
    int i;
    __m256i blnd1 = _mm256_set_epi8(0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0, 0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0);
    __m256i blnd2 = _mm256_set_epi8(0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0, 0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0);
    __m256i p0 = _mm256_set_epi8(13,10,7,4,1, 14,11,8,5,2, 15,12,9,6,3,0, 13,10,7,4,1, 14,11,8,5,2, 15,12,9,6,3,0);
    __m256i p1 = _mm256_set_epi8(14,11,8,5,2, 15,12,9,6,3,0, 13,10,7,4,1, 14,11,8,5,2, 15,12,9,6,3,0, 13,10,7,4,1);
    __m256i p2 = _mm256_set_epi8(15,12,9,6,3,0, 13,10,7,4,1, 14,11,8,5,2, 15,12,9,6,3,0, 13,10,7,4,1, 14,11,8,5,2);
    m = _mm_malloc(96,32);
    for(i = 0; i < 96; i++) m[i] = i;
    // printf("m_lo ");print_vec_char(_mm256_load_si256((__m256i*)&m[0]));printf("m_mid ");print_vec_char(_mm256_load_si256((__m256i*)&m[32]));printf("m_hi ");print_vec_char(_mm256_load_si256((__m256i*)&m[64]));printf("\n");
    // m_lo 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 || 15 14 13 12 | 11 10 9 8 | 7 6 5 4 | 3 2 1 0
    // m_mid 63 62 61 60 | 59 58 57 56 | 55 54 53 52 | 51 50 49 48 || 47 46 45 44 | 43 42 41 40 | 39 38 37 36 | 35 34 33 32
    // m_hi 95 94 93 92 | 91 90 89 88 | 87 86 85 84 | 83 82 81 80 || 79 78 77 76 | 75 74 73 72 | 71 70 69 68 | 67 66 65 64
    __m256i t0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&m[0]));
    __m256i t1 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&m[16]));
    __m256i t2 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&m[32]));
    t0 = _mm256_inserti128_si256(t0,_mm_loadu_si128((__m128i*)&m[48]),1);
    t1 = _mm256_inserti128_si256(t1,_mm_loadu_si128((__m128i*)&m[64]),1);
    t2 = _mm256_inserti128_si256(t2,_mm_loadu_si128((__m128i*)&m[80]),1);
    // printf("t0 ");print_vec_char(t0);printf("t1 ");print_vec_char(t1);printf("t2 ");print_vec_char(t2);printf("\n");
    // t0 63 62 61 60 | 59 58 57 56 | 55 54 53 52 | 51 50 49 48 || 15 14 13 12 | 11 10 9 8 | 7 6 5 4 | 3 2 1 0
    // t1 79 78 77 76 | 75 74 73 72 | 71 70 69 68 | 67 66 65 64 || 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16
    // t2 95 94 93 92 | 91 90 89 88 | 87 86 85 84 | 83 82 81 80 || 47 46 45 44 | 43 42 41 40 | 39 38 37 36 | 35 34 33 32
    __m256i u0 = _mm256_blendv_epi8( _mm256_blendv_epi8(t0,t1,blnd2), t2,blnd1);
    __m256i u1 = _mm256_blendv_epi8( _mm256_blendv_epi8(t1,t2,blnd2), t0,blnd1);
    __m256i u2 = _mm256_blendv_epi8( _mm256_blendv_epi8(t2,t0,blnd2), t1,blnd1);
    // printf("u0 ");print_vec_char(u0);printf("u1 ");print_vec_char(u1);printf("u2 ");print_vec_char(u2);printf("\n");
    // u0 63 78 93 60 | 75 90 57 72 | 87 54 69 84 | 51 66 81 48 || 15 30 45 12 | 27 42 9 24 | 39 6 21 36 | 3 18 33 0
    // u1 79 94 61 76 | 91 58 73 88 | 55 70 85 52 | 67 82 49 64 || 31 46 13 28 | 43 10 25 40 | 7 22 37 4 | 19 34 1 16
    // u2 95 62 77 92 | 59 74 89 56 | 71 86 53 68 | 83 50 65 80 || 47 14 29 44 | 11 26 41 8 | 23 38 5 20 | 35 2 17 32
    t0 = _mm256_shuffle_epi8(u0,p0);
    t1 = _mm256_shuffle_epi8(u1,p1);
    t2 = _mm256_shuffle_epi8(u2,p2);
    printf("t0 ");print_vec_char(t0);printf("t1 ");print_vec_char(t1);printf("t2 ");print_vec_char(t2);printf("\n");
    // t0 93 90 87 84 | 81 78 75 72 | 69 66 63 60 | 57 54 51 48 || 45 42 39 36 | 33 30 27 24 | 21 18 15 12 | 9 6 3 0
    // t1 94 91 88 85 | 82 79 76 73 | 70 67 64 61 | 58 55 52 49 || 46 43 40 37 | 34 31 28 25 | 22 19 16 13 | 10 7 4 1
    // t2 95 92 89 86 | 83 80 77 74 | 71 68 65 62 | 59 56 53 50 || 47 44 41 38 | 35 32 29 26 | 23 20 17 14 | 11 8 5 2
    _mm_free(m);
    return 0;
}

int __attribute__ ((noinline)) print_vec_char(__m256i x){
    char v[32];
    _mm256_storeu_si256((__m256i *)v,x);
    printf("%3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi || ",
           v[31],v[30],v[29],v[28],v[27],v[26],v[25],v[24],v[23],v[22],v[21],v[20],v[19],v[18],v[17],v[16]);
    printf("%3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi \n",
           v[15],v[14],v[13],v[12],v[11],v[10],v[9],v[8],v[7],v[6],v[5],v[4],v[3],v[2],v[1],v[0]);
    return 0;
}
Instruction summary:
3 vmovdqu
3 vinserti128-load
6 vpblendvb
3 vpshufb
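For reference, here is a minimal sketch of how this load/blend/shuffle sequence might be wrapped as a reusable helper. The function name deinterleave_rgb32 and its out-parameter signature are illustrative assumptions of mine, not part of the original code; it assumes the same includes and compile flags as the program above.

/* Hypothetical wrapper (sketch only): m points to 96 bytes of packed R,G,B
   data starting with an R byte; on return *r, *g and *b hold the 32 R,
   32 G and 32 B bytes, respectively.                                     */
static inline void deinterleave_rgb32(const char *m, __m256i *r, __m256i *g, __m256i *b){
    __m256i blnd1 = _mm256_set_epi8(0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0, 0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0);
    __m256i blnd2 = _mm256_set_epi8(0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0, 0,-1,0,0,-1,0,0,-1,0,0,-1,0,0,-1,0,0);
    __m256i p0 = _mm256_set_epi8(13,10,7,4,1, 14,11,8,5,2, 15,12,9,6,3,0, 13,10,7,4,1, 14,11,8,5,2, 15,12,9,6,3,0);
    __m256i p1 = _mm256_set_epi8(14,11,8,5,2, 15,12,9,6,3,0, 13,10,7,4,1, 14,11,8,5,2, 15,12,9,6,3,0, 13,10,7,4,1);
    __m256i p2 = _mm256_set_epi8(15,12,9,6,3,0, 13,10,7,4,1, 14,11,8,5,2, 15,12,9,6,3,0, 13,10,7,4,1, 14,11,8,5,2);
    /* Load six 16-byte chunks into the lanes of three 256-bit vectors */
    __m256i t0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)&m[0]));
    __m256i t1 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)&m[16]));
    __m256i t2 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)&m[32]));
    t0 = _mm256_inserti128_si256(t0, _mm_loadu_si128((const __m128i*)&m[48]), 1);
    t1 = _mm256_inserti128_si256(t1, _mm_loadu_si128((const __m128i*)&m[64]), 1);
    t2 = _mm256_inserti128_si256(t2, _mm_loadu_si128((const __m128i*)&m[80]), 1);
    /* Group color by color, then sort within each 128-bit lane */
    __m256i u0 = _mm256_blendv_epi8(_mm256_blendv_epi8(t0,t1,blnd2), t2, blnd1);
    __m256i u1 = _mm256_blendv_epi8(_mm256_blendv_epi8(t1,t2,blnd2), t0, blnd1);
    __m256i u2 = _mm256_blendv_epi8(_mm256_blendv_epi8(t2,t0,blnd2), t1, blnd1);
    *r = _mm256_shuffle_epi8(u0, p0);
    *g = _mm256_shuffle_epi8(u1, p1);
    *b = _mm256_shuffle_epi8(u2, p2);
}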
Unfortunately, the vpblendvb instruction is often relatively slow: on Intel Skylake vpblendvb has a throughput of one per cycle, and on AMD Ryzen and Intel Haswell the throughput is only one per two cycles. Skylake-X has a fast byte blend, vpblendmb, with a throughput of three per cycle for 256-bit vectors, although on Skylake-X one might be more interested in a solution that works with 512-bit vectors instead of 256-bit ones.
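For completeness, here is a minimal sketch (my addition, not part of the original solution) of what the six byte blends could look like with vpblendmb via _mm256_mask_blend_epi8, assuming AVX-512VL and AVX-512BW are available. The __mmask32 constants are the bit-mask equivalents of the vector masks blnd1 and blnd2 above (bit i is set where byte i of the vector mask is -1):

/* Sketch only: requires AVX-512VL + AVX-512BW.
   _mm256_mask_blend_epi8(k, a, b) takes byte i from b where bit i of k is set.
   0x49244924 sets bits 2,5,8,... (the -1 bytes of blnd2);
   0x24922492 sets bits 1,4,7,... (the -1 bytes of blnd1).                    */
__m256i u0 = _mm256_mask_blend_epi8(0x24922492, _mm256_mask_blend_epi8(0x49244924, t0, t1), t2);
__m256i u1 = _mm256_mask_blend_epi8(0x24922492, _mm256_mask_blend_epi8(0x49244924, t1, t2), t0);
__m256i u2 = _mm256_mask_blend_epi8(0x24922492, _mm256_mask_blend_epi8(0x49244924, t2, t0), t1);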
An alternative is to combine vpshufb with vshufps, as suggested in @Peter Cordes' comments above. In the code below the data is loaded as 12-byte chunks. Altogether, this needs more instructions than the first solution. Nevertheless, the performance of this second solution is probably better than that of the first, depending on the surrounding code and the microarchitecture.
#include <stdio.h>
#include <x86intrin.h>
/* gcc -O3 -Wall -m64 -march=broadwell stride_3.c */
int __attribute__ ((noinline)) print_vec_char(__m256i x);
static inline __m256i _mm256_shufps_epi32(__m256i a, __m256i b, int imm){
    return _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), imm));
}

int main() {
    char *m;
    int i;
    __m256i p0 = _mm256_set_epi8(-1,-1,-1,-1, 11,8,5,2, 10,7,4,1, 9,6,3,0, -1,-1,-1,-1, 11,8,5,2, 10,7,4,1, 9,6,3,0);
    __m256i p1 = _mm256_set_epi8(11,8,5,2, 10,7,4,1, 9,6,3,0, -1,-1,-1,-1, 11,8,5,2, 10,7,4,1, 9,6,3,0, -1,-1,-1,-1);
    __m256i p2 = _mm256_set_epi8(10,7,4,1, 9,6,3,0, -1,-1,-1,-1, 11,8,5,2, 10,7,4,1, 9,6,3,0, -1,-1,-1,-1, 11,8,5,2);
    __m256i p3 = _mm256_set_epi8(9,6,3,0, -1,-1,-1,-1, 11,8,5,2, 10,7,4,1, 9,6,3,0, -1,-1,-1,-1, 11,8,5,2, 10,7,4,1);
    m = _mm_malloc(96+4,32); /* 4 extra dummy bytes to avoid reading past the end with _mm_loadu_si128((__m128i*)&m[84]); otherwise use a masked load instead of a standard load */
    for(i = 0; i < 96; i++) m[i] = i;
    // printf("m_lo ");print_vec_char(_mm256_load_si256((__m256i*)&m[0]));printf("m_mid ");print_vec_char(_mm256_load_si256((__m256i*)&m[32]));printf("m_hi ");print_vec_char(_mm256_load_si256((__m256i*)&m[64]));printf("\n");
    // m_lo 31 30 29 28 | 27 26 25 24 | 23 22 21 20 | 19 18 17 16 || 15 14 13 12 | 11 10 9 8 | 7 6 5 4 | 3 2 1 0
    // m_mid 63 62 61 60 | 59 58 57 56 | 55 54 53 52 | 51 50 49 48 || 47 46 45 44 | 43 42 41 40 | 39 38 37 36 | 35 34 33 32
    // m_hi 95 94 93 92 | 91 90 89 88 | 87 86 85 84 | 83 82 81 80 || 79 78 77 76 | 75 74 73 72 | 71 70 69 68 | 67 66 65 64
    __m256i t0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&m[0]));
    __m256i t1 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&m[12]));
    __m256i t2 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&m[24]));
    __m256i t3 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i*)&m[36]));
    t0 = _mm256_inserti128_si256(t0,_mm_loadu_si128((__m128i*)&m[48]),1);
    t1 = _mm256_inserti128_si256(t1,_mm_loadu_si128((__m128i*)&m[60]),1);
    t2 = _mm256_inserti128_si256(t2,_mm_loadu_si128((__m128i*)&m[72]),1);
    t3 = _mm256_inserti128_si256(t3,_mm_loadu_si128((__m128i*)&m[84]),1); /* Use a masked load (_mm_maskload_epi32) here if m[99] is not a valid address */
    // printf("t0 ");print_vec_char(t0);printf("t1 ");print_vec_char(t1);printf("t2 ");print_vec_char(t2);printf("t3 ");print_vec_char(t3);printf("\n");
    // t0 63 62 61 60 | 59 58 57 56 | 55 54 53 52 | 51 50 49 48 || 15 14 13 12 | 11 10 9 8 | 7 6 5 4 | 3 2 1 0
    // t1 75 74 73 72 | 71 70 69 68 | 67 66 65 64 | 63 62 61 60 || 27 26 25 24 | 23 22 21 20 | 19 18 17 16 | 15 14 13 12
    // t2 87 86 85 84 | 83 82 81 80 | 79 78 77 76 | 75 74 73 72 || 39 38 37 36 | 35 34 33 32 | 31 30 29 28 | 27 26 25 24
    // t3 0 0 0 0 | 95 94 93 92 | 91 90 89 88 | 87 86 85 84 || 51 50 49 48 | 47 46 45 44 | 43 42 41 40 | 39 38 37 36
    t0 = _mm256_shuffle_epi8(t0,p0);
    t1 = _mm256_shuffle_epi8(t1,p1);
    t2 = _mm256_shuffle_epi8(t2,p2);
    t3 = _mm256_shuffle_epi8(t3,p3);
    // printf("t0 ");print_vec_char(t0);printf("t1 ");print_vec_char(t1);printf("t2 ");print_vec_char(t2);printf("t3 ");print_vec_char(t3);printf("\n");
    // t0 0 0 0 0 | 59 56 53 50 | 58 55 52 49 | 57 54 51 48 || 0 0 0 0 | 11 8 5 2 | 10 7 4 1 | 9 6 3 0
    // t1 71 68 65 62 | 70 67 64 61 | 69 66 63 60 | 0 0 0 0 || 23 20 17 14 | 22 19 16 13 | 21 18 15 12 | 0 0 0 0
    // t2 82 79 76 73 | 81 78 75 72 | 0 0 0 0 | 83 80 77 74 || 34 31 28 25 | 33 30 27 24 | 0 0 0 0 | 35 32 29 26
    // t3 93 90 87 84 | 0 0 0 0 | 95 92 89 86 | 94 91 88 85 || 45 42 39 36 | 0 0 0 0 | 47 44 41 38 | 46 43 40 37
    __m256i u0 = _mm256_blend_epi32(t0,t1,0b10101010);
    __m256i u1 = _mm256_blend_epi32(t2,t3,0b10101010);
    __m256i u2 = _mm256_blend_epi32(t0,t1,0b01010101);
    __m256i u3 = _mm256_blend_epi32(t2,t3,0b01010101);
    // printf("u0 ");print_vec_char(u0);printf("u1 ");print_vec_char(u1);printf("u2 ");print_vec_char(u2);printf("u3 ");print_vec_char(u3);printf("\n");
    // u0 71 68 65 62 | 59 56 53 50 | 69 66 63 60 | 57 54 51 48 || 23 20 17 14 | 11 8 5 2 | 21 18 15 12 | 9 6 3 0
    // u1 93 90 87 84 | 81 78 75 72 | 95 92 89 86 | 83 80 77 74 || 45 42 39 36 | 33 30 27 24 | 47 44 41 38 | 35 32 29 26
    // u2 0 0 0 0 | 70 67 64 61 | 58 55 52 49 | 0 0 0 0 || 0 0 0 0 | 22 19 16 13 | 10 7 4 1 | 0 0 0 0
    // u3 82 79 76 73 | 0 0 0 0 | 0 0 0 0 | 94 91 88 85 || 34 31 28 25 | 0 0 0 0 | 0 0 0 0 | 46 43 40 37
    t0 = _mm256_blend_epi32(u0,u1,0b11001100);
    t1 = _mm256_shufps_epi32(u2,u3,0b00111001);
    t2 = _mm256_shufps_epi32(u0,u1,0b01001110);
    printf("t0 ");print_vec_char(t0);printf("t1 ");print_vec_char(t1);printf("t2 ");print_vec_char(t2);printf("\n");
    // t0 93 90 87 84 | 81 78 75 72 | 69 66 63 60 | 57 54 51 48 || 45 42 39 36 | 33 30 27 24 | 21 18 15 12 | 9 6 3 0
    // t1 94 91 88 85 | 82 79 76 73 | 70 67 64 61 | 58 55 52 49 || 46 43 40 37 | 34 31 28 25 | 22 19 16 13 | 10 7 4 1
    // t2 95 92 89 86 | 83 80 77 74 | 71 68 65 62 | 59 56 53 50 || 47 44 41 38 | 35 32 29 26 | 23 20 17 14 | 11 8 5 2
    _mm_free(m);
    return 0;
}

int __attribute__ ((noinline)) print_vec_char(__m256i x){
    char v[32];
    _mm256_storeu_si256((__m256i *)v,x);
    printf("%3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi || ",
           v[31],v[30],v[29],v[28],v[27],v[26],v[25],v[24],v[23],v[22],v[21],v[20],v[19],v[18],v[17],v[16]);
    printf("%3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi | %3hhi %3hhi %3hhi %3hhi \n",
           v[15],v[14],v[13],v[12],v[11],v[10],v[9],v[8],v[7],v[6],v[5],v[4],v[3],v[2],v[1],v[0]);
    return 0;
}
Instruction summary:
4 vmovdqu
4 vinserti128-load
4 vpshufb
5 vpblendd (vpblendd is much faster than vpblendvb on most CPU architectures)
2 vshufps
It is easy to adapt the ideas of these methods to the 16-bit case.
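For instance, in the 16-bit case the per-lane blend pattern repeats identically in both 128-bit lanes, so the slow vpblendvb-s of the first method can even be replaced by immediate vpblendw-s (_mm256_blend_epi16). Below is an untested sketch of mine for the first output channel only, with t0, t1 and t2 loaded exactly as in the first 8-bit solution above (now holding words 0-7|24-31, 8-15|32-39 and 16-23|40-47 of a 48-word array); the blend immediates and the shuffle pattern are derived by hand here, not taken from tested code:

/* Untested sketch: 16-bit variant of the first method, first channel only.
   Per 8-word lane: take t1 at word positions 1,4,7 (imm 0b10010010) and
   t2 at word positions 2,5 (imm 0b00100100), then sort the words in-lane
   with a byte shuffle that moves word pairs.                             */
__m256i p0_16 = _mm256_set_epi8(11,10, 5,4, 15,14, 9,8, 3,2, 13,12, 7,6, 1,0,
                                11,10, 5,4, 15,14, 9,8, 3,2, 13,12, 7,6, 1,0);
__m256i u0 = _mm256_blend_epi16(_mm256_blend_epi16(t0, t1, 0b10010010), t2, 0b00100100);
__m256i r  = _mm256_shuffle_epi8(u0, p0_16);   /* words 0,3,...,21 | 24,27,...,45 */

The other two channels follow the same pattern with rotated blend immediates and shuffle masks, analogous to p1 and p2 in the 8-bit code.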