I've come across a piece of code with the following "kernel" as its performance bottleneck. Since I have access to the latest Intel(R) Xeon Phi(TM) CPU 7210 (KNL), I would like to speed it up with AVX512 intrinsics.
for( int y = starty; y <= endy; y++)
{
// here data[][] is "unsigned char" while result[] is "int"
for( int x = startx; x <= endx; x++)
{
if( (data[y][x]&0x1) == 0 )
result[x] += data[y][x];
}
}
After analyzing the code's behavior, I found that the inner loop's length is mostly less than 16, so I wrote the following:
register int xlen = endx - startx + 1;
__m512i zero5 = _mm512_setzero_si512();
__m256i zero2 = _mm512_castsi512_si256(zero5);
__m128i zero1 = _mm512_castsi512_si128(zero5);
__m256i mask2 = _mm256_set1_epi8(0x1);
__m128i mask1 = _mm256_castsi256_si128(mask2);
register __m512i psprof0 = zero5;
for( int i = 0; i < (16-xlen)&(~0x1); i += 2 ) mask1 = _mm_srli_si128(mask1, 2);
if( (16-xlen)&(0x1) ) mask1 = _mm_srli_si128(mask1, 1);
#pragma vector nontemporal
#pragma prefetch data
for( int y = starty; y <= endy; y++ )
{
__m128i pixel16 = _mm_loadu_si128((__m128i*)&data[y][startx]);
// if ( _mm_testc_si128(pixel16, mask1) ) continue;
__m128i mask16 = _mm_andnot_si128(pixel16, mask1);
__m128i pixel16n = _mm_sign_epi8(pixel16, mask16);
psprof0 = _mm512_add_epi32(psprof0, _mm512_cvtepu8_epi32(pixel16n));
}
_mm512_storeu_si512(&result[startx], psprof0);
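Note that I assume result[] is already zero over [startx, startx+15] here; the scalar version does result[x] +=, so if that assumption does not hold, the final store would have to accumulate into the existing values instead, for example:
_mm512_storeu_si512(&result[startx],
                    _mm512_add_epi32(_mm512_loadu_si512(&result[startx]), psprof0));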
A few questions here:
- Since _mm_srli_si128 does not accept a non-immediate shift count, I have to use a loop there; is there any way to eliminate it? (One idea is sketched after this list.)
- _mm_testc_si128(pixel16, mask1) mostly does not help with performance, which of course is due to the distribution of data[][]. However, since it "computes the bitwise NOT of a and then ANDs with b, and sets CF to 1 if the result is zero, otherwise sets CF to 0", is there any way to get the result of that ANDNOT so that I do not need to compute _mm_andnot_si128 again? (One possibility is sketched as the second snippet after this list.)
- Since the inner loop length is mostly less than 16, it might not be well suited to AVX512. However, would it be worthwhile to unroll the y loop by 2, loading data[y][x] and data[y+1][x] and combining them into one __m256i? Since 8-bit to 16-bit integer conversion is not yet available on KNL (it requires AVX512BW), this might be more frustrating than the current version.
- In general, any recommendations/suggestions to enhance the performance of this small segment of code on KNL are highly appreciated :) (It is already inside an OpenMP parallel region, so further threading is probably not an option right now.)
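For the first question, one alternative I am considering (just a sketch; mask_table is a name I made up, and it assumes 1 <= xlen <= 16) is to replace the shift loop with a single unaligned load from a sliding window over a constant byte table:

static const unsigned char mask_table[32] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
// the first xlen bytes of the load are 0x1, the remaining 16-xlen bytes are 0
__m128i mask1 = _mm_loadu_si128((const __m128i*)&mask_table[16 - xlen]);

For the second question, the only way I can see to reuse the ANDNOT result is to compute it first and then test it for all-zero with _mm_testz_si128 (again just a sketch, not measured):

__m128i mask16 = _mm_andnot_si128(pixel16, mask1);
if( _mm_testz_si128(mask16, mask16) ) continue; // every pixel in range has its LSB set
__m128i pixel16n = _mm_sign_epi8(pixel16, mask16);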
The code for point 3 above (here psprof0 is a __m256i accumulator of epi16 partial sums):
static inline __m256i vec_256_combine_128(__m128i a, __m128i b)
{
// combine two __m128i into one __m256i
return _mm256_insertf128_si256(_mm256_castsi128_si256(a), b, 1);
}
static inline __m128i vec_256_add_128(__m256i a)
{
// add the lower and upper 128-bit halves of a __m256i holding epi16 elements
return _mm_add_epi16(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1));
}
__m256i mask256 = vec_256_combine_128(mask1, mask1); // the shifted 0x1 mask, duplicated into both 128-bit halves
for( int y = starty; y <= endy; y += 2 )
{
__m128i pixel16a = _mm_loadu_si128((__m128i*)&data[y][startx]);
__m128i pixel16b = _mm_loadu_si128((__m128i*)&data[y+1][startx]);
if ( y == endy )
pixel16b = zero1;
__m256i pixel16 = vec_256_combine_128(pixel16a, pixel16b);
if ( _mm256_testc_si256(pixel16, mask256) ) continue;
__m256i mask16 = _mm256_andnot_si256(pixel16, mask256);
__m256i pixel16n = _mm256_sign_epi8(pixel16, mask16);
__m256i pixel16lo = _mm256_unpacklo_epi8(pixel16n, zero2);
__m256i pixel16hi = _mm256_unpackhi_epi8(pixel16n, zero2);
psprof0 = _mm256_add_epi16(psprof0, vec_256_combine_128(vec_256_add_128(pixel16lo), vec_256_add_128(pixel16hi)));
}
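To finish this version I would widen the sixteen epi16 partial sums to epi32 and store them, roughly as below (this assumes the per-column sums never overflow 16 bits; _mm512_cvtepu16_epi32 is AVX512F, so it should be available on KNL):

_mm512_storeu_si512(&result[startx], _mm512_cvtepu16_epi32(psprof0));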