I've been trying to improve the performance of large (multi-gigabyte) bitarray operations. I'm no SIMD expert, but it appears SIMD is, in all cases, slower than scalar operations. I've tried several optimizations, including loop unrolling, to no avail. Based on the assembly, it appears it's because scalars are able to use the registers. But, if I'm doing something stupid, please let me know. Otherwise, I'm happy to keep scalars... it's much, much simpler.
/* gcc -Wall -O3 bitwise-and.c -o bitwise-and -m64 -fomit-frame-pointer -mtune=nocona -msse2 */
#ifdef ENABLE_PREFETCH
#warning "SIMD PREFETCHING ENABLED"
#else
#warning "SIMD PREFETCHING DISABLED"
#endif
#ifdef ENABLE_SIMD_UNROLLING
#warning "UNROLLING SIMD"
#else
#warning "NOT UNROLLING SIMD"
#endif
#ifdef AVOID_TEMP_VARS
#warning "AVOIDING SIMD TEMPORARY VARIABLES"
#else
#warning "USING SIMD TEMPORARY VARIABLES"
#endif
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <signal.h>
#include <setjmp.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#include <assert.h>
#define __forceinline __attribute__((always_inline))
double
microtime (void)
{
struct timeval time;
gettimeofday(&time, NULL);
return (double) time.tv_sec * 1E6 + (double) time.tv_usec;
}
__forceinline void
simd_bitwise_and (unsigned char *dst, const unsigned char *src, unsigned block_size)
{
const __m128i *wrd_ptr = (__m128i *) src;
const __m128i *wrd_end = (__m128i *) (src + block_size);
__m128i *dst_ptr = (__m128i *) dst;
_mm_empty();
do
{
__m128i xmm1;
__m128i xmm2;
#ifdef ENABLE_SIMD_UNROLLING
# ifdef ENABLE_PREFETCH
_mm_prefetch((src + 512), _MM_HINT_NTA);
# endif
xmm1 = _mm_load_si128(wrd_ptr++);
xmm2 = _mm_load_si128(dst_ptr);
xmm1 = _mm_and_si128(xmm1, xmm2);
_mm_store_si128(dst_ptr++, xmm1);
xmm1 = _mm_load_si128(wrd_ptr++);
xmm2 = _mm_load_si128(dst_ptr);
xmm1 = _mm_and_si128(xmm1, xmm2);
_mm_store_si128(dst_ptr++, xmm1);
xmm1 = _mm_load_si128(wrd_ptr++);
xmm2 = _mm_load_si128(dst_ptr);
xmm1 = _mm_and_si128(xmm1, xmm2);
_mm_store_si128(dst_ptr++, xmm1);
xmm1 = _mm_load_si128(wrd_ptr++);
xmm2 = _mm_load_si128(dst_ptr);
xmm1 = _mm_and_si128(xmm1, xmm2);
_mm_store_si128(dst_ptr++, xmm1);
#else
# ifdef AVOID_TEMP_VARS
xmm1 = _mm_and_si128(*dst_ptr, *wrd_ptr);
# else
xmm1 = _mm_load_si128(wrd_ptr);
xmm2 = _mm_load_si128(dst_ptr);
xmm1 = _mm_and_si128(xmm1, xmm2);
# endif
_mm_store_si128(dst_ptr, xmm1);
++dst_ptr;
++wrd_ptr;
#endif
} while (wrd_ptr < wrd_end);
}
__forceinline void
word_bitwise_and (unsigned char *dst, const unsigned char *src, unsigned block_size)
{
unsigned int *wrd_ptr = (unsigned int *) src;
unsigned int *wrd_end = (unsigned int *) (src + block_size);
unsigned int *dst_ptr = (unsigned int *) dst;
do
{
dst_ptr[0] &= wrd_ptr[0];
dst_ptr[1] &= wrd_ptr[1];
dst_ptr[2] &= wrd_ptr[2];
dst_ptr[3] &= wrd_ptr[3];
dst_ptr += 4;
wrd_ptr += 4;
} while (wrd_ptr < wrd_end);
}
int
main (int argc, char **argv)
{
unsigned char *dest;
unsigned char *key1;
unsigned char *key2;
size_t minlen = (1024UL * 1024UL * 512UL);
double start_time = 0.0f;
double end_time = 0.0f;
posix_memalign((void *) &key1, sizeof(__m128i), minlen);
posix_memalign((void *) &key2, sizeof(__m128i), minlen);
posix_memalign((void *) &dest, sizeof(__m128i), minlen);
key1[128] = 0xff;
key2[128] = 0x03;
// 128-bit SIMD Bitwise AND
memcpy(dest, key1, minlen);
start_time = microtime();
simd_bitwise_and(dest, key2, minlen);
end_time = microtime();
printf("Elapsed: %8.6fs\n", (end_time - start_time));
assert(0x03 == dest[128]);
// 4xWORD Bitwise AND
memcpy(dest, key1, minlen);
start_time = microtime();
word_bitwise_and(dest, key2, minlen);
end_time = microtime();
printf("Elapsed: %8.6fs\n", (end_time - start_time));
assert(0x03 == dest[128]);
free(dest);
free(key2);
free(key1);
return EXIT_SUCCESS;
}
/* vi: set et sw=2 ts=2: */