Bitwise-AND Slower with SIMD than Scalar

Question

I've been trying to improve the performance of large (multi-gigabyte) bitarray operations. I'm no SIMD expert, but it appears SIMD is, in all cases, slower than scalar operations. I've tried several optimizations, including loop unrolling, to no avail. Based on the assembly, it appears it's because scalars are able to use the registers. But, if I'm doing something stupid, please let me know. Otherwise, I'm happy to keep scalars... it's much, much simpler.

/* gcc -Wall -O3 bitwise-and.c -o bitwise-and -m64 -fomit-frame-pointer -mtune=nocona -msse2 */

#ifdef ENABLE_PREFETCH
#warning "SIMD PREFETCHING ENABLED"
#else
#warning "SIMD PREFETCHING DISABLED"
#endif

#ifdef ENABLE_SIMD_UNROLLING
#warning "UNROLLING SIMD"
#else
#warning "NOT UNROLLING SIMD"
#endif

#ifdef AVOID_TEMP_VARS
#warning "AVOIDING SIMD TEMPORARY VARIABLES"
#else
#warning "USING SIMD TEMPORARY VARIABLES"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <signal.h>
#include <setjmp.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#include <assert.h>

#define __forceinline __attribute__((always_inline))

double
microtime (void)
{
  struct timeval time;
  gettimeofday(&time, NULL);

  return (double) time.tv_sec * 1E6 + (double) time.tv_usec;
}

__forceinline void
simd_bitwise_and (unsigned char *dst, const unsigned char *src, unsigned block_size)
{
  const __m128i *wrd_ptr = (__m128i *) src;
  const __m128i *wrd_end = (__m128i *) (src + block_size);
  __m128i *dst_ptr = (__m128i *) dst;

  _mm_empty();

  do
  {
    __m128i xmm1;
    __m128i xmm2;

#ifdef ENABLE_SIMD_UNROLLING

# ifdef ENABLE_PREFETCH
    _mm_prefetch((src + 512), _MM_HINT_NTA);
# endif

    xmm1 = _mm_load_si128(wrd_ptr++);
    xmm2 = _mm_load_si128(dst_ptr);
    xmm1 = _mm_and_si128(xmm1, xmm2);
    _mm_store_si128(dst_ptr++, xmm1);

    xmm1 = _mm_load_si128(wrd_ptr++);
    xmm2 = _mm_load_si128(dst_ptr);
    xmm1 = _mm_and_si128(xmm1, xmm2);
    _mm_store_si128(dst_ptr++, xmm1);

    xmm1 = _mm_load_si128(wrd_ptr++);
    xmm2 = _mm_load_si128(dst_ptr);
    xmm1 = _mm_and_si128(xmm1, xmm2);
    _mm_store_si128(dst_ptr++, xmm1);

    xmm1 = _mm_load_si128(wrd_ptr++);
    xmm2 = _mm_load_si128(dst_ptr);
    xmm1 = _mm_and_si128(xmm1, xmm2);
    _mm_store_si128(dst_ptr++, xmm1);
#else
# ifdef AVOID_TEMP_VARS
    xmm1 = _mm_and_si128(*dst_ptr, *wrd_ptr);
# else
    xmm1 = _mm_load_si128(wrd_ptr);
    xmm2 = _mm_load_si128(dst_ptr);
    xmm1 = _mm_and_si128(xmm1, xmm2);
# endif
    _mm_store_si128(dst_ptr, xmm1);
    ++dst_ptr;
    ++wrd_ptr;
#endif
  } while (wrd_ptr < wrd_end);
}

__forceinline void
word_bitwise_and (unsigned char *dst, const unsigned char *src, unsigned block_size)
{
  unsigned int *wrd_ptr = (unsigned int *) src;
  unsigned int *wrd_end = (unsigned int *) (src + block_size);
  unsigned int *dst_ptr = (unsigned int *) dst;
  do
  {
    dst_ptr[0] &= wrd_ptr[0];
    dst_ptr[1] &= wrd_ptr[1];
    dst_ptr[2] &= wrd_ptr[2];
    dst_ptr[3] &= wrd_ptr[3];

    dst_ptr += 4;
    wrd_ptr += 4;
  } while (wrd_ptr < wrd_end);
}

int
main (int argc, char **argv)
{
  unsigned char *dest;
  unsigned char *key1;
  unsigned char *key2;
  size_t minlen = (1024UL * 1024UL * 512UL);
  double start_time = 0.0f;
  double end_time = 0.0f;

  posix_memalign((void *) &key1, sizeof(__m128i), minlen);
  posix_memalign((void *) &key2, sizeof(__m128i), minlen);
  posix_memalign((void *) &dest, sizeof(__m128i), minlen);

  key1[128] = 0xff;
  key2[128] = 0x03;

  // 128-bit SIMD Bitwise AND
  memcpy(dest, key1, minlen);
  start_time = microtime();
  simd_bitwise_and(dest, key2, minlen);
  end_time = microtime();
  printf("Elapsed: %8.6fs\n", (end_time - start_time));
  assert(0x03 == dest[128]);

  // 4xWORD Bitwise AND
  memcpy(dest, key1, minlen);
  start_time = microtime();
  word_bitwise_and(dest, key2, minlen);
  end_time = microtime();
  printf("Elapsed: %8.6fs\n", (end_time - start_time));
  assert(0x03 == dest[128]);

  free(dest);
  free(key2);
  free(key1);

  return EXIT_SUCCESS;
}

/* vi: set et sw=2 ts=2: */

If you're using a relatively current version of gcc then it's probably vectorizing the scalar routine and doing a better job of it than your explicitly coded SIMD. Check the assembler output (`gcc -S ...`) to confirm. Note also that you're probably going to be memory bandwidth limited however you implement this, assuming your data set is larger than L3. — Paul R, Sep 19 '13 at 20:48
Thanks for the suggestion, Paul. I thought that may have been the case as well but, alas, it's not vectorizing it. — Jonah H. Harris, Sep 19 '13 at 20:59
What version of gcc are you using ? Also what CPU are you using ? (AMD or Intel ? Recent or not-so-recent ?) — Paul R, Sep 19 '13 at 21:02
I've tried GCC 4.2.1 and 4.1.2 on a Core i7 and X5680, respectively. The same results were found with both. — Jonah H. Harris, Sep 19 '13 at 21:07
@JonahH.Harris now that this problem is solved, perhaps you can try a streaming store, `_mm_stream_si128` — harold, Sep 20 '13 at 09:17

Paul R · Accepted Answer · 2016-04-24T06:23:54.550

What's happening here is that you're being bitten by lazy allocation of virtual memory. If you change your code to this:

  // 128-bit SIMD Bitwise AND
  memcpy(dest, key1, minlen);
  start_time = microtime();
  simd_bitwise_and(dest, key2, minlen);
  end_time = microtime();
  printf("SIMD Elapsed  : %8.6fs\n", (end_time - start_time));
  assert(0x03 == dest[128]);

  // 4xWORD Bitwise AND
  memcpy(dest, key1, minlen);
  start_time = microtime();
  word_bitwise_and(dest, key2, minlen);
  end_time = microtime();
  printf("Scalar Elapsed: %8.6fs\n", (end_time - start_time));
  assert(0x03 == dest[128]);

  // 128-bit SIMD Bitwise AND
  memcpy(dest, key1, minlen);
  start_time = microtime();
  simd_bitwise_and(dest, key2, minlen);
  end_time = microtime();
  printf("SIMD Elapsed  : %8.6fs\n", (end_time - start_time));
  assert(0x03 == dest[128]);

  // 4xWORD Bitwise AND
  memcpy(dest, key1, minlen);
  start_time = microtime();
  word_bitwise_and(dest, key2, minlen);
  end_time = microtime();
  printf("Scalar Elapsed: %8.6fs\n", (end_time - start_time));
  assert(0x03 == dest[128]);

you should see results something like this:

$ ./bitwise-and 
SIMD Elapsed  : 630061.000000s
Scalar Elapsed: 228156.000000s
SIMD Elapsed  : 182645.000000s
Scalar Elapsed: 202697.000000s
$

Explanation: the first time you iterate through your large memory allocations you are generating page faults, as previously unused pages get wired in. This gives an artificially high time for the first benchmark, which happens to be the SIMD benchmark. On the second and subsequent benchmarks the pages are all wired in and you get a more accurate benchmark, and as expected the SIMD routine is slightly faster than the scalar routine. The difference is not as large as might be expected, as you are executing only one ALU instruction for every 2 loads + 1 store, so performance is limited by DRAM bandwidth rather than computational efficiency.

As a general rule when writing benchmarking code: always call the benchmark routine at least once prior to any actual timing measurements, so that all memory allocations are properly wired in. After that run the benchmark routine a number of times in a loop and ignore any outliers.

Excellent call, Paul! I didn't even think about that. Any suggestions for improving it? — Jonah H. Harris, Sep 19 '13 at 21:17
Yes, I've just been adding some further comments to the answer. Ideally you should "prime" all your memory allocations once, by running the benchmark routine without actually doing any timing, then run it in a loop say, 10 times, and take the minimum time. — Paul R, Sep 19 '13 at 21:18
Oh, and if you mean improving *performance*: no, once you saturate the available DRAM bandwidth you're screwed. However if you are performing more than one operation on the same data then you should combine them, e.g. don't do one pass where you just do, say, an XOR followed by another pass where you do an AND - do the XOR and the AND in the same pass, so that you reduce the memory bandwidth requirement. — Paul R, Sep 19 '13 at 21:24

Bitwise-AND Slower with SIMD than Scalar

1 Answers1