
I have the following code (normal, SSE and AVX):

int testSSE(const aligned_vector & ghs, const aligned_vector & lhs) {
    int result[4] __attribute__((aligned(16))) = {0};
    __m128i vresult = _mm_set1_epi32(0);
    __m128i v1, v2, vmax;

    for (int k = 0; k < ghs.size(); k += 4) {
        v1 = _mm_load_si128((__m128i *) & lhs[k]);
        v2 = _mm_load_si128((__m128i *) & ghs[k]);
        vmax = _mm_add_epi32(v1, v2);
        vresult = _mm_max_epi32(vresult, vmax);
    }
    _mm_store_si128((__m128i *) result, vresult);
    int mymax = result[0];
    for (int k = 1; k < 4; k++) {
        if (result[k] > mymax) {
            mymax = result[k];
        }
    }
    return mymax;
}

int testAVX(const aligned_vector & ghs, const aligned_vector & lhs) {
    int result[8] __attribute__((aligned(32))) = {0};
    __m256i vresult = _mm256_set1_epi32(0);
    __m256i v1, v2, vmax;

    for (int k = 0; k < ghs.size(); k += 8) {
        v1 = _mm256_load_si256((__m256i *) & ghs[k]);
        v2 = _mm256_load_si256((__m256i *) & lhs[k]);
        vmax = _mm256_add_epi32(v1, v2);
        vresult = _mm256_max_epi32(vresult, vmax);
    }
    _mm256_store_si256((__m256i *) result, vresult);
    int mymax = result[0];
    for (int k = 1; k < 8; k++) {
        if (result[k] > mymax) {
            mymax = result[k];
        }
    }
    return mymax;
}

int testNormal(const aligned_vector & ghs, const aligned_vector & lhs) {
    int max = 0;
    int tempMax;
    for (int k = 0; k < ghs.size(); k++) {
        tempMax = lhs[k] + ghs[k];
        if (max < tempMax) {
            max = tempMax;
        }
    }
    return max;
}

All these functions are tested with the following code:

void alignTestSSE() {
    aligned_vector lhs;
    aligned_vector ghs;

    int mySize = 4096;
    int FinalResult;
    int nofTestCases = 1000;
    double time, time1, time2, time3;
    vector<int> lhs2;
    vector<int> ghs2;

    lhs.resize(mySize);
    ghs.resize(mySize);
    lhs2.resize(mySize);
    ghs2.resize(mySize);

    srand(1);
    for (int k = 0; k < mySize; k++) {
        lhs[k] = randomNodeID(1000000);
        lhs2[k] = lhs[k];
        ghs[k] = randomNodeID(1000000);
        ghs2[k] = ghs[k];
    }
    /* Warming UP */
    for (int k = 0; k < nofTestCases; k++) {
        FinalResult = testNormal(lhs, ghs);
    }

    for (int k = 0; k < nofTestCases; k++) {
        FinalResult = testSSE(lhs, ghs);
    }

    for (int k = 0; k < nofTestCases; k++) {
        FinalResult = testAVX(lhs, ghs);
    }

    cout << "===========================" << endl;
    time = timestamp();
    for (int k = 0; k < nofTestCases; k++) {
        FinalResult = testSSE(lhs, ghs);
    }
    time = timestamp() - time;
    time1 = time;
    cout << "SSE took " << time << " s" << endl;
    cout << "SSE Result: " << FinalResult << endl;

    time = timestamp();
    for (int k = 0; k < nofTestCases; k++) {
        FinalResult = testAVX(lhs, ghs);
    }
    time = timestamp() - time;
    time3 = time;
    cout << "AVX took " << time << " s" << endl;
    cout << "AVX Result: " << FinalResult << endl;



    time = timestamp();
    for (int k = 0; k < nofTestCases; k++) {
        FinalResult = testNormal(lhs, ghs);
    }
    time = timestamp() - time;
    cout << "Normal took " << time << " s" << endl;
    cout << "Normal Result: " << FinalResult << endl;
    cout << "SpeedUP SSE= " << time / time1 << " s" << endl;
    cout << "SpeedUP AVX= " << time / time3 << " s" << endl;
    cout << "===========================" << endl;
    ghs.clear();
    lhs.clear();
}

Where

inline double timestamp() {
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return double(tp.tv_sec) + tp.tv_usec / 1000000.;
}

And

typedef vector<int, aligned_allocator<int, sizeof (int)> > aligned_vector;

defines an aligned vector using the AlignedAllocator from https://gist.github.com/donny-dont/1471329

I have an Intel i7-4771 (Haswell), Ubuntu 14.04 64-bit and gcc 4.8.2, all up to date. I compiled with -march=native -mtune=native -O3 -m64.

Results are:

SSE took 0.000375986 s
SSE Result: 1982689
AVX took 0.000459909 s
AVX Result: 1982689
Normal took 0.00315714 s
Normal Result: 1982689
SpeedUP SSE= 8.39696 s
SpeedUP AVX= 6.8647 s

This shows that essentially the same code is about 22% slower with AVX2 than with SSE. Am I doing something wrong, or is this normal behavior?

  • Mixing AVX and SSE instructions causes overhead because the chip has to zero out the upper half of the registers. I would highly suggest moving your AVX tests into a file you compile with `-mavx` and then using an intrinsic to call `vzeroall` before beginning any floating point in that file – Mgetz May 06 '14 at 14:45
  • So long as you compile with `-mavx2` and only use intrinsics (not inline assembly) then you shouldn't incur the AVX-SSE switching penalty. – Paul R May 06 '14 at 15:22
  • Yes, it is. Even removing the SSE code completely and compiling with -mavx2 does not accelerate the code. I also tried _mm256_zeroall(); before using AVX instructions. – Alexandros May 06 '14 at 15:42

3 Answers


I converted your code to more vanilla C++ (plain arrays, no vectors, etc.), cleaned it up, tested it with auto-vectorization disabled, and got reasonable results:

#include <iostream>
using namespace std;

#include <sys/time.h>
#include <cstdlib>
#include <cstdint>

#include <immintrin.h>

inline double timestamp() {
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return double(tp.tv_sec) + tp.tv_usec / 1000000.;
}

int testSSE(const int32_t * ghs, const int32_t * lhs, size_t n) {
    int result[4] __attribute__((aligned(16))) = {0};
    __m128i vresult = _mm_set1_epi32(0);
    __m128i v1, v2, vmax;

    for (int k = 0; k < n; k += 4) {
        v1 = _mm_load_si128((__m128i *) & lhs[k]);
        v2 = _mm_load_si128((__m128i *) & ghs[k]);
        vmax = _mm_add_epi32(v1, v2);
        vresult = _mm_max_epi32(vresult, vmax);
    }
    _mm_store_si128((__m128i *) result, vresult);
    int mymax = result[0];
    for (int k = 1; k < 4; k++) {
        if (result[k] > mymax) {
            mymax = result[k];
        }
    }
    return mymax;
}

int testAVX(const int32_t * ghs, const int32_t * lhs, size_t n) {
    int result[8] __attribute__((aligned(32))) = {0};
    __m256i vresult = _mm256_set1_epi32(0);
    __m256i v1, v2, vmax;

    for (int k = 0; k < n; k += 8) {
        v1 = _mm256_load_si256((__m256i *) & ghs[k]);
        v2 = _mm256_load_si256((__m256i *) & lhs[k]);
        vmax = _mm256_add_epi32(v1, v2);
        vresult = _mm256_max_epi32(vresult, vmax);
    }
    _mm256_store_si256((__m256i *) result, vresult);
    int mymax = result[0];
    for (int k = 1; k < 8; k++) {
        if (result[k] > mymax) {
            mymax = result[k];
        }
    }
    return mymax;
}

int testNormal(const int32_t * ghs, const int32_t * lhs, size_t n) {
    int max = 0;
    int tempMax;
    for (int k = 0; k < n; k++) {
        tempMax = lhs[k] + ghs[k];
        if (max < tempMax) {
            max = tempMax;
        }
    }
    return max;
}

void alignTestSSE() {

    int n = 4096;
    int normalResult, sseResult, avxResult;
    int nofTestCases = 1000;
    double time, normalTime, sseTime, avxTime;

    int lhs[n] __attribute__ ((aligned(32)));
    int ghs[n] __attribute__ ((aligned(32)));

    for (int k = 0; k < n; k++) {
        lhs[k] = arc4random();
        ghs[k] = arc4random();
    }

    /* Warming UP */
    for (int k = 0; k < nofTestCases; k++) {
        normalResult = testNormal(lhs, ghs, n);
    }

    for (int k = 0; k < nofTestCases; k++) {
        sseResult = testSSE(lhs, ghs, n);
    }

    for (int k = 0; k < nofTestCases; k++) {
        avxResult = testAVX(lhs, ghs, n);
    }

    time = timestamp();
    for (int k = 0; k < nofTestCases; k++) {
        normalResult = testNormal(lhs, ghs, n);
    }
    normalTime = timestamp() - time;

    time = timestamp();
    for (int k = 0; k < nofTestCases; k++) {
        sseResult = testSSE(lhs, ghs, n);
    }
    sseTime = timestamp() - time;

    time = timestamp();
    for (int k = 0; k < nofTestCases; k++) {
        avxResult = testAVX(lhs, ghs, n);
    }
    avxTime = timestamp() - time;

    cout << "===========================" << endl;
    cout << "Normal took " << normalTime << " s" << endl;
    cout << "Normal Result: " << normalResult << endl;
    cout << "SSE took " << sseTime << " s" << endl;
    cout << "SSE Result: " << sseResult << endl;
    cout << "AVX took " << avxTime << " s" << endl;
    cout << "AVX Result: " << avxResult << endl;
    cout << "SpeedUP SSE= " << normalTime / sseTime << endl;
    cout << "SpeedUP AVX= " << normalTime / avxTime << endl;
    cout << "===========================" << endl;

}

int main()
{
    alignTestSSE();
    return 0;
}

Test:

$ clang++ -Wall -mavx2 -O3 -fno-vectorize SO_avx.cpp && ./a.out
===========================
Normal took 0.00324106 s
Normal Result: 2143749391
SSE took 0.000527859 s
SSE Result: 2143749391
AVX took 0.000221968 s
AVX Result: 2143749391
SpeedUP SSE= 6.14002
SpeedUP AVX= 14.6015
===========================

I suggest you try the above code, with -fno-vectorize (or -fno-tree-vectorize if using g++), and see if you get similar results. If you do then you can work backwards towards your original code to see where the inconsistency might be coming from.

Paul R
  • `-fno-tree-vectorize` is a gcc-only option; to disable vectorization in clang you need `-fno-vectorize` – ismail May 06 '14 at 15:54
  • @ismail: well it works with my version of `clang++` - the above is a direct copy and paste from my terminal. I get identical behaviour with `-fno-vectorize` or `-fno-tree-vectorize`. YMMV of course. – Paul R May 06 '14 at 15:56
  • Well it probably ignores the flag but whatever, just wanted to note it :) – ismail May 06 '14 at 16:00
  • @ismail: no, like I said, I get identical behaviour with either switch - if I leave it out entirely then the scalar code gets vectorized and I get a completely different result. It's probably version-dependent, but I've taken out the `-tree` now anyway, to avoid confusion. Thanks for pointing this out. – Paul R May 06 '14 at 16:02
  • Thanks to @PaulR I got it. SSE and Normal run the same for aligned vectors vs arrays, but AVX is two times slower for aligned vectors. On my PC, AVX code with arrays is 1.5x faster than SSE. So I must probably rewrite the code to use arrays for AVX. +1 and accepted your answer. Thanks – Alexandros May 06 '14 at 16:31
  • You might be able to stick with the vectors - try creating local const pointers to the start of the vectors at the beginning of the function, so that you're not referencing the vector directly within the loop. – Paul R May 06 '14 at 16:38
  • @PaulR ...creating local const pointers to the start of the vectors. And this is done how? I am not that good with pointers (damn my Java background) – Alexandros May 06 '14 at 16:42
  • E.g. `const int32_t * const lp = &lhs[0];` to create the local pointer and then use `lp[k]` rather than `lhs[k]` within the loop (a sketch of this is shown after these comments). – Paul R May 06 '14 at 17:17
  • @PaulR, can you explain why the warm up is necessary? Why would the code run slower the first time it's called than for subsequent calls? – Z boson May 07 '14 at 08:32
  • Usually when benchmarking any code it's a good idea to run at least one iteration prior to timing so that the caches are warmed and any VM lazy allocations get wired in. You also want to be sure that all the code is paged in too. Much of this probably doesn't apply for the current code, but it's a good habit to get into. – Paul R May 07 '14 at 08:54
  • @PaulR, I agree it's good to run at least one iteration first I am just not sure why it helps (I know why it helps for OpenMP but not for this code). By warm the cache I guess you mean read the values into the cache. That makes sense. What does "VM lazy allocations" mean? – Z boson May 07 '14 at 09:13
  • Most virtual memory implementations take a "lazy" approach to allocation, so that when you allocate a large chunk of memory only the first few pages get allocated and wired immediately, and then the rest get taken care of via page faults. This is usually an overall performance win (because programs often allocate more memory than they actually need), but for benchmarking you usually want all this memory wired in before you start timing. – Paul R May 07 '14 at 09:22
  • The warmup run would also get the CPU's branch-prediction cache warmed up, since the code is so tight that all the branches probably easily fit in whatever branch history buffer the CPU has. – Peter Cordes Dec 06 '14 at 13:38
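
To make the local-pointer suggestion from the comments above concrete, here is a minimal sketch of testAVX reworked so the inner loop indexes through hoisted pointers instead of the vectors themselves (the aligned_vector type and loop body are the ones from the question; only the pointer hoisting is new):

int testAVX(const aligned_vector & ghs, const aligned_vector & lhs) {
    // Hoist the data pointers out of the loop so the loop body does not
    // have to go through the vector interface on every iteration.
    const int * const gp = &ghs[0];
    const int * const lp = &lhs[0];

    __m256i vresult = _mm256_set1_epi32(0);
    for (size_t k = 0; k < ghs.size(); k += 8) {
        __m256i v1 = _mm256_load_si256((const __m256i *) &gp[k]);
        __m256i v2 = _mm256_load_si256((const __m256i *) &lp[k]);
        vresult = _mm256_max_epi32(vresult, _mm256_add_epi32(v1, v2));
    }

    int result[8] __attribute__((aligned(32)));
    _mm256_store_si256((__m256i *) result, vresult);
    int mymax = result[0];
    for (int k = 1; k < 8; k++) {
        if (result[k] > mymax) {
            mymax = result[k];
        }
    }
    return mymax;
}

This keeps the vector-based interface of the original code while giving the compiler plain pointers to index in the hot loop.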

On my machine (core i7-4900M), based on updated code from Paul R, with g++ 4.8.2 and 100,000 iterations instead of 1000, I have the following results:

g++ -Wall -mavx2 -O3 -std=c++11 test_avx.cpp && ./a.exe 
SSE took             508,029 us
AVX took           1,308,075 us
Normal took          297,017 us


g++ -Wall -mavx2 -O3 -std=c++11 -fno-tree-vectorize test_avx.cpp && ./a.exe 
SSE took             509,029 us
AVX took           1,307,075 us
Normal took        3,436,197 us

GCC is doing an amazing job optimizing the "Normal" code. Yet the slow performance of the "AVX" code can be explained by the lines below, which require a full 256-bit store (ouch!) followed by a max search over 8 integers.

_mm256_store_si256((__m256i *) result, vresult);
int mymax = result[0];
for (int k = 1; k < 8; k++) {
  if (result[k] > mymax) {
     mymax = result[k];
  }
}
return mymax;

It is best to keep using AVX intrinsics to compute the maximum of the 8 elements. I propose the following changes:

v1      = _mm256_permute2x128_si256(vresult,vresult,1);  // from ABCD-EFGH to ????-ABCD
vresult = _mm256_max_epi32(vresult, v1);
v1      = _mm256_permute4x64_epi64(vresult,1);  // from ????-ABCD to ????-??AB
vresult = _mm256_max_epi32(vresult, v1);
v1      = _mm256_shuffle_epi32(vresult,1); // from ????-??AB to ????-???A
vresult = _mm256_max_epi32(vresult, v1);

// no _mm256_extract_epi32 => need extra step
__m128i vres128 = _mm256_extracti128_si256(vresult,0);
return _mm_extract_epi32(vres128,0);
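
For context, a sketch of how this reduction slots into testAVX in place of the store-and-scan tail (using the pointer-based signature from Paul R's answer above; the loop itself is unchanged):

int testAVX(const int32_t * ghs, const int32_t * lhs, size_t n) {
    __m256i vresult = _mm256_set1_epi32(0);
    __m256i v1, v2, vmax;

    for (size_t k = 0; k < n; k += 8) {
        v1 = _mm256_load_si256((const __m256i *) &ghs[k]);
        v2 = _mm256_load_si256((const __m256i *) &lhs[k]);
        vmax = _mm256_add_epi32(v1, v2);
        vresult = _mm256_max_epi32(vresult, vmax);
    }

    // In-register horizontal max replaces the 256-bit store and scalar scan.
    v1      = _mm256_permute2x128_si256(vresult, vresult, 1);
    vresult = _mm256_max_epi32(vresult, v1);
    v1      = _mm256_permute4x64_epi64(vresult, 1);
    vresult = _mm256_max_epi32(vresult, v1);
    v1      = _mm256_shuffle_epi32(vresult, 1);
    vresult = _mm256_max_epi32(vresult, v1);

    __m128i vres128 = _mm256_extracti128_si256(vresult, 0);
    return _mm_extract_epi32(vres128, 0);
}

With this change the only memory traffic left is the loads inside the loop.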

For a fair comparison, I have also updated the SSE code (a reformatted version is shown after the timings below); I then get:

SSE took             483,028 us
AVX took             258,015 us
Normal took          307,017 us

AVX time has decreased by a factor of 5!
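
For reference, the corresponding SSE reduction (which the author gives in the comments below), reformatted:

// In-register horizontal max for the 4-lane SSE case:
v1      = _mm_shuffle_epi32(vresult, 0xE);  // 00_00_11_10: bring the upper pair down
vresult = _mm_max_epi32(vresult, v1);
v1      = _mm_shuffle_epi32(vresult, 1);    // 00_00_00_01: bring element 1 down
vresult = _mm_max_epi32(vresult, v1);
return _mm_extract_epi32(vresult, 0);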

user3636086
  • +1 I will try your code. For arrays I also saw GCC significantly optimizing the code for Normal. – Alexandros May 14 '14 at 13:48
  • You updated the SSE code. Can you provide the corresponding code for this as well? – Alexandros May 14 '14 at 14:03
  • v1 = _mm_shuffle_epi32(vresult, 0xE); /* 00_00_11_10 */ vresult = _mm_max_epi32(vresult, v1); v1 = _mm_shuffle_epi32(vresult, 1); /* 00_00_00_01 */ vresult = _mm_max_epi32(vresult, v1); return _mm_extract_epi32(vresult, 0); – user3636086 May 14 '14 at 14:20
  • Thanks, I will start from that to see if I get any improvement. – Alexandros May 14 '14 at 14:20

Manually unrolling the loop can speed up the SSE/AVX code above (a sketch of one possible unrolling follows the timings below).

Original version on my i5-5300U:

Normal took 0.347 s
Normal Result: 2146591543
AVX took 0.409 s
AVX Result: 2146591543
SpeedUP AVX= 0.848411

After manual loop unrolling:

Normal took 0.375 s
Normal Result: 2146591543
AVX took 0.297 s
AVX Result: 2146591543
SpeedUP AVX= 1.26263
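
The answer does not include its unrolled code. As a rough illustration only, one common way to unroll the AVX loop is to keep two independent accumulators so the add/max chains of consecutive iterations can overlap. A sketch (the name testAVXUnrolled is made up here, and n is assumed to be a multiple of 16):

int testAVXUnrolled(const int32_t * ghs, const int32_t * lhs, size_t n) {
    // Two independent accumulators shorten the loop-carried dependency chain.
    __m256i acc0 = _mm256_set1_epi32(0);
    __m256i acc1 = _mm256_set1_epi32(0);

    for (size_t k = 0; k < n; k += 16) {
        __m256i s0 = _mm256_add_epi32(
            _mm256_load_si256((const __m256i *) &ghs[k]),
            _mm256_load_si256((const __m256i *) &lhs[k]));
        __m256i s1 = _mm256_add_epi32(
            _mm256_load_si256((const __m256i *) &ghs[k + 8]),
            _mm256_load_si256((const __m256i *) &lhs[k + 8]));
        acc0 = _mm256_max_epi32(acc0, s0);
        acc1 = _mm256_max_epi32(acc1, s1);
    }

    // Combine the accumulators, then reduce to a scalar as in the original code.
    __m256i vresult = _mm256_max_epi32(acc0, acc1);
    int result[8] __attribute__((aligned(32)));
    _mm256_store_si256((__m256i *) result, vresult);
    int mymax = result[0];
    for (int k = 1; k < 8; k++) {
        if (result[k] > mymax) {
            mymax = result[k];
        }
    }
    return mymax;
}

With a single accumulator each iteration's max depends on the previous one, so the loop is bound by that latency; a second accumulator roughly halves the dependency chain at the cost of one extra register.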