Different assembly when rangifying a simple algorithm

Question

When I was preparing supplementary info for this question, I noticed that “rangified” implementations of a very simple algorithm resulted in important differences (to my eyes) in the resulting assembly, compared with “legacy” implementations.

I expanded the tests a bit, with the following results (GCC 9.1 -O3):

Case 1. Simple for loop (https://godbolt.org/z/rAVaT2)

 #include <vector>

 // Case 1: adds v to u element-wise, in place, with a plain index loop.
 // The loop is bounded only by u.size(); v is indexed with the same i and is
 // not checked against its own size, so v must hold at least u.size() elements.
 void foo(std::vector<double> &u, std::vector<double> const &v)
 {
   for (std::size_t i = 0u; i < u.size(); ++i)
     u[i] += v[i];
 }

        mov rdx, QWORD PTR [rdi]
        mov rdi, QWORD PTR [rdi+8]
        sub rdi, rdx
        sar rdi, 3
        je .L1
        mov rcx, QWORD PTR [rsi]
        lea rax, [rcx+15]
        sub rax, rdx
        cmp rax, 30
        jbe .L7
        lea rax, [rdi-1]
        cmp rax, 1
        jbe .L7
        mov rsi, rdi
        xor eax, eax
        shr rsi
        sal rsi, 4
        .L4:
        movupd xmm0, XMMWORD PTR [rcx+rax]
        movupd xmm1, XMMWORD PTR [rdx+rax]
        addpd xmm0, xmm1
        movups XMMWORD PTR [rdx+rax], xmm0
        add rax, 16
        cmp rsi, rax
        jne .L4
        mov rsi, rdi
        and rsi, -2
        and edi, 1
        je .L1
        lea rax, [rdx+rsi*8]
        movsd xmm0, QWORD PTR [rax]
        addsd xmm0, QWORD PTR [rcx+rsi*8]
        movsd QWORD PTR [rax], xmm0
        ret
        .L7:
        xor eax, eax
        .L3:
        movsd xmm0, QWORD PTR [rdx+rax*8]
        addsd xmm0, QWORD PTR [rcx+rax*8]
        movsd QWORD PTR [rdx+rax*8], xmm0
        add rax, 1
        cmp rdi, rax
        jne .L3
        .L1:
        ret

Case 2. std::transform (https://godbolt.org/z/2iZaqo)

#include <algorithm>
#include <vector>

// Case 2: adds v to u element-wise, in place, via the binary std::transform
// overload that takes one bounded input range plus a start iterator for the
// second input. Only [begin(u), end(u)) is bounded; v is read starting at
// begin(v) with no end iterator, so v must hold at least u.size() elements.
void foo(std::vector<double> &u, std::vector<double> const &v)
{
std::transform(std::begin(u), std::end(u),  // first input: all of u
               std::begin(v),               // second input: start of v only
               std::begin(u),               // output: written back into u
               std::plus());                // binary op applied to each (u, v) pair
}

        mov rdx, QWORD PTR [rdi]
        mov rax, QWORD PTR [rdi+8]
        mov rsi, QWORD PTR [rsi]
        cmp rax, rdx
        je .L1
        sub rax, 8
        lea rcx, [rsi+15]
        sub rax, rdx
        sub rcx, rdx
        shr rax, 3
        cmp rcx, 30
        jbe .L7
        movabs rcx, 2305843009213693950
        test rax, rcx
        je .L7
        lea rcx, [rax+1]
        xor eax, eax
        mov rdi, rcx
        shr rdi
        sal rdi, 4
        .L4:
        movupd xmm0, XMMWORD PTR [rdx+rax]
        movupd xmm1, XMMWORD PTR [rsi+rax]
        addpd xmm0, xmm1
        movups XMMWORD PTR [rdx+rax], xmm0
        add rax, 16
        cmp rax, rdi
        jne .L4
        mov rdi, rcx
        and rdi, -2
        lea rax, [0+rdi*8]
        add rdx, rax
        add rsi, rax
        cmp rcx, rdi
        je .L1
        movsd xmm0, QWORD PTR [rdx]
        addsd xmm0, QWORD PTR [rsi]
        movsd QWORD PTR [rdx], xmm0
        ret
        .L7:
        xor ecx, ecx
        .L3:
        movsd xmm0, QWORD PTR [rdx+rcx*8]
        addsd xmm0, QWORD PTR [rsi+rcx*8]
        mov rdi, rcx
        movsd QWORD PTR [rdx+rcx*8], xmm0
        add rcx, 1
        cmp rax, rdi
        jne .L3
        .L1:
        ret

Case 3. Range-v3 view::zip (https://godbolt.org/z/0BEkfT)

#define RANGES_ASSERT(...) ((void)0)

#include <algorithm>
#include <range/v3/view/zip.hpp>
#include <vector>

// Case 3: adds v to u element-wise, in place, by walking a range-v3 zip view
// of the two vectors with std::for_each.
void foo(std::vector<double> &u, std::vector<double> const &v)
{
// View pairing up elements of u and v; presumably it ends at the shorter of
// the two inputs (the listing below tests both end pointers) -- confirm
// against the range-v3 documentation.
auto w = ranges::view::zip(u, v);

// x is one zipped element: component 0 comes from u (updated in place),
// component 1 comes from v (read only).
std::for_each(std::begin(w), std::end(w),
              [](auto &&x) { std::get<0u>(x) += std::get<1u>(x); });
}

        mov rdx, QWORD PTR [rsi]
        mov rsi, QWORD PTR [rsi+8]
        mov rax, QWORD PTR [rdi]
        mov rcx, QWORD PTR [rdi+8]
        cmp rdx, rsi
        je .L1
        cmp rax, rcx
        je .L1
        .L3:
        movsd xmm0, QWORD PTR [rax]
        addsd xmm0, QWORD PTR [rdx]
        add rax, 8
        add rdx, 8
        movsd QWORD PTR [rax-8], xmm0
        cmp rax, rcx
        je .L1
        cmp rdx, rsi
        jne .L3
        .L1:
        ret

Case 4. cmcstl2 ranges::transform (https://godbolt.org/z/MjYO1G)

#include <experimental/ranges/algorithm>
#include <vector>

namespace std
{
namespace ranges = experimental::ranges;
}

// Case 4: adds v to u element-wise, in place, via the cmcstl2
// ranges::transform overload that takes BOTH input ranges as
// iterator/sentinel pairs. Unlike the std::transform call in Case 2, the end
// of v is passed as well as the end of u; the result is written back
// starting at begin(u).
// NOTE(review): with two bounded inputs the algorithm presumably stops at the
// shorter one (the listing below tests both end pointers) -- confirm against
// the cmcstl2 / C++20 ranges::transform specification.
void foo(std::vector<double> &u, std::vector<double> const &v)
{
std::ranges::transform(std::ranges::begin(u), std::ranges::end(u),  // first input: all of u
                       std::ranges::begin(v), std::ranges::end(v),  // second input: all of v
                       std::ranges::begin(u),                       // output: written back into u
                       std::plus());                                // binary op applied to each pair
}

        mov r8, QWORD PTR [rsi+8]
        mov rdx, QWORD PTR [rsi]
        mov rax, QWORD PTR [rdi]
        mov rcx, QWORD PTR [rdi+8]
        cmp rdx, r8
        je .L1
        cmp rcx, rax
        jne .L3
        jmp .L1
        .L16:
        cmp rdx, r8
        je .L1
        .L3:
        movsd xmm0, QWORD PTR [rax]
        addsd xmm0, QWORD PTR [rdx]
        add rax, 8
        add rdx, 8
        movsd QWORD PTR [rax-8], xmm0
        cmp rax, rcx
        jne .L16
        .L1:
        ret

I can’t read assembly, but I seem to understand that the assemblies of Case 1 and Case 2 are almost equivalent and involve packed sums, whilst the assembly of the ranges versions (Cases 3 and 4) is much terser, but not vectorized.

I would really love to understand what those differences mean. Does my interpretation of the assembly make any sense? What are the additional instructions in the non-ranges versions? Why are there those differences?

The additional instructions handle the head/tail elements of the arrays (vectors), which are either not sufficiently aligned or too few in number to be processed by SIMD instructions. — Daniel Langr, May 16 '19 at 06:41
@DanielLangr Therefore the differences are due to the rangified versions not being vectorized, right? It seems strange to me that the iterator version of `ranges::transform` didn't get vectorized because I expected it to perform the concept checking and then fall back to the legacy algorithm. — metalfox, May 16 '19 at 06:51
Yes, the ranged versions are not vectorized. You can easily distinguish both cases by observing the increments of `rax`, which is 16 in case of vectorization (size of `xmm` register) and 8 in non-vectorized versions (size of `double`). — Daniel Langr, May 16 '19 at 06:59
I tried to use `-fopt-info-vec-missed` for GCC, and for Case 4, I got: `...cmcstl2/include/stl2/detail/algorithm/transform.hpp:61:33: missed: not vectorized: number of iterations cannot be computed`. You can see the source here: https://github.com/CaseyCarter/cmcstl2/blob/master/include/stl2/detail/algorithm/transform.hpp#L61. For some reason, it's not the case of `std::transform`, where this problem doesn't occur. — Daniel Langr, May 16 '19 at 07:23
@DanielLangr Thanks! Maybe what makes the difference between `std::transform` and `ranges::transform` is that the latter takes into account the length of both vectors, while the former doesn't. — metalfox, May 16 '19 at 07:33
@DanielLangr. It seems that considering the size of both vectors prevents GCC from vectorizing the loop: https://godbolt.org/z/tdO5z6 — metalfox, May 16 '19 at 07:46

Different assembly when rangifying a simple algorithm

0 Answers