I have a simple setup where I noticed that the MSVC compiler does not seem smart enough to unroll a loop, while other compilers such as clang and gcc do. Am I missing some optimization flag for MSVC?
#include <cstddef>
// Fixed-size aggregate of four doubles that can be traversed with a
// range-based for loop through its begin()/end() member functions.
struct A
{
    double data[4];

    // Mutable access: pointer to the first element and one past the last.
    double *begin() { return &data[0]; }
    double *end() { return begin() + 4; }

    // Read-only access for const objects; same positions as above.
    double const *begin() const { return &data[0]; }
    double const *end() const { return begin() + 4; }
};
// Adds up the four elements of a.data by index, in order from element 0
// to element 3, starting from an accumulator of 0.
double sum_index(A const &a) {
    double total = 0;
    std::size_t idx = 0;
    while (idx < 4) {
        total += a.data[idx];
        ++idx;
    }
    return total;
}
// Adds up every element in the range [a.begin(), a.end()), front to back,
// starting from an accumulator of 0. This is the range-based for loop over
// `a` written out as an explicit pointer loop.
double sum_iter(A const &a) {
    double total = 0;
    double const *const last = a.end();
    for (double const *cur = a.begin(); cur != last; ++cur) {
        total += *cur;
    }
    return total;
}
I used the Compiler Explorer (https://godbolt.org/) to generate the assembly code.
gcc 11.2 with -O3:
sum_index(A const&):
pxor xmm0, xmm0
addsd xmm0, QWORD PTR [rdi]
addsd xmm0, QWORD PTR [rdi+8]
addsd xmm0, QWORD PTR [rdi+16]
addsd xmm0, QWORD PTR [rdi+24]
ret
sum_iter(A const&):
movsd xmm1, QWORD PTR [rdi]
addsd xmm1, QWORD PTR .LC0[rip]
movsd xmm0, QWORD PTR [rdi+8]
addsd xmm1, xmm0
movupd xmm0, XMMWORD PTR [rdi+16]
addsd xmm1, xmm0
unpckhpd xmm0, xmm0
addsd xmm0, xmm1
ret
.LC0:
.long 0
.long 0
clang 13.0.1 with -O3:
sum_index(A const&): # @sum_index(A const&)
xorpd xmm0, xmm0
addsd xmm0, qword ptr [rdi]
addsd xmm0, qword ptr [rdi + 8]
addsd xmm0, qword ptr [rdi + 16]
addsd xmm0, qword ptr [rdi + 24]
ret
sum_iter(A const&): # @sum_iter(A const&)
xorpd xmm0, xmm0
addsd xmm0, qword ptr [rdi]
addsd xmm0, qword ptr [rdi + 8]
addsd xmm0, qword ptr [rdi + 16]
addsd xmm0, qword ptr [rdi + 24]
ret
MSVC 19.30 with /O2 (is there no /O3?):
this$ = 8
double const * A::begin(void)const PROC ; A::begin, COMDAT
mov rax, rcx
ret 0
double const * A::begin(void)const ENDP ; A::begin
this$ = 8
double const * A::end(void)const PROC ; A::end, COMDAT
lea rax, QWORD PTR [rcx+32]
ret 0
double const * A::end(void)const ENDP ; A::end
a$ = 8
double sum_index(A const &) PROC ; sum_index, COMDAT
movsd xmm0, QWORD PTR [rcx]
xorps xmm1, xmm1
addsd xmm0, xmm1
addsd xmm0, QWORD PTR [rcx+8]
addsd xmm0, QWORD PTR [rcx+16]
addsd xmm0, QWORD PTR [rcx+24]
ret 0
double sum_index(A const &) ENDP ; sum_index
a$ = 8
double sum_iter(A const &) PROC ; sum_iter, COMDAT
lea rax, QWORD PTR [rcx+32]
xorps xmm0, xmm0
cmp rcx, rax
je SHORT $LN12@sum_iter
npad 4
$LL8@sum_iter:
addsd xmm0, QWORD PTR [rcx]
add rcx, 8
cmp rcx, rax
jne SHORT $LL8@sum_iter
$LN12@sum_iter:
ret 0
double sum_iter(A const &) ENDP ; sum_iter
Obviously there is a problem with unrolling the loop in MSVC. Is there some additional optimization flag I have to set?
Thanks for your help!