1

I have a very simple function:

__attribute__((noinline))
void benchmark(cfloat* __restrict__ aa, cfloat* __restrict__ bb, cfloat* __restrict__ cc, cfloat* __restrict__ dd, cfloat uu, cfloat vv, size_t nn) {
    for (ssize_t ii=0; ii < nn; ii++) {
        dd[ii] = (
            aa[ii]*uu +
            bb[ii]*vv +
            cc[ii]
        );
    }
}

That generates very different assembly with g++ 4.4.7 depending on how I define my cfloat object.

For the first version, if I define my cfloat thusly:

struct cfloat {
    cfloat(float re, float im) : re(re), im(im) {}
    float re,im;
};

cfloat operator +(cfloat a, cfloat b) {
    return cfloat(a.re+b.re, a.im+b.im);
}

cfloat operator *(cfloat a, cfloat b) {
    return cfloat(a.re*b.re-a.im*b.im, a.re*b.im+a.im*b.re);
}

generates this assembly for the benchmark function (compiled with `g++ testcx.cc -O3 -o testcx`):

   0x00000000004006a0 <+0>: push   %r15
   0x00000000004006a2 <+2>: test   %r8,%r8
   0x00000000004006a5 <+5>: push   %r14
   0x00000000004006a7 <+7>: push   %r13
   0x00000000004006a9 <+9>: push   %r12
   0x00000000004006ab <+11>:    push   %rbp
   0x00000000004006ac <+12>:    push   %rbx
   0x00000000004006ad <+13>:    movq   %xmm0,-0x28(%rsp)
   0x00000000004006b3 <+19>:    mov    %rdi,-0x38(%rsp)
   0x00000000004006b8 <+24>:    mov    -0x28(%rsp),%rax
   0x00000000004006bd <+29>:    movq   %xmm1,-0x28(%rsp)
   0x00000000004006c3 <+35>:    mov    -0x28(%rsp),%r9
   0x00000000004006c8 <+40>:    je     0x4008a0 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+512>
   0x00000000004006ce <+46>:    mov    %r9,%r15
   0x00000000004006d1 <+49>:    mov    %rax,%r14
   0x00000000004006d4 <+52>:    xor    %r11d,%r11d
   0x00000000004006d7 <+55>:    shr    $0x20,%r15
   0x00000000004006db <+59>:    shr    $0x20,%r14
   0x00000000004006df <+63>:    xor    %r10d,%r10d
   0x00000000004006e2 <+66>:    mov    %r15d,-0x2c(%rsp)
   0x00000000004006e7 <+71>:    xor    %ebp,%ebp
   0x00000000004006e9 <+73>:    xor    %ebx,%ebx
   0x00000000004006eb <+75>:    movss  -0x2c(%rsp),%xmm6
   0x00000000004006f1 <+81>:    mov    %r9d,-0x2c(%rsp)
   0x00000000004006f6 <+86>:    movss  -0x2c(%rsp),%xmm5
   0x00000000004006fc <+92>:    mov    %r14d,-0x2c(%rsp)
   0x0000000000400701 <+97>:    movss  -0x2c(%rsp),%xmm4
   0x0000000000400707 <+103>:   mov    %eax,-0x2c(%rsp)
   0x000000000040070b <+107>:   xor    %r13d,%r13d
   0x000000000040070e <+110>:   xor    %r12d,%r12d
   0x0000000000400711 <+113>:   movabs $0xffffffff00000000,%r9
   0x000000000040071b <+123>:   movss  -0x2c(%rsp),%xmm3
   0x0000000000400721 <+129>:   nopl   0x0(%rax)
   0x0000000000400728 <+136>:   lea    0x0(,%r13,8),%rax
   0x0000000000400730 <+144>:   movaps %xmm6,%xmm1
   0x0000000000400733 <+147>:   movaps %xmm5,%xmm7
   0x0000000000400736 <+150>:   and    $0xffffffff,%ebp
   0x0000000000400739 <+153>:   lea    (%rsi,%rax,1),%r15
   0x000000000040073d <+157>:   lea    (%rdx,%rax,1),%r14
   0x0000000000400741 <+161>:   add    -0x38(%rsp),%rax
   0x0000000000400746 <+166>:   and    $0xffffffff,%ebx
   0x0000000000400749 <+169>:   add    $0x1,%r12
   0x000000000040074d <+173>:   movss  (%r15),%xmm0
   0x0000000000400752 <+178>:   movss  0x4(%r15),%xmm2
   0x0000000000400758 <+184>:   mulss  %xmm0,%xmm1
   0x000000000040075c <+188>:   mulss  %xmm2,%xmm7
   0x0000000000400760 <+192>:   mulss  %xmm5,%xmm0
   0x0000000000400764 <+196>:   mulss  %xmm6,%xmm2
   0x0000000000400768 <+200>:   addss  %xmm7,%xmm1
   0x000000000040076c <+204>:   movaps %xmm3,%xmm7
   0x000000000040076f <+207>:   subss  %xmm2,%xmm0
   0x0000000000400773 <+211>:   movd   %xmm1,-0x30(%rsp)
   0x0000000000400779 <+217>:   mov    -0x30(%rsp),%edi
   0x000000000040077d <+221>:   movaps %xmm4,%xmm1
   0x0000000000400780 <+224>:   movd   %xmm0,-0x30(%rsp)
   0x0000000000400786 <+230>:   mov    %edi,%r15d
   0x0000000000400789 <+233>:   mov    -0x30(%rsp),%edi
   0x000000000040078d <+237>:   movss  (%rax),%xmm0
   0x0000000000400791 <+241>:   shl    $0x20,%r15
   0x0000000000400795 <+245>:   movss  0x4(%rax),%xmm2
   0x000000000040079a <+250>:   mulss  %xmm0,%xmm1
   0x000000000040079e <+254>:   or     %r15,%rbp
   0x00000000004007a1 <+257>:   mulss  %xmm2,%xmm7
   0x00000000004007a5 <+261>:   mov    %edi,%r15d
   0x00000000004007a8 <+264>:   and    %r9,%rbp
   0x00000000004007ab <+267>:   mulss  %xmm3,%xmm0
   0x00000000004007af <+271>:   or     %r15,%rbp
   0x00000000004007b2 <+274>:   mulss  %xmm4,%xmm2
   0x00000000004007b6 <+278>:   addss  %xmm7,%xmm1
   0x00000000004007ba <+282>:   subss  %xmm2,%xmm0
   0x00000000004007be <+286>:   movd   %xmm1,-0x30(%rsp)
   0x00000000004007c4 <+292>:   mov    -0x30(%rsp),%edi
   0x00000000004007c8 <+296>:   movd   %xmm0,-0x30(%rsp)
   0x00000000004007ce <+302>:   mov    %edi,%eax
   0x00000000004007d0 <+304>:   mov    -0x30(%rsp),%edi
   0x00000000004007d4 <+308>:   shl    $0x20,%rax
   0x00000000004007d8 <+312>:   or     %rax,%rbx
   0x00000000004007db <+315>:   and    %r9,%rbx
   0x00000000004007de <+318>:   mov    %edi,%eax
   0x00000000004007e0 <+320>:   or     %rax,%rbx
   0x00000000004007e3 <+323>:   mov    %r10,%rax
   0x00000000004007e6 <+326>:   mov    %rbx,%rdi
   0x00000000004007e9 <+329>:   and    $0xffffffff,%eax
   0x00000000004007ec <+332>:   shr    $0x20,%rdi
   0x00000000004007f0 <+336>:   mov    %edi,-0x20(%rsp)
   0x00000000004007f4 <+340>:   mov    %rbp,%rdi
   0x00000000004007f7 <+343>:   shr    $0x20,%rdi
   0x00000000004007fb <+347>:   movss  -0x20(%rsp),%xmm0
   0x0000000000400801 <+353>:   mov    %edi,-0x10(%rsp)
   0x0000000000400805 <+357>:   addss  -0x10(%rsp),%xmm0
   0x000000000040080b <+363>:   mov    %ebp,-0x10(%rsp)
   0x000000000040080f <+367>:   movss  %xmm0,-0x20(%rsp)
   0x0000000000400815 <+373>:   mov    -0x20(%rsp),%r10d
   0x000000000040081a <+378>:   mov    %ebx,-0x20(%rsp)
   0x000000000040081e <+382>:   movss  -0x20(%rsp),%xmm0
   0x0000000000400824 <+388>:   addss  -0x10(%rsp),%xmm0
   0x000000000040082a <+394>:   shl    $0x20,%r10
   0x000000000040082e <+398>:   or     %rax,%r10
   0x0000000000400831 <+401>:   and    %r9,%r10
   0x0000000000400834 <+404>:   movss  %xmm0,-0x20(%rsp)
   0x000000000040083a <+410>:   mov    -0x20(%rsp),%eax
   0x000000000040083e <+414>:   or     %rax,%r10
   0x0000000000400841 <+417>:   mov    %r11,%rax
   0x0000000000400844 <+420>:   mov    %r10,%rdi
   0x0000000000400847 <+423>:   and    $0xffffffff,%eax
   0x000000000040084a <+426>:   shr    $0x20,%rdi
   0x000000000040084e <+430>:   mov    %edi,-0x20(%rsp)
   0x0000000000400852 <+434>:   movss  -0x20(%rsp),%xmm0
   0x0000000000400858 <+440>:   addss  0x4(%r14),%xmm0
   0x000000000040085e <+446>:   movss  %xmm0,-0x20(%rsp)
   0x0000000000400864 <+452>:   mov    -0x20(%rsp),%r11d
   0x0000000000400869 <+457>:   mov    %r10d,-0x20(%rsp)
   0x000000000040086e <+462>:   movss  -0x20(%rsp),%xmm0
   0x0000000000400874 <+468>:   addss  (%r14),%xmm0
   0x0000000000400879 <+473>:   shl    $0x20,%r11
   0x000000000040087d <+477>:   or     %rax,%r11
   0x0000000000400880 <+480>:   and    %r9,%r11
   0x0000000000400883 <+483>:   movss  %xmm0,-0x20(%rsp)
   0x0000000000400889 <+489>:   mov    -0x20(%rsp),%eax
   0x000000000040088d <+493>:   or     %rax,%r11
   0x0000000000400890 <+496>:   cmp    %r8,%r12
   0x0000000000400893 <+499>:   mov    %r11,(%rcx,%r13,8)
   0x0000000000400897 <+503>:   mov    %r12,%r13
   0x000000000040089a <+506>:   jne    0x400728 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+136>
   0x00000000004008a0 <+512>:   pop    %rbx
   0x00000000004008a1 <+513>:   pop    %rbp
   0x00000000004008a2 <+514>:   pop    %r12
   0x00000000004008a4 <+516>:   pop    %r13
   0x00000000004008a6 <+518>:   pop    %r14
   0x00000000004008a8 <+520>:   pop    %r15
   0x00000000004008aa <+522>:   retq 

Which is about 133 instructions.

If I define the cfloat like this, with an array as the state:

struct cfloat {
    cfloat(float re, float im) { ri[0] = re; ri[1] = im; }
    float ri[2];
};

cfloat operator +(cfloat a, cfloat b) {
    return cfloat(a.ri[0]+b.ri[0], a.ri[1]+b.ri[1]);
}

cfloat operator *(cfloat a, cfloat b) {
    return cfloat(a.ri[0]*b.ri[0]-a.ri[1]*b.ri[1], a.ri[0]*b.ri[1]+a.ri[1]*b.ri[0]);
}

It generates this assembly:

Dump of assembler code for function _Z9benchmarkP6cfloatS0_S0_S0_S_S_m:
   0x00000000004006a0 <+0>: push   %rbx
   0x00000000004006a1 <+1>: movq   %xmm0,-0x8(%rsp)
   0x00000000004006a7 <+7>: mov    -0x8(%rsp),%r9
   0x00000000004006ac <+12>:    movq   %xmm1,-0x8(%rsp)
   0x00000000004006b2 <+18>:    mov    -0x8(%rsp),%rax
   0x00000000004006b7 <+23>:    mov    %r9d,-0xc(%rsp)
   0x00000000004006bc <+28>:    shr    $0x20,%r9
   0x00000000004006c0 <+32>:    movss  -0xc(%rsp),%xmm9
   0x00000000004006c7 <+39>:    mov    %r9d,-0xc(%rsp)
   0x00000000004006cc <+44>:    movss  -0xc(%rsp),%xmm8
   0x00000000004006d3 <+51>:    mov    %eax,-0xc(%rsp)
   0x00000000004006d7 <+55>:    shr    $0x20,%rax
   0x00000000004006db <+59>:    movss  -0xc(%rsp),%xmm7
   0x00000000004006e1 <+65>:    test   %r8,%r8
   0x00000000004006e4 <+68>:    mov    %eax,-0xc(%rsp)
   0x00000000004006e8 <+72>:    movss  -0xc(%rsp),%xmm6
   0x00000000004006ee <+78>:    je     0x400796 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+246>
   0x00000000004006f4 <+84>:    xor    %eax,%eax
   0x00000000004006f6 <+86>:    xor    %r9d,%r9d
   0x00000000004006f9 <+89>:    nopl   0x0(%rax)
   0x0000000000400700 <+96>:    shl    $0x3,%rax
   0x0000000000400704 <+100>:   movaps %xmm7,%xmm0
   0x0000000000400707 <+103>:   lea    (%rsi,%rax,1),%rbx
   0x000000000040070b <+107>:   movaps %xmm6,%xmm3
   0x000000000040070e <+110>:   lea    (%rcx,%rax,1),%r10
   0x0000000000400712 <+114>:   lea    (%rdx,%rax,1),%r11
   0x0000000000400716 <+118>:   lea    (%rdi,%rax,1),%rax
   0x000000000040071a <+122>:   movss  (%rbx),%xmm1
   0x000000000040071e <+126>:   add    $0x1,%r9
   0x0000000000400722 <+130>:   movss  0x4(%rbx),%xmm5
   0x0000000000400727 <+135>:   mulss  %xmm1,%xmm0
   0x000000000040072b <+139>:   mulss  %xmm5,%xmm3
   0x000000000040072f <+143>:   movss  (%rax),%xmm2
   0x0000000000400733 <+147>:   movaps %xmm8,%xmm10
   0x0000000000400737 <+151>:   mulss  %xmm6,%xmm1
   0x000000000040073b <+155>:   movss  0x4(%rax),%xmm4
   0x0000000000400740 <+160>:   mulss  %xmm7,%xmm5
   0x0000000000400744 <+164>:   mulss  %xmm4,%xmm10
   0x0000000000400749 <+169>:   cmp    %r8,%r9
   0x000000000040074c <+172>:   mov    %r9,%rax
   0x000000000040074f <+175>:   subss  %xmm3,%xmm0
   0x0000000000400753 <+179>:   movaps %xmm2,%xmm3
   0x0000000000400756 <+182>:   mulss  %xmm9,%xmm4
   0x000000000040075b <+187>:   mulss  %xmm9,%xmm3
   0x0000000000400760 <+192>:   addss  %xmm5,%xmm1
   0x0000000000400764 <+196>:   mulss  %xmm8,%xmm2
   0x0000000000400769 <+201>:   subss  %xmm10,%xmm3
   0x000000000040076e <+206>:   addss  %xmm4,%xmm2
   0x0000000000400772 <+210>:   addss  %xmm3,%xmm0
   0x0000000000400776 <+214>:   addss  %xmm2,%xmm1
   0x000000000040077a <+218>:   addss  (%r11),%xmm0
   0x000000000040077f <+223>:   addss  0x4(%r11),%xmm1
   0x0000000000400785 <+229>:   movss  %xmm0,(%r10)
   0x000000000040078a <+234>:   movss  %xmm1,0x4(%r10)
   0x0000000000400790 <+240>:   jne    0x400700 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+96>
   0x0000000000400796 <+246>:   pop    %rbx
   0x0000000000400797 <+247>:   retq   
End of assembler dump.

Which is about 59 instructions. And, my benchmarks show, the first version is about 3x slower than the second.

I would prefer the separate real/imaginary fields, not least because having them as an array seems to break the vectorizer in Intel's compiler for some reason.

Is there any way I can convince gcc that these two classes are equivalent?

NathanOliver
  • 171,901
  • 28
  • 288
  • 402
gct
  • 14,100
  • 15
  • 68
  • 107
  • what hardware are you compiling for? I would expect fused-multiply-add operations to obtain best performance here. – Walter Jul 21 '17 at 14:44
  • Generic x86-64. It turns out of I turn that "ssize_t" into "size_t" in my loop, g++ will actually vectorize the first example, but not the second. – gct Jul 21 '17 at 14:45
  • Is there a reason not to use `std::complex`? – Daniel H Jul 21 '17 at 14:48
  • This looks like a duplicate of a deleted question: https://stackoverflow.com/questions/45134703/why-does-g4-4-7-generate-much-poorer-code-for-user-built-complex-number-than-b – Florian Weimer Jul 21 '17 at 14:49
  • page not found.... is there a way to still see it? – monster Jul 21 '17 at 14:51
  • gcc 4.4 is old. I've checked your code with gcc 6.3, and it generates the exact same assembly for both your versions. – geza Jul 21 '17 at 14:55
  • @monster, if I recall correctly, it was basically identical to this question. – Florian Weimer Jul 21 '17 at 14:56
  • @DanielH std::complex has some issues, not least of which is it inherited the poor default complex arithmetic semantics from c99. – gct Jul 21 '17 at 15:00
  • @Florian that was mine, I cleaned everything up to be simpler to understand and resubmitted. – gct Jul 21 '17 at 15:00
  • @geza agreed, but it's also the default on RHEL6, which I have to support. – gct Jul 21 '17 at 15:01
  • @SeanMcAllister, are you targeting anything else besides Red Hat Enterprise Linux 6? – Florian Weimer Jul 21 '17 at 15:03
  • @FlorianWeimer RHEL7 (and ideally future releases), and the latest Intel compiler – gct Jul 21 '17 at 15:04
  • Out of curiosity, have you tried using `std::complex` data type? The compiler may have some optimized code for using complex numbers. – Thomas Matthews Jul 21 '17 at 15:04
  • @Thomas extensively, std::complex uses c99 __complex type under the hood, which has issues for what I'm doing. Wrote an article on it: https://medium.com/@smcallis_71148/complex-arithmetic-is-complicated-873ec0c69fc5 – gct Jul 21 '17 at 15:11
  • If I put re, im into a union with a float[2], it seems that gcc generates better code. **Seems**, I haven't checked it too much. – geza Jul 21 '17 at 15:15
  • @SeanMcAllister Correct handling of infinities is not poor default semantics. Besides, based on that article, the built-in type is always at least as fast if you turn off the correctness check, and it’s sometimes faster. Why do you not want to use it? EDIT: Unless this is a header-only library and you can’t even provide a Makefile. But then how are you sure that the people *using* the library prefer speed over correctness *and* don’t already compile with `-ffast-math` or equivalent? – Daniel H Jul 21 '17 at 15:52
  • @Daniel, I disagree. The C99 guys made up the semantics for complex infinities, which means, amongst other things that inf+nan*i is consider infinite, for some sense of consistency that _no one_ would miss. If we're making up semantics, we might as well make up fast ones. – gct Jul 21 '17 at 16:17
  • @SeanMcAllister Even if you are correct that nobody would miss that, there are other edge cases the builtin types should handle (see [this question](https://stackoverflow.com/q/23519366/27302), for example, where Clang doesn’t handle this correctly). If you can at all get away with providing a Makefile or something to specify `-fcx-fortran-rules`, that is probably the way to go. – Daniel H Jul 21 '17 at 19:39

2 Answers

1

So I don't believe this, but if I specify an explicit copy constructor, the problem resolves itself:

struct cfloat {
    cfloat(float re, float im) : re(re),   im(im)   {}
    cfloat(const cfloat& o)    : re(o.re), im(o.im) {}

    float re,im;
};

Now generates the same assembly:

Dump of assembler code for function benchmark(cfloat*, cfloat*, cfloat*, cfloat*, cfloat, cfloat, unsigned long):
   0x0000000000400600 <+0>: mov    0x8(%rsp),%r10
   0x0000000000400605 <+5>: test   %r10,%r10
   0x0000000000400608 <+8>: je     0x4006aa <benchmark(cfloat*, cfloat*, cfloat*, cfloat*, cfloat, cfloat, unsigned long)+170>
   0x000000000040060e <+14>:    xor    %eax,%eax
   0x0000000000400610 <+16>:    movss  (%r9),%xmm8
   0x0000000000400615 <+21>:    movss  0x4(%r9),%xmm9
   0x000000000040061b <+27>:    movaps %xmm8,%xmm0
   0x000000000040061f <+31>:    movaps %xmm9,%xmm3
   0x0000000000400623 <+35>:    movss  (%rsi,%rax,8),%xmm1
   0x0000000000400628 <+40>:    movss  0x4(%rsi,%rax,8),%xmm7
   0x000000000040062e <+46>:    mulss  %xmm1,%xmm0
   0x0000000000400632 <+50>:    mulss  %xmm7,%xmm3
   0x0000000000400636 <+54>:    movss  (%r8),%xmm5
   0x000000000040063b <+59>:    movss  0x4(%r8),%xmm6
   0x0000000000400641 <+65>:    mulss  %xmm9,%xmm1
   0x0000000000400646 <+70>:    movaps %xmm6,%xmm10
   0x000000000040064a <+74>:    mulss  %xmm8,%xmm7
   0x000000000040064f <+79>:    movss  (%rdi,%rax,8),%xmm2
   0x0000000000400654 <+84>:    subss  %xmm3,%xmm0
   0x0000000000400658 <+88>:    movaps %xmm5,%xmm3
   0x000000000040065b <+91>:    movss  0x4(%rdi,%rax,8),%xmm4
   0x0000000000400661 <+97>:    mulss  %xmm2,%xmm3
   0x0000000000400665 <+101>:   addss  %xmm7,%xmm1
   0x0000000000400669 <+105>:   mulss  %xmm4,%xmm10
   0x000000000040066e <+110>:   mulss  %xmm6,%xmm2
   0x0000000000400672 <+114>:   mulss  %xmm5,%xmm4
   0x0000000000400676 <+118>:   subss  %xmm10,%xmm3
   0x000000000040067b <+123>:   addss  %xmm4,%xmm2
   0x000000000040067f <+127>:   addss  %xmm3,%xmm0
   0x0000000000400683 <+131>:   addss  %xmm2,%xmm1
   0x0000000000400687 <+135>:   addss  (%rdx,%rax,8),%xmm0
   0x000000000040068c <+140>:   addss  0x4(%rdx,%rax,8),%xmm1
   0x0000000000400692 <+146>:   movss  %xmm0,(%rcx,%rax,8)
   0x0000000000400697 <+151>:   movss  %xmm1,0x4(%rcx,%rax,8)
   0x000000000040069d <+157>:   add    $0x1,%rax
   0x00000000004006a1 <+161>:   cmp    %rax,%r10
   0x00000000004006a4 <+164>:   ja     0x400610 <benchmark(cfloat*, cfloat*, cfloat*, cfloat*, cfloat, cfloat, unsigned long)+16>
   0x00000000004006aa <+170>:   repz retq 
End of assembler dump.

Find me that in the spec.

gct
  • 14,100
  • 15
  • 68
  • 107
  • 1
    I guess that makes sense in some perverse way. The problem with the other code is that GCC tries *way* too hard to handle the whole struct in a 64bit GPR, with the explicit copy ctor it gets confused enough that it thinks it can't do it anymore and that fixes the codegen more or less by accident. – harold Jul 21 '17 at 15:29
  • Adding the copy constructor probably changes the applicable ABI layout rules (it's no longer a [POD type for the purpose of layout](https://itanium-cxx-abi.github.io/cxx-abi/abi.html#POD)). – Florian Weimer Jul 21 '17 at 15:36
  • Yep. For example, look at `operator+`. In the first case, xmmX used for input/output, while in the "fixed" case, cfloat passed by value on stack, and output written with RVO. – geza Jul 21 '17 at 15:39
0

You mentioned that you target Red Hat Enterprise Linux, and (in your deleted post) that newer compiler versions generate better code. You could use Developer Toolset to get a newer compiler, creating applications which are compatible with the rest of the operating system:

Florian Weimer
  • 32,022
  • 3
  • 48
  • 92
  • Unfortunately upgrading isn't an option for me, I deliver source and don't control the machines. – gct Jul 21 '17 at 15:12
  • Can you influence the build process? Using DTS does not introduce any additional run-time dependencies, and you don't have to change how the application is deployed. – Florian Weimer Jul 21 '17 at 15:14
  • No, unfortunately I deliver source code and don't generally control the compiler used to build it (ANSI C++ only for us) – gct Jul 21 '17 at 15:18
  • Besides, the dev toolset code isn’t ABI compatible with other C++ libraries targeting RHEL unless you add a special flag, because of changes in the layout of `std::string` and `std::list` in C++11. – Daniel H Jul 21 '17 at 15:55
  • 2
    @DanielH, that's not correct. GCC is configured with `--with-default-libstdcxx-abi=gcc4-compatible`, so the ABI is backwards-compatible by default. – Florian Weimer Jul 21 '17 at 16:06
  • @FlorianWeimer I see [you’re right](https://git.centos.org/blob/rpms!devtoolset-6-gcc.git/2b6d80377d16c4fa66084fab923c95760e8e10a0/SPECS!gcc.spec#L766). That’s odd; during my tests that didn’t seem to be the case. It isn’t relevant to this question, and it isn’t relevant to me any more because there’s no devtoolset with GCC 7, but I’m not sure what went wrong with my earlier tests now. Either way, ABI compatibility isn’t guaranteed by Red Hat, but with that in place it should exist. – Daniel H Jul 21 '17 at 16:23