
I compiled the C implementations of two matrix multiplication programs, one for float and one for int. When I compile them with -O2, almost everything is the same, but when I use the -O3 flag to enable auto-vectorization, the two yield very different speedups. I looked at the assembly output and found the differences, but I don't know why GCC compiled them like this. What is the reason, and what is the difference between the float type and the int type here?

Before the multiplication I transpose the second matrix (for reasons not relevant here). The matrices are 128x128. For the int implementation, the speedup of the -O3 version over the -O2 scalar version is 5.4; for the float implementation, the speedup is actually slightly below 1, about 0.94.
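For reference, here is a minimal sketch of the kind of code under discussion. It is a hypothetical reconstruction, not the actual source: the symbol names a, c_tra and c_result come from the assembly below, the *_i names for the int version are invented, and the int accumulator in the float version is inferred from the vcvtsi2ss/vcvttss2si pair in its inner loop.

    #define N 128

    /* Hypothetical reconstruction -- the real source is not shown.
       a, c_tra and c_result are the symbols visible in the assembly;
       the *_i names for the int version are invented here. */
    int   a_i[N][N], c_tra_i[N][N], c_result_i[N][N];
    float a[N][N],   c_tra[N][N],   c_result[N][N];

    void matmul_int(void)
    {
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                int sum = 0;
                /* c_tra is the transposed second matrix, so both
                   operands are walked row-wise */
                for (int k = 0; k < N; k++)
                    sum += a_i[i][k] * c_tra_i[j][k];
                c_result_i[i][j] = sum;
            }
    }

    void matmul_float(void)
    {
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                /* an int accumulator would explain the cvtsi2ss /
                   cvttss2si round trip seen in the float inner loop */
                int sum = 0;
                for (int k = 0; k < N; k++)
                    sum = (int)(sum + a[i][k] * c_tra[j][k]);
                c_result[i][j] = (float)sum;
            }
    }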
Int assembly output:

.L2:
    vmovdqa 448(%rdi), %ymm0
    movl    $c_tra, %eax
    movq    %r8, %rdx
    vmovdqa (%rdi), %ymm15
    vmovdqa %ymm0, -48(%rsp)
    vmovdqa 480(%rdi), %ymm0
    vmovdqa 32(%rdi), %ymm14
    vmovdqa 64(%rdi), %ymm13
    vmovdqa 96(%rdi), %ymm12
    vmovdqa 128(%rdi), %ymm11
    vmovdqa 160(%rdi), %ymm10
    vmovdqa 192(%rdi), %ymm9
    vmovdqa 224(%rdi), %ymm8
    vmovdqa 256(%rdi), %ymm7
    vmovdqa 288(%rdi), %ymm6
    vmovdqa 320(%rdi), %ymm5
    vmovdqa 352(%rdi), %ymm4
    vmovdqa 384(%rdi), %ymm3
    vmovdqa 416(%rdi), %ymm2
    vmovdqa %ymm0, -80(%rsp)
    .p2align 4,,10
    .p2align 3
.L5:
    vpmulld 32(%rax), %ymm14, %ymm0
    vpmulld (%rax), %ymm15, %ymm1
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 64(%rax), %ymm13, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 96(%rax), %ymm12, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 128(%rax), %ymm11, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 160(%rax), %ymm10, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 192(%rax), %ymm9, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 224(%rax), %ymm8, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 256(%rax), %ymm7, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 288(%rax), %ymm6, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 320(%rax), %ymm5, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 352(%rax), %ymm4, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 384(%rax), %ymm3, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vpmulld 416(%rax), %ymm2, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm1
    vmovdqa -48(%rsp), %ymm0
    addq    $512, %rax
    addq    $4, %rdx
    vpmulld -64(%rax), %ymm0, %ymm0
    vpaddd  %ymm0, %ymm1, %ymm0
    vmovdqa -80(%rsp), %ymm1
    vpmulld -32(%rax), %ymm1, %ymm1
    vpaddd  %ymm0, %ymm1, %ymm1
    vmovdqa %xmm1, %xmm0
    vextracti128    $0x1, %ymm1, %xmm1
    vpextrd $1, %xmm0, %esi
    vpextrd $0, %xmm0, %ecx
    addl    %esi, %ecx
    vpextrd $2, %xmm0, %esi
    addl    %esi, %ecx
    vpextrd $3, %xmm0, %esi
    addl    %esi, %ecx
    vpextrd $0, %xmm1, %esi
    addl    %esi, %ecx
    vpextrd $1, %xmm1, %esi
    addl    %esi, %ecx
    vpextrd $2, %xmm1, %esi
    addl    %esi, %ecx
    vpextrd $3, %xmm1, %esi
    addl    %esi, %ecx
    movl    %ecx, -4(%rdx)
    cmpq    $c_tra+65536, %rax
    jne .L5
    addq    $512, %r8
    addq    $512, %rdi
    cmpq    $c_result+65536, %r8
    jne .L2

Float assembly output:

.L2:
    xorl    %esi, %esi
    .p2align 4,,10
    .p2align 3
.L7:
    movq    %rdi, %rsi
    xorl    %eax, %eax
    xorl    %edx, %edx
    salq    $5, %rsi
    .p2align 4,,10
    .p2align 3
.L5:
    vcvtsi2ss   %edx, %xmm0, %xmm0
    vmovss  a(%rcx,%rax), %xmm2
    vfmadd231ss c_tra(%rsi,%rax), %xmm2, %xmm0
    addq    $4, %rax
    vcvttss2si  %xmm0, %edx
    cmpq    $128, %rax
    jne .L5
    vcvtsi2ss   %edx, %xmm0, %xmm0
    vmovss  %xmm0, c_result(%rcx,%rdi)
    addq    $4, %rdi
    cmpq    $128, %rdi
    jne .L7
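Note the vcvtsi2ss / vcvttss2si pair executed on every inner iteration: the accumulator is converted int-to-float and back float-to-int each time, which keeps this loop scalar. As the comments below point out, removing that conversion lets the compiler vectorize. A hedged sketch of such a variant, reusing the declarations from the sketch above:

    /* Variant with a float accumulator: no conversion in the inner
       loop, so with -Ofast (or -O3 -ffast-math, which permits
       reordering the reduction) GCC can auto-vectorize it. */
    void matmul_float_fixed(void)
    {
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                float sum = 0.0f;
                for (int k = 0; k < N; k++)
                    sum += a[i][k] * c_tra[j][k];
                c_result[i][j] = sum;
            }
    }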
  • What seems strange is that the int version is using ymm registers (AVX 256 bits) but the float version is only using xmm (SSE 128 bits). Check your compilation flags to make sure you have enabled AVX on both versions. – GdR May 04 '16 at 09:34
  • I use `-march=native`, and I think this enables `AVX` and `AVX2`, because I can compile my explicit vectorization implementation using `AVX` and `AVX2`; it even enables `FMA` too. – ADMS May 04 '16 at 10:47
  • Why did you answer the question? You can make a comment. – ADMS May 04 '16 at 10:48
  • It didn't even use vectorization at all for the floats. – harold May 04 '16 at 11:55
  • My guess is that you are doing a reduction, and floating point math is not associative (but integer math is), so it cannot vectorize the reduction. Try `-Ofast`, which allows associative floating point math. – Z boson May 04 '16 at 12:02
  • It's not strange at all. The float version didn't auto-vectorize, so it's using scalar instructions, which of course only use `%xmm` regs. (The `v` VEX-encoded versions of scalar instructions still zero the upper lanes of the destination register, like any other VEX-coded instruction not using the full-width vector.) – Peter Cordes May 04 '16 at 18:41
  • Well, but what is the reason that the float version didn't auto-vectorize? – ADMS May 05 '16 at 21:34
  • @ADMS floating point maths is not associative – Iskar Jarak May 06 '16 at 08:47
  • It can be seen that vcvttss2si and vcvtsi2ss restrict the auto-vectorization. I changed some variables to prevent this conversion, and with the -Ofast flag the program was auto-vectorized. So the answer is that auto-vectorization has a problem with the conversions. – ADMS Dec 09 '16 at 14:00
  • No, packed conversion from int to float is not a problem for auto-vectorization. The problem is that you need `-ffast-math` (enabled by `-Ofast`) to allow auto-vectorization of FP code when that changes the order of operations, **because FP math is not associative**. Adding up an array in-order is different from adding every 4th or 8th element into that element of a vector accumulator (a short demonstration follows these comments). – Peter Cordes Jul 09 '17 at 12:46

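The associativity point raised in the comments is easy to demonstrate: reordering a float sum can change its value, so GCC will not turn the in-order scalar reduction into a vector reduction unless -ffast-math (implied by -Ofast) says that is acceptable. A minimal example:

    #include <stdio.h>

    /* Float addition is not associative: the two groupings below
       combine the same three values but print different results. */
    int main(void)
    {
        float big = 1.0e8f, small = 1.0f;
        printf("%.1f\n", (big + small) - big);  /* 0.0: small is absorbed */
        printf("%.1f\n", (big - big) + small);  /* 1.0 */
        return 0;
    }

Integer addition, by contrast, stays associative even when it wraps, which is why the int loop vectorizes at plain -O3.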
0 Answers