With f0
and f1
as below,
long long b;
void f0(int a) {
a %= 10;
if (a == 0) b += 11;
else if (a == 1) b += 13;
else if (a == 2) b += 17;
else if (a == 3) b += 19;
else if (a == 4) b += 23;
else if (a == 5) b += 29;
else if (a == 6) b += 31;
else if (a == 7) b += 37;
else if (a == 8) b += 41;
else if (a == 9) b += 43;
}
void f1(int a) {
a %= 10;
if (a == 0) b += 11;
else if (a == 1) b += 13;
else if (a == 2) b += 17;
else if (a == 3) b += 19;
else if (a == 4) b += 23;
else if (a == 5) b += 29;
else if (a == 6) b += 31;
else if (a == 7) b += 37;
else if (a == 8) b += 41;
else if (a == 9) b += 43;
else __builtin_unreachable();
}
assuming the argument a
is always positive in the program, the compiler should produce more optimized code for f1
because in f0
, a
can fall through the if-else block when it is negative, so the compiler should produce a default "do nothing and return" code. However in f1
, the possible range of a
is clearly stated with __builtin_unreachable
so that the compiler doesn't have to think when a
is out of range.
However, f1
actually runs slower, so I had a look at the disassembly. This is the control flow part of f0
.
jne .L2
addq $11, b(%rip)
ret
.p2align 4,,10
.p2align 3
.L2:
cmpl $9, %eax
ja .L1
movl %eax, %eax
jmp *.L5(,%rax,8)
.section .rodata
.align 8
.align 4
.L5:
.quad .L1
.quad .L13
.quad .L12
.quad .L11
.quad .L10
.quad .L9
.quad .L8
.quad .L7
.quad .L6
.quad .L4
.text
.p2align 4,,10
.p2align 3
.L4:
addq $43, b(%rip)
.L1:
ret
.p2align 4,,10
.p2align 3
.L6:
addq $41, b(%rip)
ret
.p2align 4,,10
.p2align 3
...
gcc smartly turns the if-else block into a jump table and places the default case L1
inside L4
to save space.
Now have a look at the whole control flow of f1
disassembled.
jne .L42
movq b(%rip), %rax
addq $11, %rax
.L43:
movq %rax, b(%rip)
ret
.p2align 4,,10
.p2align 3
.L42:
movl %eax, %eax
jmp *.L46(,%rax,8)
.section .rodata
.align 8
.align 4
.L46:
.quad .L45
.quad .L54
.quad .L53
.quad .L52
.quad .L51
.quad .L50
.quad .L49
.quad .L48
.quad .L47
.quad .L45
.text
.p2align 4,,10
.p2align 3
.L47:
movq b(%rip), %rax
addq $41, %rax
jmp .L43
.p2align 4,,10
.p2align 3
.L48:
movq b(%rip), %rax
addq $37, %rax
jmp .L43
.p2align 4,,10
.p2align 3
.L49:
movq b(%rip), %rax
addq $31, %rax
jmp .L43
.p2align 4,,10
.p2align 3
.L50:
movq b(%rip), %rax
addq $29, %rax
jmp .L43
.p2align 4,,10
.p2align 3
.L51:
movq b(%rip), %rax
addq $23, %rax
jmp .L43
.p2align 4,,10
.p2align 3
.L52:
movq b(%rip), %rax
addq $19, %rax
jmp .L43
.p2align 4,,10
.p2align 3
.L53:
movq b(%rip), %rax
addq $17, %rax
jmp .L43
.p2align 4,,10
.p2align 3
.L54:
movq b(%rip), %rax
addq $13, %rax
jmp .L43
.p2align 4,,10
.p2align 3
.L45:
movq b(%rip), %rax
addq $43, %rax
jmp .L43
Yes gcc did catch __builtin_unreachable
, but for some reason, there is an unnecessary jump before each return, and the jump table has a duplicate entry of L45
. Also instead of simply addq $N, b(%rip)
, it keeps writing movq b(%rip), %rax
, addq $N, %rax
, then movq %rax, b(%rip)
before return.
What has made gcc produce apparently dumb code?
The binary was compiled with -O3
under Fedora Linux, and the gcc version I'm using is 11.2.1 20211203