I ran some experiments with the following simple code to find out what the compiler does when it optimizes an if-else
structure. The code I'm using is:
#include <stdio.h>
/*
 * Test program for inspecting how the compiler translates an if / else-if
 * chain. It walks the first five elements of a small constant array and
 * prints a label for each one. Always returns 0.
 */
int main() {
int arr[] = {1,2,3,4,5,6,7};
int i;
/* Only the first 5 of the 7 elements are visited. */
for(i = 0; i < 5; i++) {
if(arr[i] == 1)
printf("one\n");
else if (arr[i] == 2)
printf("two\n");
/*
 * NOTE(review): `=` below is an assignment, not a comparison. It stores 3
 * into arr[i] and the condition is always true, so "three" is printed for
 * every element that is not 1 or 2 and the final else is never reached.
 * The disassembly listings were generated from this exact source and show
 * the store of 0x3 with no cmp; `==` was presumably intended.
 */
else if (arr[i] = 3)
printf("three\n");
else printf("blah\n");
}
return 0;
}
Admittedly it's not a very good example, as there is nothing dynamic here to distinguish the likely branch from the unlikely one.
But to my surprise, the code generated with and without optimization is vastly different.
First, without any optimization, I get:
0x0000000000400506 <+66>: mov eax,DWORD PTR [rbp-0x4]
0x0000000000400509 <+69>: cdqe
0x000000000040050b <+71>: mov eax,DWORD PTR [rbp+rax*4-0x20]
0x000000000040050f <+75>: cmp eax,0x1
0x0000000000400512 <+78>: jne 0x400520 <main+92>
0x0000000000400514 <+80>: mov edi,0x400668
0x0000000000400519 <+85>: call 0x4003b8 <puts@plt>
0x000000000040051e <+90>: jmp 0x400551 <main+141>
0x0000000000400520 <+92>: mov eax,DWORD PTR [rbp-0x4]
0x0000000000400523 <+95>: cdqe
0x0000000000400525 <+97>: mov eax,DWORD PTR [rbp+rax*4-0x20]
0x0000000000400529 <+101>: cmp eax,0x2
0x000000000040052c <+104>: jne 0x40053a <main+118>
0x000000000040052e <+106>: mov edi,0x40066c
0x0000000000400533 <+111>: call 0x4003b8 <puts@plt>
0x0000000000400538 <+116>: jmp 0x400551 <main+141>
0x000000000040053a <+118>: mov eax,DWORD PTR [rbp-0x4]
0x000000000040053d <+121>: cdqe
0x000000000040053f <+123>: mov DWORD PTR [rbp+rax*4-0x20],0x3
0x0000000000400547 <+131>: mov edi,0x400670
0x000000000040054c <+136>: call 0x4003b8 <puts@plt>
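The code is pretty straightforward: sequential cmp and jne instructions are the heart of the if-else structure, as expected.
(The third test, arr[i] = 3, is an assignment rather than a comparison, which is why it compiles to a plain store of 0x3 with no cmp/jne, and why the "blah" branch does not appear.)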
But the fun begins with -O3:
0x0000000000400510 <+64>: call 0x4003b8 <puts@plt>
0x0000000000400515 <+69>: mov eax,DWORD PTR [rsp+0x4]
0x0000000000400519 <+73>: cmp eax,0x1
0x000000000040051c <+76>: je 0x400640 <main+368>
0x0000000000400522 <+82>: cmp eax,0x2
0x0000000000400525 <+85>: je 0x4005a0 <main+208>
0x0000000000400527 <+87>: mov edi,0x40074c
0x000000000040052c <+92>: mov DWORD PTR [rsp+0x4],0x3
0x0000000000400534 <+100>: call 0x4003b8 <puts@plt>
0x0000000000400539 <+105>: mov eax,DWORD PTR [rsp+0x8]
0x000000000040053d <+109>: cmp eax,0x1
0x0000000000400540 <+112>: je 0x4005b3 <main+227>
0x0000000000400542 <+114>: cmp eax,0x2
0x0000000000400545 <+117>: je 0x400630 <main+352>
0x000000000040054b <+123>: mov edi,0x40074c
0x0000000000400550 <+128>: mov DWORD PTR [rsp+0x8],0x3
0x0000000000400558 <+136>: call 0x4003b8 <puts@plt>
0x000000000040055d <+141>: mov eax,DWORD PTR [rsp+0xc]
0x0000000000400561 <+145>: cmp eax,0x1
0x0000000000400564 <+148>: je 0x4005d0 <main+256>
0x0000000000400566 <+150>: cmp eax,0x2
0x0000000000400569 <+153>: je 0x400618 <main+328>
0x000000000040056f <+159>: mov edi,0x40074c
0x0000000000400574 <+164>: mov DWORD PTR [rsp+0xc],0x3
0x000000000040057c <+172>: call 0x4003b8 <puts@plt>
0x0000000000400581 <+177>: mov eax,DWORD PTR [rsp+0x10]
0x0000000000400585 <+181>: cmp eax,0x1
0x0000000000400588 <+184>: je 0x4005e8 <main+280>
0x000000000040058a <+186>: cmp eax,0x2
0x000000000040058d <+189>: je 0x400600 <main+304>
0x000000000040058f <+191>: mov edi,0x40074c
0x0000000000400594 <+196>: call 0x4003b8 <puts@plt>
0x0000000000400599 <+201>: xor eax,eax
0x000000000040059b <+203>: add rsp,0x28
0x000000000040059f <+207>: ret
0x00000000004005a0 <+208>: mov edi,0x400752
0x00000000004005a5 <+213>: call 0x4003b8 <puts@plt>
0x00000000004005aa <+218>: mov eax,DWORD PTR [rsp+0x8]
0x00000000004005ae <+222>: cmp eax,0x1
0x00000000004005b1 <+225>: jne 0x400542 <main+114>
0x00000000004005b3 <+227>: mov edi,0x400748
0x00000000004005b8 <+232>: call 0x4003b8 <puts@plt>
0x00000000004005bd <+237>: mov eax,DWORD PTR [rsp+0xc]
0x00000000004005c1 <+241>: cmp eax,0x1
0x00000000004005c4 <+244>: jne 0x400566 <main+150>
0x00000000004005c6 <+246>: nop WORD PTR cs:[rax+rax*1+0x0]
0x00000000004005d0 <+256>: mov edi,0x400748
0x00000000004005d5 <+261>: call 0x4003b8 <puts@plt>
0x00000000004005da <+266>: mov eax,DWORD PTR [rsp+0x10]
0x00000000004005de <+270>: cmp eax,0x1
0x00000000004005e1 <+273>: jne 0x40058a <main+186>
0x00000000004005e3 <+275>: nop DWORD PTR [rax+rax*1+0x0]
0x00000000004005e8 <+280>: mov edi,0x400748
0x00000000004005ed <+285>: call 0x4003b8 <puts@plt>
0x00000000004005f2 <+290>: xor eax,eax
0x00000000004005f4 <+292>: add rsp,0x28
0x00000000004005f8 <+296>: ret
0x00000000004005f9 <+297>: nop DWORD PTR [rax+0x0]
0x0000000000400600 <+304>: mov edi,0x400752
0x0000000000400605 <+309>: call 0x4003b8 <puts@plt>
0x000000000040060a <+314>: xor eax,eax
0x000000000040060c <+316>: add rsp,0x28
0x0000000000400610 <+320>: ret
0x0000000000400611 <+321>: nop DWORD PTR [rax+0x0]
0x0000000000400618 <+328>: mov edi,0x400752
0x000000000040061d <+333>: call 0x4003b8 <puts@plt>
0x0000000000400622 <+338>: jmp 0x400581 <main+177>
0x0000000000400627 <+343>: nop WORD PTR [rax+rax*1+0x0]
0x0000000000400630 <+352>: mov edi,0x400752
0x0000000000400635 <+357>: call 0x4003b8 <puts@plt>
0x000000000040063a <+362>: jmp 0x40055d <main+141>
0x000000000040063f <+367>: nop
0x0000000000400640 <+368>: mov edi,0x400748
0x0000000000400645 <+373>: call 0x4003b8 <puts@plt>
0x000000000040064a <+378>: jmp 0x400539 <main+105>
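Important things to note here:
- There are several unconditional jumps to move around the code.
- It uses je instead of jne.
- There are a lot of duplicated code regions; the comparison with 1 is done multiple times. This is consistent with the loop having been fully unrolled: each iteration reads its own fixed stack slot ([rsp+0x4], [rsp+0x8], [rsp+0xc], [rsp+0x10]) instead of indexing with a loop counter.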
I will dig into the optimized assembly further and keep this post updated with any interesting findings. This is not so much an answer as an investigation, and also an invitation for others to do similar investigations to find out important optimization practices.
EDIT:
Compiler info :
[root@s1 ~]# gcc --version
gcc (GCC) 4.4.6 20110731 (Red Hat 4.4.6-3)
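Optimization info (taken from the GCC manual; note that some of the flags listed, e.g. -fdevirtualize and -fhoist-adjacent-loads, appear to come from a newer release than the 4.4.6 used above):
-O2 turns on the following optimization flags: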
-fthread-jumps
-falign-functions -falign-jumps
-falign-loops -falign-labels
-fcaller-saves
-fcrossjumping
-fcse-follow-jumps -fcse-skip-blocks
-fdelete-null-pointer-checks
-fdevirtualize
-fexpensive-optimizations
-fgcse -fgcse-lm
-fhoist-adjacent-loads
-finline-small-functions
-findirect-inlining
-fipa-sra
-foptimize-sibling-calls
-fpartial-inlining
-fpeephole2
-fregmove
-freorder-blocks -freorder-functions
-frerun-cse-after-loop
-fsched-interblock -fsched-spec
-fschedule-insns -fschedule-insns2
-fstrict-aliasing -fstrict-overflow
-ftree-switch-conversion -ftree-tail-merge
-ftree-pre
-ftree-vrp
-O3 adds the following optimizations on top of -O2:
-finline-functions, -funswitch-loops, -fpredictive-commoning, -fgcse-after-reload, -ftree-vectorize, -fvect-cost-model, -ftree-partial-pre and -fipa-cp-clone
Optimizations related to if-else blocks:
-fcse-follow-jumps
In common subexpression elimination (CSE), scan through jump instructions when the target of the jump is not reached by any other
path. For example, when CSE encounters an if statement with an else
clause, CSE follows the jump when the condition tested is false.
-fcse-skip-blocks
This is similar to -fcse-follow-jumps, but causes CSE to follow jumps that conditionally skip over blocks. When CSE encounters a
simple if statement with no else clause, -fcse-skip-blocks causes CSE
to follow the jump around the body of the if.
-fhoist-adjacent-loads
Speculatively hoist loads from both branches of an if-then-else if the loads are from adjacent locations in the same structure and the
target architecture has a conditional move instruction. This flag is
enabled by default at -O2 and higher.