I wrote a small test and compiled it with gcc at the -O4
optimization level.
Here are the add_index_mod
and add_index_if
implementations from this test:
/* Advance the counter behind p by one and store the result reduced modulo 10. */
void add_index_mod(int *p) {
    int next = *p + 1;

    *p = next % 10;
}
/* Advance the counter behind p by one; a value of exactly 9 resets to 0. */
void add_index_if(int *p) {
    *p = (*p == 9) ? 0 : *p + 1;
}
And this is what I got for add_index_mod:
; Compiled body of add_index_mod: computes (*p + 1) % 10 without a div
; instruction, using a multiply by a fixed-point reciprocal of 10.
; eax = *p
mov eax, dword [rdi]
; edx = 0x66666667, i.e. ceil(2^34 / 10)
mov edx, 0x66666667
; ecx = *p + 1 (the value to be reduced)
lea ecx, dword [rax + 1]
mov eax, ecx
; edx:eax = signed 64-bit product ecx * 0x66666667
imul edx
mov eax, ecx
; eax = sign of ecx: 0 if non-negative, -1 if negative
sar eax, 0x1f
; edx = high half >> 2, i.e. product >> 34
sar edx, 2
; edx = quotient; subtracting the sign rounds toward zero for negative inputs
sub edx, eax
; eax = quotient * 5
lea eax, dword [rdx + rdx*4]
; eax = quotient * 10
add eax, eax
; ecx = (*p + 1) - quotient * 10 = remainder
sub ecx, eax
; *p = remainder
mov dword [rdi], ecx
ret
Here we can see that the compiler replaced the div with a sequence of mul, shifts and subs. This trick is well described here.
And this is what I got for add_index_if:
; Compiled body of add_index_if: branch-free select between *p + 1 and 0.
; edx = *p
mov edx, dword [rdi]
; eax = *p + 1 (the result when *p is not 9)
lea eax, dword [rdx + 1]
; set flags from comparing the old value with 9
cmp edx, 9
; edx = 0; mov leaves the flags from cmp intact
mov edx, 0
; if the old value was 9, replace the result with 0
cmove eax, edx
; *p = result
mov dword [rdi], eax
ret
Nothing special here, just a cmp and a conditional mov.
So now you can try to estimate the efficiency of the assembly code of both of these
functions using this table. But this is not the best way to go, because of out-of-order execution, branch prediction, etc.
So as I mentioned above I just wrote a little test:
#include <stdio.h>
#include <stdint.h>
#define REPEATS (1 << 30)
/*
 * Read the processor's time-stamp counter with the x86 RDTSC instruction.
 * The instruction delivers the low 32 bits in EAX and the high 32 bits in
 * EDX; the two halves are combined into one 64-bit value.
 */
static inline uint64_t rdtsc() {
    unsigned int low_half, high_half;

    __asm__ volatile("rdtsc" : "=a" (low_half), "=d" (high_half));

    uint64_t counter = high_half;
    counter <<= 32;
    counter |= low_half;
    return counter;
}
/* Advance the counter behind p by one and store the result reduced modulo 10. */
void add_index_mod(int *p) {
    int next = *p + 1;

    *p = next % 10;
}
/* Advance the counter behind p by one; a value of exactly 9 resets to 0. */
void add_index_if(int *p) {
    *p = (*p == 9) ? 0 : *p + 1;
}
/*
 * Benchmark driver: runs REPEATS calls of add_index_mod and then REPEATS
 * calls of add_index_if on the same counter, bracketing each loop with
 * rdtsc() readings, and prints the average counter ticks per call.
 */
int main() {
    int p = 0;

    // mod ================================
    uint64_t mod_begin = rdtsc();
    for (uint32_t n = 0; n < REPEATS; ++n) {
        add_index_mod(&p);
    }
    uint64_t mod_end = rdtsc();

    // Print the counter so that gcc -O4 cannot drop the loop above
    // as having no observable result.
    printf("%d\n", p);
    double mod_ticks = (double)(mod_end - mod_begin) / REPEATS;
    printf("add_index_mod: %f\n", mod_ticks);

    // if ================================
    uint64_t if_begin = rdtsc();
    for (uint32_t n = 0; n < REPEATS; ++n) {
        add_index_if(&p);
    }
    uint64_t if_end = rdtsc();

    // Same reason as above: keep the loop's result observable.
    printf("%d\n", p);
    double if_ticks = (double)(if_end - if_begin) / REPEATS;
    printf("add_index_if: %f\n", if_ticks);

    return 0;
}
And here is its output on my Intel Core i5-6500:
add_index_mod: 9.643092
add_index_if: 2.063125
So for a huge number of calls, add_index_if
is roughly 5 times faster than add_index_mod
on my CPU.