I like chux's approach of using a lookup table. In the source below, this is `swchux`.
It generates:
movzbl %dil,%edi
movzbl 0(%rdi),%eax
And, then 4-5 of:
cmp $x,%al
jxx ...
This is really fast with a limited number of `case` statements.
But with a larger number of `case` statements, the `cmp`/`jxx` entries take up a significant amount of time.
I had a situation where there was a `switch`/`case` block with a hundred or so entries, so this didn't scale.
By using a computed goto (using `&&label`), we can reduce this to (in `swfix1`):
movzbl %dil,%edi
movzbl 0(%rdi),%eax
jmp *tbl(,%rax,8)
For the use case I had, using the computed goto instead of the `switch` improved overall performance by 30%.
With some `cpp` macros, we can make the syntax similar to a `switch`/`case` block.
In the above examples, we're using an `unsigned char` lookup table. If we use a direct label table, we can reduce this by one instruction (in `swfix2`):
movzbl %dil,%edi
jmpq *0x0(,%rdi,8)
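This eliminates one asm instruction at the expense of the lookup table using 8 bytes per entry (vs. 1 byte per entry for the table above). Note that with a direct label table, every one of the 256 slots must hold a valid label address: a slot that is left uninitialized is a null pointer, and jumping through it is undefined behavior.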
Here is the `.c` source code for the above examples.
Note that here I just used the `DOIT` macro as a placeholder for the actual code in each `case`. In real code, each `case` would have its own, different code.
#include <limits.h>
/* Global written by every case body so that each branch has an observable effect. */
int state;
/*
 * DOIT(val_) -- placeholder for the real work of a case: stores 256 + val_
 * into `state`.
 *
 * Both the argument and the whole expansion are parenthesized so that an
 * argument containing a low-precedence operator (e.g. DOIT(a & b)) is added
 * as a unit, and so the macro can be used anywhere an expression is allowed.
 */
#define DOIT(val_) \
(state = 256 + (val_))
/*
 * type[] -- maps every possible unsigned char token to a small class number:
 *   1 = letter (A-Z, a-z), 2 = newline, 3 = '^', 4 = '&', 0 = anything else.
 *
 * Written with GNU range designators; this relies on the letters having
 * contiguous character codes (true for ASCII-based execution character sets).
 * Slots that are not named below are implicitly zero-initialized.
 */
static const unsigned char type[UCHAR_MAX + 1u] = {
    ['A' ... 'Z'] = 1,
    ['a' ... 'z'] = 1,
    ['\n']        = 2,
    ['^']         = 3,
    ['&']         = 4,
};
/*
 * swchux -- baseline dispatcher: classify `token` through the type[] table
 * and handle the resulting class with an ordinary switch statement.
 *
 * token: the character to dispatch on.
 * Effect: runs DOIT(class), where class is type[token]; any class other
 * than 1..4 is handled as 0.
 */
void swchux(unsigned char token)
{
    const unsigned char kind = type[token];

    switch (kind) {
    case 1:                 /* letters */
        DOIT(1);
        break;

    case 2:                 /* \n */
        DOIT(2);
        break;

    case 3:                 /* ^ */
        DOIT(3);
        break;

    case 4:                 /* & */
        DOIT(4);
        break;

    default:                /* none of the above */
        DOIT(0);
        break;
    }
}
/*
 * Helper macros that give a computed goto the look of a switch/case block.
 * They rely on the GNU C "labels as values" extension (&&label, goto *ptr).
 */

/* CASE(n) expands to the label name CASE_n; write `CASE(n):` to define it. */
#define CASE(idx_) CASE_##idx_

/* V(n) expands to the address of label CASE_n, for use in a jump table. */
#define V(case_) &&CASE(case_)

/*
 * SWITCH(i) jumps to the label address stored in slot i of an array named
 * `swvec`, which must be in scope at the point of use.
 */
#undef SWITCH
#define SWITCH(idx_) goto *swvec[idx_]
/*
 * swfix1 -- computed-goto dispatcher with a two-level lookup: type[token]
 * yields a class number, and that class number indexes a small table of
 * label addresses.
 *
 * token: the character to dispatch on.
 * Effect: runs DOIT(class), where class is type[token].
 */
void swfix1(unsigned char token)
{
    /* One label address per class number stored in type[]. */
    static void *swvec[5] = {
        [0] = V(0),
        [1] = V(1),
        [2] = V(2),
        [3] = V(3),
        [4] = V(4),
    };
    const unsigned char kind = type[token];

    /* The do/while(0) wrapper only exists so that `break` leaves the block. */
    do {
        SWITCH(kind);

        CASE(1):            /* letters */
            DOIT(1);
            break;

        CASE(2):            /* \n */
            DOIT(2);
            break;

        CASE(3):            /* ^ */
            DOIT(3);
            break;

        CASE(4):            /* & */
            DOIT(4);
            break;

        CASE(0):            /* everything else */
            DOIT(0);
            break;
    } while (0);
}
#undef SWITCH
#define SWITCH(idx_) \
goto *swvec[idx_]
/*
 * swfix2 -- computed-goto dispatcher with a single-level lookup: the token
 * itself indexes a 256-entry table of label addresses, so no intermediate
 * class lookup is needed.
 *
 * token: the character to dispatch on.
 * Effect: runs DOIT(1) for letters, DOIT(2) for '\n', DOIT(3) for '^',
 * DOIT(4) for '&', and DOIT(0) for every other token.
 */
void
swfix2(unsigned char token)
{
    /*
     * Every slot must hold a valid label address: a slot left out of the
     * initializer would be a null pointer, and `goto *` through it jumps to
     * address 0. So the whole table is first filled with the default label
     * (GNU range designator), and the specific tokens then override it.
     * The letter ranges assume contiguous character codes (ASCII).
     */
    static const void *swvec[UCHAR_MAX + 1u] = {
        [0 ... UCHAR_MAX] = V(0),   /* default: none of the below */
        ['A' ... 'Z']     = V(1),
        ['a' ... 'z']     = V(1),
        ['\n']            = V(2),
        ['^']             = V(3),
        ['&']             = V(4),
    };

    /* The do/while(0) wrapper only exists so that `break` leaves the block. */
    do {
        SWITCH(token);

        CASE(1):            /* letters */
            DOIT(1);
            break;

        CASE(2):            /* \n */
            DOIT(2);
            break;

        CASE(3):            /* ^ */
            DOIT(3);
            break;

        CASE(4):            /* & */
            DOIT(4);
            break;

        CASE(0):            /* everything else */
            DOIT(0);
            break;
    } while (0);
}
Here is the source built with `-S`:
.file "all.c"
.text
# swchux -- compiler output for the plain switch on type[token].
# The token is taken from %dil, classified through the `type` byte table, and
# the class is then matched with a chain of compare/branch pairs. Every path
# ends by storing 256 + class into `state` and returning.
.p2align 4,,15
.globl swchux
.type swchux, @function
swchux:
.LFB0:
.cfi_startproc
movzbl %dil, %edi               # zero-extend token so it can index the table
movzbl type(%rdi), %eax         # eax = type[token]
cmpb $2, %al
je .L2                          # class == 2
jbe .L10                        # class below 2 (unsigned): go test for 1
cmpb $3, %al
je .L6                          # class == 3
cmpb $4, %al
jne .L5                         # not 4 either: default path
movl $260, state(%rip)          # class == 4: state = 256 + 4
ret
.p2align 4,,10
.p2align 3
.L10:
cmpb $1, %al
jne .L5                         # class is not 1: default path
movl $257, state(%rip)          # class == 1: state = 256 + 1
ret
.p2align 4,,10
.p2align 3
.L2:
movl $258, state(%rip)          # class == 2: state = 256 + 2
ret
.p2align 4,,10
.p2align 3
.L5:
movl $256, state(%rip)          # default: state = 256 + 0
ret
.p2align 4,,10
.p2align 3
.L6:
movl $259, state(%rip)          # class == 3: state = 256 + 3
ret
.cfi_endproc
.LFE0:
.size swchux, .-swchux
# swfix1 -- compiler output for the computed goto through a class table.
# The token (in %dil) is classified through the `type` byte table, and the
# class then indexes swvec.1969, a table of 8-byte code addresses. There are
# no compare/branch pairs: dispatch is a single indirect jump.
.p2align 4,,15
.globl swfix1
.type swfix1, @function
swfix1:
.LFB1:
.cfi_startproc
movzbl %dil, %edi               # zero-extend token so it can index the table
movzbl type(%rdi), %eax         # eax = type[token]
jmp *swvec.1969(,%rax,8)        # jump to the address in swvec.1969[class]
.p2align 4,,10
.p2align 3
.L17:
movl $256, state(%rip)          # state = 256 + 0
ret
.p2align 4,,10
.p2align 3
.L16:
movl $260, state(%rip)          # state = 256 + 4
ret
.p2align 4,,10
.p2align 3
.L15:
movl $259, state(%rip)          # state = 256 + 3
ret
.p2align 4,,10
.p2align 3
.L14:
movl $258, state(%rip)          # state = 256 + 2
ret
.p2align 4,,10
.p2align 3
.L12:
movl $257, state(%rip)          # state = 256 + 1
ret
.cfi_endproc
.LFE1:
.size swfix1, .-swfix1
# swfix2 -- compiler output for the computed goto through a direct table.
# The token (in %dil) indexes swvec.1979 directly, so the `type` lookup is
# gone and dispatch is one zero-extend plus one indirect jump.
# NOTE(review): unlike swchux and swfix1, this function contains no path that
# stores 256 into `state`, and the slots of swvec.1979 for tokens without an
# explicit entry are zero-filled, so for those tokens the indirect jump
# below loads a target address of 0.
.p2align 4,,15
.globl swfix2
.type swfix2, @function
swfix2:
.LFB2:
.cfi_startproc
movzbl %dil, %edi               # zero-extend token so it can index the table
jmp *swvec.1979(,%rdi,8)        # jump to the address in swvec.1979[token]
.p2align 4,,10
.p2align 3
.L23:
movl $260, state(%rip)          # state = 256 + 4
ret
.p2align 4,,10
.p2align 3
.L22:
movl $259, state(%rip)          # state = 256 + 3
ret
.p2align 4,,10
.p2align 3
.L21:
movl $258, state(%rip)          # state = 256 + 2
ret
.p2align 4,,10
.p2align 3
.L19:
movl $257, state(%rip)          # state = 256 + 1
ret
.cfi_endproc
.LFE2:
.size swfix2, .-swfix2
.section .rodata
# swvec.1979 -- jump table used by swfix2: 256 slots of 8 bytes (2048 bytes),
# indexed directly by the token value. Slots emitted through .zero hold no
# label address (they are zero).
.align 32
.type swvec.1979, @object
.size swvec.1979, 2048
swvec.1979:
.zero 80                        # slots 0..9: zero
.quad .L21                      # slot 10 ('\n')
.zero 216                       # slots 11..37: zero
.quad .L23                      # slot 38 ('&')
.zero 208                       # slots 39..64: zero
.quad .L19                      # slots 65..90 ('A'..'Z') start here
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.zero 24                        # slots 91..93: zero
.quad .L22                      # slot 94 ('^')
.zero 16                        # slots 95..96: zero
.quad .L19                      # slots 97..122 ('a'..'z') start here
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.quad .L19
.zero 1064                      # slots 123..255: zero
# swvec.1969 -- jump table used by swfix1: five 8-byte code addresses
# (40 bytes), indexed by the class number read from `type`.
.align 32
.type swvec.1969, @object
.size swvec.1969, 40
swvec.1969:
.quad .L17                      # class 0 -> state = 256
.quad .L12                      # class 1 -> state = 257
.quad .L14                      # class 2 -> state = 258
.quad .L15                      # class 3 -> state = 259
.quad .L16                      # class 4 -> state = 260
# type -- 256-byte classification table indexed by token value; used by
# swchux and swfix1. Bytes emitted through .zero are class 0.
.align 32
.type type, @object
.size type, 256
type:
.zero 10                        # tokens 0..9: class 0
.byte 2                         # token 10 ('\n')
.zero 27                        # tokens 11..37: class 0
.byte 4                         # token 38 ('&')
.zero 26                        # tokens 39..64: class 0
.byte 1                         # tokens 65..90 ('A'..'Z') start here
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.zero 3                         # tokens 91..93: class 0
.byte 3                         # token 94 ('^')
.zero 2                         # tokens 95..96: class 0
.byte 1                         # tokens 97..122 ('a'..'z') start here
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.byte 1
.zero 133                       # tokens 123..255: class 0
# `state` is emitted as a common symbol: 4 bytes in size, 4-byte aligned.
.comm state,4,4
# Compiler identification string recorded by the compiler that produced this file.
.ident "GCC: (GNU) 8.3.1 20190223 (Red Hat 8.3.1-2)"
.section .note.GNU-stack,"",@progbits