Most compilers will convert your switch into a lookup table or jump table, roughly equivalent to the following code:
char lut_func(char c){
    static const char lut[256] = {
        ['0']=0x40, ['1']=0x41, ['2']=0x42, ['3']=0x43,
        ['4']=0x44, ['5']=0x45, ['6']=0x46, ['7']=0x47,
        ['8']=0x48, ['9']=0x49, ['a']=0x4a, ['b']=0x4b,
        ['c']=0x4c, ['d']=0x4d, ['e']=0x4e, ['f']=0x4f,
        /* everything else is set to 0 automatically */
    };
    return lut[(unsigned char)c];
}
The only problems with this approach:
- it cannot be vectorized
- the most frequently used entries ('0'-'9', 'a'-'f') span two 64-byte data cache lines
You can remedy the extra cache-line miss by properly aligning and offsetting the data (your compiler may be able to do this for you if you profile your code), something like:
char lut_func(char c){
    /* '0'+16 == 64, so with 64-byte alignment the hot entries
       ('0'..'f') all land in a single cache line */
    static const char __attribute__((aligned(64))) lut_data[256+16] = {
        ['0'+16]=0x40, ['1'+16]=0x41, ['2'+16]=0x42, ['3'+16]=0x43,
        ['4'+16]=0x44, ['5'+16]=0x45, ['6'+16]=0x46, ['7'+16]=0x47,
        ['8'+16]=0x48, ['9'+16]=0x49, ['a'+16]=0x4a, ['b'+16]=0x4b,
        ['c'+16]=0x4c, ['d'+16]=0x4d, ['e'+16]=0x4e, ['f'+16]=0x4f,
        /* everything else is set to 0 automatically */
    };
    const char *lut = lut_data + 16;
    return lut[(unsigned char)c];
}
It's hard to say whether this will help much, since neither the makeup of the data nor the benchmark was included.
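If you do have representative input, a rough harness like the one below is enough to tell the two table versions apart. This is only a sketch, not part of the original question: it assumes lut_func is whichever variant you are timing (rename one of them if you keep both in the same file), and it fills the buffer entirely with hex digits, which may or may not match your real data.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(void){
    enum { N = 1 << 20 };              /* 1 MiB of input */
    static char buf[N];
    const char *hex = "0123456789abcdef";
    srand(1);
    for (size_t i = 0; i < N; i++)
        buf[i] = hex[rand() & 15];     /* all hot-path characters; adjust to taste */

    unsigned sum = 0;                  /* checksum keeps the loop from being optimized away */
    clock_t t0 = clock();
    for (int rep = 0; rep < 100; rep++)
        for (size_t i = 0; i < N; i++)
            sum += (unsigned char)lut_func(buf[i]);
    double secs = (double)(clock() - t0) / CLOCKS_PER_SEC;

    printf("lut_func: %.3fs (checksum %u)\n", secs, sum);
    return 0;
}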
The hand-written SSE2 code (though clever) unfortunately contains non-SSE2 pieces (__builtin_ctz, the if, and the scalar char array access) that slow it down and make it difficult to auto-vectorize, especially if you are limited to SSE2. This is simply less efficient than a single table access when the data is already "hot". It may still be worth using the SSE2 version if it is called infrequently, but if that were the case you wouldn't need to optimize it in the first place.
If you can access the data sequentially, you can use vector extensions to get SIMD code, something like this:
//this vector extension syntax requires gcc or clang versions 5+
typedef __INT8_TYPE__ i8x16 __attribute__ ((__vector_size__ (16), aligned(16), __may_alias__));

i8x16 vec_func(i8x16 c){
    i8x16 is09 = (c>='0') & (c<='9');
    i8x16 isaf = (c>='a') & (c<='f');
    return (c & (is09 | isaf)) + (16 & is09) - (23 & isaf);
}
Compiled on architectures with SIMD instructions (x86_64, arm+neon, ppc+altivec, etc.), this comes to roughly 20 instructions and accesses around 80 bytes of data to convert 16 sequential characters (with AVX2 you can do 32 with minimal modification).
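For reference, that 32-lane variant could look like the sketch below; the i8x32 type and the name vec_func32 are my own, and you would need to build with AVX2 (or another 256-bit target) enabled for it to stay in a single register:

typedef __INT8_TYPE__ i8x32 __attribute__ ((__vector_size__ (32), aligned(32), __may_alias__));

i8x32 vec_func32(i8x32 c){
    i8x32 is09 = (c>='0') & (c<='9');
    i8x32 isaf = (c>='a') & (c<='f');
    return (c & (is09 | isaf)) + (16 & is09) - (23 & isaf);
}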
For example, the 16-lane version compiled for generic x86_64 yields:
vec_func:
movdqa xmm1, xmm0
pcmpgtb xmm1, xmmword ptr [rip + .LCPI0_0]
movdqa xmm2, xmmword ptr [rip + .LCPI0_1] # xmm2 = [58,58,58,58,58,58,58,58,58,58,58,58,58,58,58,58]
pcmpgtb xmm2, xmm0
movdqa xmm3, xmm0
pcmpgtb xmm3, xmmword ptr [rip + .LCPI0_2]
pand xmm2, xmm1
movdqa xmm1, xmmword ptr [rip + .LCPI0_3] # xmm1 = [103,103,103,103,103,103,103,103,103,103,103,103,103,103,103,103]
pcmpgtb xmm1, xmm0
pand xmm1, xmm3
movdqa xmm3, xmm2
por xmm3, xmm1
pand xmm3, xmm0
pand xmm2, xmmword ptr [rip + .LCPI0_4]
pand xmm1, xmmword ptr [rip + .LCPI0_5]
por xmm1, xmm2
paddb xmm1, xmm3
movdqa xmm0, xmm1
ret
or with AVX2 enabled
vec_func:
vpcmpgtb xmm1, xmm0, xmmword ptr [rip + .LCPI0_0]
vmovdqa xmm2, xmmword ptr [rip + .LCPI0_1] # xmm2 = [58,58,58,58,58,58,58,58,58,58,58,58,58,58,58,58]
vpcmpgtb xmm2, xmm2, xmm0
vpcmpgtb xmm3, xmm0, xmmword ptr [rip + .LCPI0_2]
vpand xmm1, xmm1, xmm2
vmovdqa xmm2, xmmword ptr [rip + .LCPI0_3] # xmm2 = [103,103,103,103,103,103,103,103,103,103,103,103,103,103,103,103]
vpcmpgtb xmm2, xmm2, xmm0
vpand xmm2, xmm3, xmm2
vpor xmm3, xmm1, xmm2
vpand xmm0, xmm3, xmm0
vpand xmm1, xmm1, xmmword ptr [rip + .LCPI0_4]
vpand xmm2, xmm2, xmmword ptr [rip + .LCPI0_5]
vpor xmm1, xmm2, xmm1
vpaddb xmm0, xmm1, xmm0
ret
and aarch64
vec_func:
movi v2.16b, 0x61
movi v4.16b, 0x66
movi v1.16b, 0x30
movi v5.16b, 0x39
cmge v3.16b, v0.16b, v2.16b
cmge v2.16b, v4.16b, v0.16b
cmge v1.16b, v0.16b, v1.16b
cmge v5.16b, v5.16b, v0.16b
movi v4.16b, 0x10
and v2.16b, v3.16b, v2.16b
and v1.16b, v1.16b, v5.16b
movi v5.16b, 0x17
and v3.16b, v1.16b, v4.16b
orr v1.16b, v1.16b, v2.16b
and v2.16b, v2.16b, v5.16b
and v1.16b, v1.16b, v0.16b
add v1.16b, v1.16b, v3.16b
sub v0.16b, v1.16b, v2.16b
ret
or power9
vec_func:
xxspltib 35, 47
xxspltib 36, 58
vcmpgtsb 3, 2, 3
vcmpgtsb 4, 4, 2
xxland 0, 35, 36
xxspltib 35, 96
xxspltib 36, 103
vcmpgtsb 3, 2, 3
vcmpgtsb 4, 4, 2
xxland 1, 35, 36
xxlor 2, 0, 1
xxlxor 3, 3, 3
xxsel 34, 3, 34, 2
xxspltib 2, 16
xxsel 35, 3, 2, 0
xxspltib 0, 233
xxsel 36, 3, 0, 1
xxlor 35, 36, 35
vaddubm 2, 3, 2
blr
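To actually consume a buffer with the 16-lane function you still need a small driver loop. Here is one possible sketch (convert_buf and the scalar tail are my additions, assuming the i8x16 typedef, vec_func, and one of the lut_func variants above are in scope, and that the buffer may be unaligned and not a multiple of 16):

#include <string.h>
#include <stddef.h>

void convert_buf(char *dst, const char *src, size_t n){
    size_t i = 0;
    for (; i + 16 <= n; i += 16){        /* 16 characters per iteration */
        i8x16 v;
        memcpy(&v, src + i, 16);         /* memcpy handles unaligned input */
        v = vec_func(v);
        memcpy(dst + i, &v, 16);
    }
    for (; i < n; i++)                   /* leftover tail, scalar fallback */
        dst[i] = lut_func(src[i]);
}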