Consider the following code:
typedef unsigned uint;

/*
 * Return the parity of x: 1 if x has an odd number of set bits, 0 otherwise.
 *
 * The value is XOR-folded onto itself, halving the folded width each step
 * (64 -> 32 -> 16 -> 8 -> 4 -> 2 -> 1), so that bit 0 ends up holding the
 * XOR of all 64 input bits. The higher bits of the working value are
 * leftovers and are masked off at the end.
 */
uint parity( uint64_t x )
{
    uint32_t upper = (uint32_t)(x >> 32);
    uint32_t lower = (uint32_t)x;
    uint32_t fold  = upper ^ lower;     /* bit i = x[i] ^ x[i + 32] */

    fold ^= fold >> 16;
    fold ^= fold >> 8;
    fold ^= fold >> 4;
    fold ^= fold >> 2;
    fold ^= fold >> 1;                  /* bit 0 = XOR of every bit of x */

    return (uint)(fold & 1u);
}
Is there a way of radically reorganising this code to get a serious improvement due to instruction-level parallelism on, say, an Intel x86-64 machine?
GCC produced the following code:
;-----------------------------------------------------------------------
; Compiler output for: uint parity(uint64_t x)   (Intel operand order)
; In:    rdi = x (consistent with the System V AMD64 first-argument register)
; Out:   eax = 1 if x has an odd number of set bits, else 0
; Clobb: rax, rdi, rdx, flags
; Stack: not used; no calls are made
;
; Every fold step below reads the eax written by the previous step, so
; the mov/shr/xor triples form a single serial dependency chain.
;-----------------------------------------------------------------------
parity(unsigned long):
; v = (uint32_t)(x ^ (x >> 32)): fold the upper half of x onto the lower half
mov rax, rdi
shr rax, 32
xor eax, edi
; v ^= v >> 16 (edi is reused as scratch; x itself is no longer needed)
mov edi, eax
shr edi, 16
xor eax, edi
; v ^= v >> 8
mov edi, eax
shr edi, 8
xor eax, edi
; v ^= v >> 4
mov edi, eax
shr edi, 4
xor eax, edi
; v ^= v >> 2
mov edi, eax
shr edi, 2
xor eax, edi
; result = (v ^ (v >> 1)) & 1
; Here the copy is kept in edx and eax itself is shifted; "shr eax" with no
; count operand is the shift-by-one form.
mov edx, eax
shr eax
xor eax, edx
; keep only bit 0, which now holds the XOR of all 64 bits of x
and eax, 1
ret