I was looking at the tricks employed in How to multiply two quaternions with minimal instructions? and was dismayed at how much worse my GCC vector-extension implementation turned out:
#include <cstdint>

template <typename T> struct quat;

template <> struct quat<float> { float __attribute__((vector_size(16))) data_; };

inline quat<float> operator*(quat<float> const& l, quat<float> const& r) noexcept
{
    // layout is (x, y, z, w), i.e. the scalar part lives in lane 3:
    //  l(0)r(3) + l(1)r(2) - l(2)r(1) + l(3)r(0)
    // -l(0)r(2) + l(1)r(3) + l(2)r(0) + l(3)r(1)
    //  l(0)r(1) - l(1)r(0) + l(2)r(3) + l(3)r(2)
    // -l(0)r(0) - l(1)r(1) - l(2)r(2) + l(3)r(3)
    using int_value_type = ::std::int32_t;
    using int_vector_type = ::std::int32_t __attribute__((vector_size(16)));

    auto const t1(
        l.data_ *
        __builtin_shuffle(r.data_, int_vector_type{3, 3, 3, 3})
    );
    auto const t2(
        __builtin_shuffle(l.data_, int_vector_type{1, 2, 0, 2}) *
        __builtin_shuffle(r.data_, int_vector_type{2, 0, 1, 2})
    );
    auto const t3(
        __builtin_shuffle(l.data_, int_vector_type{3, 3, 3, 1}) *
        __builtin_shuffle(r.data_, int_vector_type{0, 1, 2, 1})
    );
    auto const t4(
        __builtin_shuffle(l.data_, int_vector_type{2, 0, 1, 0}) *
        __builtin_shuffle(r.data_, int_vector_type{1, 2, 0, 0})
    );

    // sign bit of lane 3 only; XORing with this negates the w lane
    constexpr int_vector_type mask{
        0, 0, 0, int_value_type(1u << (8 * sizeof(int_value_type) - 1))
    };

    return {
        t1 +
        decltype(t2)(int_vector_type(t2) ^ mask) +
        decltype(t3)(int_vector_type(t3) ^ mask) -
        t4
    };
}
This makes for 4 MULs, 2 ADDs, 1 SUB and 2 XORs. Is there some trick in the GCC vector extensions, or in the math, that I've missed and that could bring my implementation closer to the more optimal one?
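As a quick sanity check of the lane layout (a minimal test harness, not part of the actual code; it assumes the quat&lt;float&gt; and operator* definitions above are in scope): with the scalar part in lane 3, i * j should come out as k.

#include <cassert>

int main()
{
    quat<float> const i{{1.f, 0.f, 0.f, 0.f}};  // pure quaternion i, scalar part in lane 3
    quat<float> const j{{0.f, 1.f, 0.f, 0.f}};  // pure quaternion j
    auto const k(i * j);                        // Hamilton product, expect (0, 0, 1 | 0)
    assert(k.data_[0] == 0.f && k.data_[1] == 0.f &&
           k.data_[2] == 1.f && k.data_[3] == 0.f);
    return 0;
}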
EDIT: asm generated with -O3 -ffast-math:
Dump of assembler code for function vxl::operator*<float>(vxl::quat<float> const&, vxl::quat<float> const&):
0x0000000000400980 <+0>: movaps (%rsi),%xmm2
0x0000000000400983 <+3>: movaps (%rdi),%xmm1
0x0000000000400986 <+6>: movaps %xmm2,%xmm0
0x0000000000400989 <+9>: movaps %xmm2,%xmm4
0x000000000040098c <+12>: movaps %xmm1,%xmm3
0x000000000040098f <+15>: shufps $0xff,%xmm2,%xmm0
0x0000000000400993 <+19>: shufps $0x9,%xmm2,%xmm4
0x0000000000400997 <+23>: shufps $0x12,%xmm1,%xmm3
0x000000000040099b <+27>: mulps %xmm1,%xmm0
0x000000000040099e <+30>: mulps %xmm4,%xmm3
0x00000000004009a1 <+33>: movaps %xmm0,%xmm4
0x00000000004009a4 <+36>: movaps %xmm1,%xmm0
0x00000000004009a7 <+39>: subps %xmm3,%xmm4
0x00000000004009aa <+42>: movaps %xmm2,%xmm3
0x00000000004009ad <+45>: shufps $0x7f,%xmm1,%xmm0
0x00000000004009b1 <+49>: shufps $0x64,%xmm2,%xmm3
0x00000000004009b5 <+53>: shufps $0x89,%xmm1,%xmm1
0x00000000004009b9 <+57>: shufps $0x92,%xmm2,%xmm2
0x00000000004009bd <+61>: mulps %xmm3,%xmm0
0x00000000004009c0 <+64>: movaps 0xa9(%rip),%xmm3 # 0x400a70
0x00000000004009c7 <+71>: mulps %xmm2,%xmm1
0x00000000004009ca <+74>: xorps %xmm3,%xmm0
0x00000000004009cd <+77>: xorps %xmm3,%xmm1
0x00000000004009d0 <+80>: addps %xmm0,%xmm1
0x00000000004009d3 <+83>: addps %xmm1,%xmm4
0x00000000004009d6 <+86>: movaps %xmm4,%xmm0
0x00000000004009d9 <+89>: retq
EDIT2: with Marc's suggestion and clang, the code becomes:
Dump of assembler code for function vxl::operator*<float>(vxl::quat<float> const&, vxl::quat<float> const&):
0x0000000000400ae0 <+0>: movaps (%rdi),%xmm1
0x0000000000400ae3 <+3>: movdqa (%rsi),%xmm2
0x0000000000400ae7 <+7>: pshufd $0xff,%xmm2,%xmm3
0x0000000000400aec <+12>: mulps %xmm1,%xmm3
0x0000000000400aef <+15>: pshufd $0x89,%xmm1,%xmm0
0x0000000000400af4 <+20>: pshufd $0x92,%xmm2,%xmm4
0x0000000000400af9 <+25>: mulps %xmm0,%xmm4
0x0000000000400afc <+28>: pshufd $0x7f,%xmm1,%xmm5
0x0000000000400b01 <+33>: pshufd $0x64,%xmm2,%xmm0
0x0000000000400b06 <+38>: mulps %xmm5,%xmm0
0x0000000000400b09 <+41>: pshufd $0x12,%xmm1,%xmm1
0x0000000000400b0e <+46>: pshufd $0x9,%xmm2,%xmm2
0x0000000000400b13 <+51>: mulps %xmm1,%xmm2
0x0000000000400b16 <+54>: addps %xmm4,%xmm0
0x0000000000400b19 <+57>: xorps 0xc0(%rip),%xmm0 # 0x400be0
0x0000000000400b20 <+64>: addps %xmm3,%xmm0
0x0000000000400b23 <+67>: subps %xmm2,%xmm0
0x0000000000400b26 <+70>: retq
This is not bad: 4 MULs, 2 ADDs, 1 SUB and only a single XOR.
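For reference, this is how I would write that change back into the source; it is reconstructed from the clang output above rather than quoted from Marc, so treat it as a sketch. Since t2 and t3 both need the w lane negated, summing them first lets one XOR do the work of two:

inline quat<float> operator*(quat<float> const& l, quat<float> const& r) noexcept
{
    using int_value_type = ::std::int32_t;
    using int_vector_type = ::std::int32_t __attribute__((vector_size(16)));

    // same four products as in the original version
    auto const t1(
        l.data_ *
        __builtin_shuffle(r.data_, int_vector_type{3, 3, 3, 3})
    );
    auto const t2(
        __builtin_shuffle(l.data_, int_vector_type{1, 2, 0, 2}) *
        __builtin_shuffle(r.data_, int_vector_type{2, 0, 1, 2})
    );
    auto const t3(
        __builtin_shuffle(l.data_, int_vector_type{3, 3, 3, 1}) *
        __builtin_shuffle(r.data_, int_vector_type{0, 1, 2, 1})
    );
    auto const t4(
        __builtin_shuffle(l.data_, int_vector_type{2, 0, 1, 0}) *
        __builtin_shuffle(r.data_, int_vector_type{1, 2, 0, 0})
    );

    // sign bit of lane 3 only
    constexpr int_vector_type mask{
        0, 0, 0, int_value_type(1u << (8 * sizeof(int_value_type) - 1))
    };

    // t2 and t3 need the same sign flip, so add them first and apply the
    // mask once: 4 MULs, 2 ADDs, 1 SUB, 1 XOR
    return {
        t1 +
        decltype(t2)(int_vector_type(t2 + t3) ^ mask) -
        t4
    };
}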