I was looking at the tricks employed in How to multiply two quaternions with minimal instructions? and was dismayed at how much worse my GCC vector-extension implementation turned out:
#include <cstdint>

template <typename T> struct quat;

template <> struct quat<float> { float __attribute__((vector_size(16))) data_; };

inline quat<float> operator*(quat<float> const& l, quat<float> const& r) noexcept
{
    // layout is (x, y, z, w), i.e. the scalar part lives in lane 3:
    //  l(0)r(3) + l(1)r(2) - l(2)r(1) + l(3)r(0)
    // -l(0)r(2) + l(1)r(3) + l(2)r(0) + l(3)r(1)
    //  l(0)r(1) - l(1)r(0) + l(2)r(3) + l(3)r(2)
    // -l(0)r(0) - l(1)r(1) - l(2)r(2) + l(3)r(3)
    using int_value_type = ::std::int32_t;
    using int_vector_type = ::std::int32_t __attribute__((vector_size(16)));

    auto const t1(
        l.data_ *
        __builtin_shuffle(r.data_, int_vector_type{3, 3, 3, 3})
    );
    auto const t2(
        __builtin_shuffle(l.data_, int_vector_type{1, 2, 0, 2}) *
        __builtin_shuffle(r.data_, int_vector_type{2, 0, 1, 2})
    );
    auto const t3(
        __builtin_shuffle(l.data_, int_vector_type{3, 3, 3, 1}) *
        __builtin_shuffle(r.data_, int_vector_type{0, 1, 2, 1})
    );
    auto const t4(
        __builtin_shuffle(l.data_, int_vector_type{2, 0, 1, 0}) *
        __builtin_shuffle(r.data_, int_vector_type{1, 2, 0, 0})
    );

    // sign bit of lane 3 only; XORing with this negates the w lane
    constexpr int_vector_type mask{
        0, 0, 0, int_value_type(1u << (8 * sizeof(int_value_type) - 1))
    };

    return {
        t1 +
        decltype(t2)(int_vector_type(t2) ^ mask) +
        decltype(t3)(int_vector_type(t3) ^ mask) -
        t4
    };
}
This makes for 4 MULs, 2 ADDs, 1 SUB and 2 XORs. Is there some trick in the GCC vector extensions, or in the math, that I've missed and that could bring my implementation closer to the more optimal one?
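As a quick sanity check of the lane layout (a minimal test harness, not part of the actual code; it assumes the quat&lt;float&gt; and operator* definitions above are in scope): with the scalar part in lane 3, i * j should come out as k.

#include <cassert>

int main()
{
    quat<float> const i{{1.f, 0.f, 0.f, 0.f}};  // pure quaternion i, scalar part in lane 3
    quat<float> const j{{0.f, 1.f, 0.f, 0.f}};  // pure quaternion j
    auto const k(i * j);                        // Hamilton product, expect (0, 0, 1 | 0)
    assert(k.data_[0] == 0.f && k.data_[1] == 0.f &&
           k.data_[2] == 1.f && k.data_[3] == 0.f);
    return 0;
}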
EDIT: asm generated with -O3 -ffast-math:
Dump of assembler code for function vxl::operator*<float>(vxl::quat<float> const&, vxl::quat<float> const&):
0x0000000000400980 <+0>: movaps (%rsi),%xmm2
0x0000000000400983 <+3>: movaps (%rdi),%xmm1
0x0000000000400986 <+6>: movaps %xmm2,%xmm0
0x0000000000400989 <+9>: movaps %xmm2,%xmm4
0x000000000040098c <+12>: movaps %xmm1,%xmm3
0x000000000040098f <+15>: shufps $0xff,%xmm2,%xmm0
0x0000000000400993 <+19>: shufps $0x9,%xmm2,%xmm4
0x0000000000400997 <+23>: shufps $0x12,%xmm1,%xmm3
0x000000000040099b <+27>: mulps %xmm1,%xmm0
0x000000000040099e <+30>: mulps %xmm4,%xmm3
0x00000000004009a1 <+33>: movaps %xmm0,%xmm4
0x00000000004009a4 <+36>: movaps %xmm1,%xmm0
0x00000000004009a7 <+39>: subps %xmm3,%xmm4
0x00000000004009aa <+42>: movaps %xmm2,%xmm3
0x00000000004009ad <+45>: shufps $0x7f,%xmm1,%xmm0
0x00000000004009b1 <+49>: shufps $0x64,%xmm2,%xmm3
0x00000000004009b5 <+53>: shufps $0x89,%xmm1,%xmm1
0x00000000004009b9 <+57>: shufps $0x92,%xmm2,%xmm2
0x00000000004009bd <+61>: mulps %xmm3,%xmm0
0x00000000004009c0 <+64>: movaps 0xa9(%rip),%xmm3 # 0x400a70
0x00000000004009c7 <+71>: mulps %xmm2,%xmm1
0x00000000004009ca <+74>: xorps %xmm3,%xmm0
0x00000000004009cd <+77>: xorps %xmm3,%xmm1
0x00000000004009d0 <+80>: addps %xmm0,%xmm1
0x00000000004009d3 <+83>: addps %xmm1,%xmm4
0x00000000004009d6 <+86>: movaps %xmm4,%xmm0
0x00000000004009d9 <+89>: retq
EDIT2: with Marc's suggestion and clang, the code becomes:
Dump of assembler code for function vxl::operator*<float>(vxl::quat<float> const&, vxl::quat<float> const&):
0x0000000000400ae0 <+0>: movaps (%rdi),%xmm1
0x0000000000400ae3 <+3>: movdqa (%rsi),%xmm2
0x0000000000400ae7 <+7>: pshufd $0xff,%xmm2,%xmm3
0x0000000000400aec <+12>: mulps %xmm1,%xmm3
0x0000000000400aef <+15>: pshufd $0x89,%xmm1,%xmm0
0x0000000000400af4 <+20>: pshufd $0x92,%xmm2,%xmm4
0x0000000000400af9 <+25>: mulps %xmm0,%xmm4
0x0000000000400afc <+28>: pshufd $0x7f,%xmm1,%xmm5
0x0000000000400b01 <+33>: pshufd $0x64,%xmm2,%xmm0
0x0000000000400b06 <+38>: mulps %xmm5,%xmm0
0x0000000000400b09 <+41>: pshufd $0x12,%xmm1,%xmm1
0x0000000000400b0e <+46>: pshufd $0x9,%xmm2,%xmm2
0x0000000000400b13 <+51>: mulps %xmm1,%xmm2
0x0000000000400b16 <+54>: addps %xmm4,%xmm0
0x0000000000400b19 <+57>: xorps 0xc0(%rip),%xmm0 # 0x400be0
0x0000000000400b20 <+64>: addps %xmm3,%xmm0
0x0000000000400b23 <+67>: subps %xmm2,%xmm0
0x0000000000400b26 <+70>: retq
This is not bad: 4 MULs, 2 ADDs, 1 SUB and only a single XOR.
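For reference, this is how I would write that change back into the source; it is reconstructed from the clang output above rather than quoted from Marc, so treat it as a sketch. Since t2 and t3 both need the w lane negated, summing them first lets one XOR do the work of two:

inline quat<float> operator*(quat<float> const& l, quat<float> const& r) noexcept
{
    using int_value_type = ::std::int32_t;
    using int_vector_type = ::std::int32_t __attribute__((vector_size(16)));

    // same four products as in the original version
    auto const t1(
        l.data_ *
        __builtin_shuffle(r.data_, int_vector_type{3, 3, 3, 3})
    );
    auto const t2(
        __builtin_shuffle(l.data_, int_vector_type{1, 2, 0, 2}) *
        __builtin_shuffle(r.data_, int_vector_type{2, 0, 1, 2})
    );
    auto const t3(
        __builtin_shuffle(l.data_, int_vector_type{3, 3, 3, 1}) *
        __builtin_shuffle(r.data_, int_vector_type{0, 1, 2, 1})
    );
    auto const t4(
        __builtin_shuffle(l.data_, int_vector_type{2, 0, 1, 0}) *
        __builtin_shuffle(r.data_, int_vector_type{1, 2, 0, 0})
    );

    // sign bit of lane 3 only
    constexpr int_vector_type mask{
        0, 0, 0, int_value_type(1u << (8 * sizeof(int_value_type) - 1))
    };

    // t2 and t3 need the same sign flip, so add them first and apply the
    // mask once: 4 MULs, 2 ADDs, 1 SUB, 1 XOR
    return {
        t1 +
        decltype(t2)(int_vector_type(t2 + t3) ^ mask) -
        t4
    };
}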