10

After some thought, I came up with the following code for multiplying two quaternions using SSE:

#include <pmmintrin.h> /* SSE3 intrinsics */

/* multiplication of two quaternions (x, y, z, w) x (a, b, c, d) */

__m128 _mm_cross4_ps(__m128 xyzw, __m128 abcd)
{
    /* The product of two quaternions is:                                 */
    /* (X,Y,Z,W) = (xd+yc-zb+wa, -xc+yd+za+wb, xb-ya+zd+wc, -xa-yb-zc+wd) */

    __m128 wzyx = _mm_shuffle_ps(xyzw, xyzw, _MM_SHUFFLE(0,1,2,3));
    __m128 baba = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(0,1,0,1));
    __m128 dcdc = _mm_shuffle_ps(abcd, abcd, _MM_SHUFFLE(2,3,2,3));

    /* variable names below are for parts of componens of result (X,Y,Z,W) */
    /* nX stands for -X and similarly for the other components             */

    /* znxwy  = (xb - ya, zb - wa, wd - zc, yd - xc) */
    __m128 ZnXWY = _mm_hsub_ps(_mm_mul_ps(xyzw, baba), _mm_mul_ps(wzyx, dcdc));

    /* xzynw  = (xd + yc, zd + wc, wb + za, yb + xa) */
    __m128 XZYnW = _mm_hadd_ps(_mm_mul_ps(xyzw, dcdc), _mm_mul_ps(wzyx, baba));

    /* _mm_shuffle_ps(XZYnW, ZnXWY, _MM_SHUFFLE(3,2,1,0)) */
    /*      = (xd + yc, zd + wc, wd - zc, yd - xc)        */
    /* _mm_shuffle_ps(ZnXWY, XZYnW, _MM_SHUFFLE(2,3,0,1)) */
    /*      = (zb - wa, xb - ya, yb + xa, wb + za)        */

    /* _mm_addsub_ps adds elements 1 and 3 and subtracts elements 0 and 2, so we get: */
    /* _mm_addsub_ps(*, *) = (xd+yc-zb+wa, xb-ya+zd+wc, wd-zc+yb+xa, yd-xc+wb+za)     */

    __m128 XZWY = _mm_addsub_ps(_mm_shuffle_ps(XZYnW, ZnXWY, _MM_SHUFFLE(3,2,1,0)),
                                _mm_shuffle_ps(ZnXWY, XZYnW, _MM_SHUFFLE(2,3,0,1)));

    /* now we only need to shuffle the components in place and return the result      */
    return _mm_shuffle_ps(XZWY, XZWY, _MM_SHUFFLE(2,1,3,0));

    /* operations: 6 shuffles, 4 multiplications, 3 compound additions/subtractions   */
}

I expected the assembly to have a minimal amount of instructions. However, when I compile it to assembly with gcc -msse3 -S, the resulting function has 67 instructions:

    .text   
    .globl __Z13_mm_cross4_psU8__vectorfS_
__Z13_mm_cross4_psU8__vectorfS_:
LFB594:
    pushq   %rbp
LCFI0:
    movq    %rsp, %rbp
LCFI1:
    subq    $232, %rsp
    movaps  %xmm0, -336(%rbp)
    movaps  %xmm1, -352(%rbp)
    movaps  -336(%rbp), %xmm0
    movaps  -336(%rbp), %xmm1
    shufps  $27, %xmm1, %xmm0
    movaps  %xmm0, -16(%rbp)
    movaps  -352(%rbp), %xmm0
    movaps  -352(%rbp), %xmm1
    shufps  $17, %xmm1, %xmm0
    movaps  %xmm0, -32(%rbp)
    movaps  -352(%rbp), %xmm0
    movaps  -352(%rbp), %xmm1
    shufps  $187, %xmm1, %xmm0
    movaps  %xmm0, -48(%rbp)
    movaps  -16(%rbp), %xmm0
    movaps  %xmm0, -112(%rbp)
    movaps  -48(%rbp), %xmm0
    movaps  %xmm0, -128(%rbp)
    movaps  -128(%rbp), %xmm0
    movaps  -112(%rbp), %xmm1
    mulps   %xmm1, %xmm0
    movaps  -336(%rbp), %xmm1
    movaps  %xmm1, -144(%rbp)
    movaps  -32(%rbp), %xmm1
    movaps  %xmm1, -160(%rbp)
    movaps  -160(%rbp), %xmm1
    movaps  -144(%rbp), %xmm2
    mulps   %xmm2, %xmm1
    movaps  %xmm1, -176(%rbp)
    movaps  %xmm0, -192(%rbp)
    movaps  -176(%rbp), %xmm0
    hsubps  -192(%rbp), %xmm0
    movaps  %xmm0, -64(%rbp)
    movaps  -16(%rbp), %xmm0
    movaps  %xmm0, -208(%rbp)
    movaps  -32(%rbp), %xmm0
    movaps  %xmm0, -224(%rbp)
    movaps  -224(%rbp), %xmm0
    movaps  -208(%rbp), %xmm1
    mulps   %xmm1, %xmm0
    movaps  -336(%rbp), %xmm1
    movaps  %xmm1, -240(%rbp)
    movaps  -48(%rbp), %xmm1
    movaps  %xmm1, -256(%rbp)
    movaps  -256(%rbp), %xmm1
    movaps  -240(%rbp), %xmm2
    mulps   %xmm2, %xmm1
    movaps  %xmm1, -272(%rbp)
    movaps  %xmm0, -288(%rbp)
    movaps  -272(%rbp), %xmm0
    haddps  -288(%rbp), %xmm0
    movaps  %xmm0, -80(%rbp)
    movaps  -64(%rbp), %xmm0
    movaps  -80(%rbp), %xmm1
    shufps  $177, %xmm1, %xmm0
    movaps  -80(%rbp), %xmm1
    movaps  -64(%rbp), %xmm2
    shufps  $228, %xmm2, %xmm1
    movaps  %xmm1, -304(%rbp)
    movaps  %xmm0, -320(%rbp)
    movaps  -304(%rbp), %xmm0
    addsubps        -320(%rbp), %xmm0
    movaps  %xmm0, -96(%rbp)
    movaps  -96(%rbp), %xmm0
    movaps  -96(%rbp), %xmm1
    shufps  $156, %xmm1, %xmm0
    leave
LCFI2:
    ret

What am I doing wrong? There must a better way to shuffle the elements around without using so many movaps instructions.

Guilherme Amadio
  • 372
  • 2
  • 13
  • To get the most out of SIMD you should do multiple operations in parallel not one operation. For SSE this means you should do four quaternion multiplications (multiply eight quaternions) at once. Then you don't have to use any horizontal operators and you should get 4x the speed of non-SIMD code. – Z boson Sep 01 '13 at 18:48
  • Try to avoid horizontal operations -- I think the same number of instructions is possible when arranging the calculation as vertical sequence of `xyzw.*dddd + yzwx.*cacA + zwxy.*BbbB - wxyz.*Acac` and (`A=-a, B=-b`). – Aki Suihkonen Sep 03 '13 at 06:33
  • I understand what you have to say, but a quaternion is already composed of four floats, and usually in my code I don't have a loop with many multiplications, but a single multiplication at a time, so doing several quaternions at once doesn't help. – Guilherme Amadio Sep 09 '13 at 20:06

2 Answers2

5

Never mind. If I compile the code with gcc -msse3 -O1 -S instead, I get the following:

    .text
    .align 4,0x90
    .globl __Z13_mm_cross4_psU8__vectorfS_
__Z13_mm_cross4_psU8__vectorfS_:
LFB644:
    movaps  %xmm0, %xmm5
    movaps  %xmm1, %xmm3
    movaps  %xmm0, %xmm2
    shufps  $27, %xmm0, %xmm5
    movaps  %xmm5, %xmm4
    shufps  $17, %xmm1, %xmm3
    shufps  $187, %xmm1, %xmm1
    mulps   %xmm3, %xmm2
    mulps   %xmm1, %xmm4
    mulps   %xmm5, %xmm3
    mulps   %xmm1, %xmm0
    hsubps  %xmm4, %xmm2
    haddps  %xmm3, %xmm0
    movaps  %xmm2, %xmm1
    shufps  $177, %xmm0, %xmm1
    shufps  $228, %xmm2, %xmm0
    addsubps        %xmm1, %xmm0
    shufps  $156, %xmm0, %xmm0
    ret

That's only 18 instructions now. That's what I expected in the beginning. Oops.

Guilherme Amadio
  • 372
  • 2
  • 13
4

You may be interested in the Agner Fog's C++ vector class library. It provides a Quaternion4f and Quaternion4d classes (including * and *= operators, of course), implemented by using SSE2 and AVX instruction sets respectively. The library is an Open Source project, so you may dig into the code and find a good implementation example to build your function on.

Later on, you may consult the "optimizing subroutines in assembly language" manual and provide an optimized, pure assembly implementation of the function or, while being aware of some low-level tricks, try to redesign the intrinsics approach in C.

Krzysztof Abramowicz
  • 1,556
  • 1
  • 12
  • 30