2

I've been operating on a rough rule of thumb that Q-form ASIMD instructions are as good or better than D-form if you've got enough data to operate on. I was therefore surprised to see when reading §3.15 of the Cortex-A72 Software Optimization Guide that FADDP has a throughput of 2 for D-form and 2/3 for Q-form (for reference the latencies are 4 and 7 respectively). All of the other instructions that have different performance for D and Q form have only a minor latency difference at worst (e.g. 3 vs. 4 for FRINTX) and have throughput either the same or exactly half (see the guide's table "Cortex-A72 ASIMD Floating-Point Instruction Characteristics").

What's so special about FADDP that its throughput is cut by a third for the Q-form, and (if you have the frontend bandwidth) can you really get increased throughput by replacing the Q-form with two D-form instructions?

Testing and Benchmarking:

I wrote up a couple c++ functions to try exercising a cortex-a72 both ways:

// Q-form benchmark kernel: issues 8 chained 128-bit (Q-form) FADDP
// operations per iteration, 8192 iterations total.
// The x[k] -> y[k] -> x[k] write pattern is deliberate: it creates four
// independent dependency chains so throughput (not a single chain's
// latency) dominates, while keeping the compiler from eliding the work.
// Do NOT reorder statements — the measured dependency structure is the
// whole point of the benchmark.
// In:  i points to at least 32 readable floats; the first 16 are
//      overwritten with the accumulated result on return.
// NOTE(review): the loop counter `int i` shadows the pointer parameter
// `i`; the vst1q at the end sees the pointer again once the loop scope
// closes. Legal, but easy to misread.
void q(float *i) {
  auto x = vld1q_f32_x4(i);        // load 16 floats into 4 Q registers
  auto y = vld1q_f32_x4(i + 16);   // load the next 16 floats
  for (int i = 0; i < 8192; ++i) {
    // First half: x[k] = pairwise-add(x[k], y[k])  (Q-form faddp)
    x.val[0] = vpaddq_f32(x.val[0], y.val[0]);
    x.val[1] = vpaddq_f32(x.val[1], y.val[1]);
    x.val[2] = vpaddq_f32(x.val[2], y.val[2]);
    x.val[3] = vpaddq_f32(x.val[3], y.val[3]);
    // Second half: y[k] = pairwise-add(x[k], y[k]) using the x[k] values
    // just produced above — this is what forms the serial chain.
    y.val[0] = vpaddq_f32(x.val[0], y.val[0]);
    y.val[1] = vpaddq_f32(x.val[1], y.val[1]);
    y.val[2] = vpaddq_f32(x.val[2], y.val[2]);
    y.val[3] = vpaddq_f32(x.val[3], y.val[3]);
  }
  vst1q_f32_x4(i, x);  // store x so the loop's results are observable
}

// D-form benchmark kernel: issues 16 chained 64-bit (D-form) FADDP
// operations per iteration, 8192 iterations — twice as many instructions
// as q() to process the same amount of data per iteration.
// As in q(), the cross-assignments between the x* and y* groups are
// deliberate dependency chains; do NOT reorder statements, since the
// measured dependency distances (see the 0x400b4c -> 0x400b60 discussion
// below) are exactly what the benchmark is probing.
// In:  i points to at least 32 readable floats; the first 16 are
//      overwritten with the accumulated result on return.
// NOTE(review): the loop counter `int i` shadows the pointer parameter
// `i`, same caveat as in q().
void d(float *i) {
  auto x0 = vld1_f32_x4(i);        // 4 D registers: floats [0, 8)
  auto x1 = vld1_f32_x4(i + 8);    // floats [8, 16)
  auto y0 = vld1_f32_x4(i + 16);   // floats [16, 24)
  auto y1 = vld1_f32_x4(i + 24);   // floats [24, 32)
  for (int i = 0; i < 8192; ++i) {
    // First half: overwrite x0/x1 lanes from pairwise sums of the
    // previous x and y values (D-form faddp).
    x0.val[0] = vpadd_f32(x0.val[0], x0.val[1]);
    x0.val[1] = vpadd_f32(y0.val[0], y0.val[1]);
    x0.val[2] = vpadd_f32(x0.val[2], x0.val[3]);
    x0.val[3] = vpadd_f32(y0.val[2], y0.val[3]);
    x1.val[0] = vpadd_f32(x1.val[0], x1.val[1]);
    x1.val[1] = vpadd_f32(y1.val[0], y1.val[1]);
    x1.val[2] = vpadd_f32(x1.val[2], x1.val[3]);
    x1.val[3] = vpadd_f32(y1.val[2], y1.val[3]);
    // Second half: overwrite y0/y1 from the values just written above,
    // forming the serial chains that carry across iterations.
    y0.val[0] = vpadd_f32(x0.val[0], x0.val[1]);
    y0.val[1] = vpadd_f32(y0.val[0], y0.val[1]);
    y0.val[2] = vpadd_f32(x0.val[2], x0.val[3]);
    y0.val[3] = vpadd_f32(y0.val[2], y0.val[3]);
    y1.val[0] = vpadd_f32(x1.val[0], x1.val[1]);
    y1.val[1] = vpadd_f32(y1.val[0], y1.val[1]);
    y1.val[2] = vpadd_f32(x1.val[2], x1.val[3]);
    y1.val[3] = vpadd_f32(y1.val[2], y1.val[3]);
  }
  vst1_f32_x4(i, x0);      // store results so the loop isn't dead code
  vst1_f32_x4(i + 8, x1);
}

When compiled with clang and -O3, they yield the following:

0000000000400a84 <_Z1qPf>:
  400a84:       aa0003e8        mov     x8, x0
  400a88:       4cdf2900        ld1     {v0.4s-v3.4s}, [x8], #64
  400a8c:       4c402904        ld1     {v4.4s-v7.4s}, [x8]
  400a90:       52840008        mov     w8, #0x2000                     // #8192
  400a94:       4ea21c54        mov     v20.16b, v2.16b
  400a98:       4ea11c35        mov     v21.16b, v1.16b
  400a9c:       4ea01c10        mov     v16.16b, v0.16b
  400aa0:       4ea61cd6        mov     v22.16b, v6.16b
  400aa4:       4ea51cb7        mov     v23.16b, v5.16b
  400aa8:       4ea41c98        mov     v24.16b, v4.16b
  400aac:       6e38d610        faddp   v16.4s, v16.4s, v24.4s
  400ab0:       6e37d6b5        faddp   v21.4s, v21.4s, v23.4s
  400ab4:       6e36d694        faddp   v20.4s, v20.4s, v22.4s
  400ab8:       6e27d463        faddp   v3.4s, v3.4s, v7.4s
  400abc:       71000508        subs    w8, w8, #0x1
  400ac0:       6e38d618        faddp   v24.4s, v16.4s, v24.4s
  400ac4:       6e37d6b7        faddp   v23.4s, v21.4s, v23.4s
  400ac8:       6e36d696        faddp   v22.4s, v20.4s, v22.4s
  400acc:       6e27d467        faddp   v7.4s, v3.4s, v7.4s
  400ad0:       54fffee1        b.ne    400aac <_Z1qPf+0x28>  // b.any
  400ad4:       4eb51eb1        mov     v17.16b, v21.16b
  400ad8:       4eb41e92        mov     v18.16b, v20.16b
  400adc:       4ea31c73        mov     v19.16b, v3.16b
  400ae0:       4c002810        st1     {v16.4s-v19.4s}, [x0]
  400ae4:       d65f03c0        ret

0000000000400ae8 <_Z1dPf>:
  400ae8:       fc1c0fee        str     d14, [sp, #-64]!
  400aec:       6d0133ed        stp     d13, d12, [sp, #16]
  400af0:       6d022beb        stp     d11, d10, [sp, #32]
  400af4:       6d0323e9        stp     d9, d8, [sp, #48]
  400af8:       aa0003e8        mov     x8, x0
  400afc:       0cdf2900        ld1     {v0.2s-v3.2s}, [x8], #32
  400b00:       91010009        add     x9, x0, #0x40
  400b04:       0c402930        ld1     {v16.2s-v19.2s}, [x9]
  400b08:       91018009        add     x9, x0, #0x60
  400b0c:       0c402904        ld1     {v4.2s-v7.2s}, [x8]
  400b10:       0c402934        ld1     {v20.2s-v23.2s}, [x9]
  400b14:       52840009        mov     w9, #0x2000                     // #8192
  400b18:       4ea11c29        mov     v9.16b, v1.16b
  400b1c:       4ea01c18        mov     v24.16b, v0.16b
  400b20:       4ea51ca8        mov     v8.16b, v5.16b
  400b24:       4ea41c9c        mov     v28.16b, v4.16b
  400b28:       4eb21e4b        mov     v11.16b, v18.16b
  400b2c:       4eb11e2a        mov     v10.16b, v17.16b
  400b30:       4eb01e0e        mov     v14.16b, v16.16b
  400b34:       4eb61ecd        mov     v13.16b, v22.16b
  400b38:       4eb71eec        mov     v12.16b, v23.16b
  400b3c:       2e23d442        faddp   v2.2s, v2.2s, v3.2s
  400b40:       2e27d4c6        faddp   v6.2s, v6.2s, v7.2s
  400b44:       2e29d718        faddp   v24.2s, v24.2s, v9.2s
  400b48:       2e2ad5c9        faddp   v9.2s, v14.2s, v10.2s
  400b4c:       2e28d79c        faddp   v28.2s, v28.2s, v8.2s
  400b50:       2e35d688        faddp   v8.2s, v20.2s, v21.2s
  400b54:       2e33d563        faddp   v3.2s, v11.2s, v19.2s
  400b58:       2e2cd5a7        faddp   v7.2s, v13.2s, v12.2s
  400b5c:       2e29d70e        faddp   v14.2s, v24.2s, v9.2s
  400b60:       2e28d794        faddp   v20.2s, v28.2s, v8.2s
  400b64:       2e23d44b        faddp   v11.2s, v2.2s, v3.2s
  400b68:       2e27d4cd        faddp   v13.2s, v6.2s, v7.2s
  400b6c:       71000529        subs    w9, w9, #0x1
  400b70:       2e2ad5ca        faddp   v10.2s, v14.2s, v10.2s
  400b74:       2e35d695        faddp   v21.2s, v20.2s, v21.2s
  400b78:       2e33d573        faddp   v19.2s, v11.2s, v19.2s
  400b7c:       2e2cd5ac        faddp   v12.2s, v13.2s, v12.2s
  400b80:       54fffde1        b.ne    400b3c <_Z1dPf+0x54>  // b.any
  400b84:       4ea91d39        mov     v25.16b, v9.16b
  400b88:       4ea81d1d        mov     v29.16b, v8.16b
  400b8c:       4ea21c5a        mov     v26.16b, v2.16b
  400b90:       4ea61cde        mov     v30.16b, v6.16b
  400b94:       4ea31c7b        mov     v27.16b, v3.16b
  400b98:       4ea71cff        mov     v31.16b, v7.16b
  400b9c:       0c002818        st1     {v24.2s-v27.2s}, [x0]
  400ba0:       0c00291c        st1     {v28.2s-v31.2s}, [x8]
  400ba4:       6d4323e9        ldp     d9, d8, [sp, #48]
  400ba8:       6d422beb        ldp     d11, d10, [sp, #32]
  400bac:       6d4133ed        ldp     d13, d12, [sp, #16]
  400bb0:       fc4407ee        ldr     d14, [sp], #64
  400bb4:       d65f03c0        ret

Those main loops look to me like the compiler didn't find any tricks to avoid the computation; it's just a straight line of 8 Q-form faddp's vs. 16 D-form ones.

And the results are the following when benchmarking with perf:

Clocks per call
==============================
q       d
98631   90285

This doesn't quite hit the gains suggested by the document (q actually comes really close to the theoretical 98304 cycles the document suggests 8192 * 8 faddp's should take; d must be running into a latency issue, which isn't all that surprising since there's a dependency between 0x400b4c and 0x400b60, with just 4 instructions between them). But these are gains nonetheless, which seems to imply that the D-form does have some advantage.

Peter Cordes
  • 328,167
  • 45
  • 605
  • 847
Steve Cox
  • 1,947
  • 13
  • 13
  • Are you sure it is `2/3` and not 2 or 3? – chtz Mar 29 '21 at 14:17
  • @chtz not absolutely positive, but clean fractional throughputs like 2/3 are not unheard of – Steve Cox Mar 29 '21 at 14:22
  • 1
    @chtz: I think it's definitely a fraction, from other entries in the document. When they mean a range they use `-`. – Nate Eldredge Mar 29 '21 at 14:53
  • And for fdiv for examples, ranges and fractions are used in the same entry – Steve Cox Mar 29 '21 at 15:05
  • Yes, looking at the document, I agree that only 2/3 makes sense. Maybe Q-Form just requires more shuffling internally? For the D-Form, 2 input elements need to be repositioned, for the Q-Form 6 input elements (but that is complete guesswork, I have no idea what actually happens internally) – chtz Mar 29 '21 at 15:53
  • @chtz I agree Q is more work, but what's stopping the backend just implementing the Q form by dispatching 2 D-form uops if it would be faster anyway? – Steve Cox Mar 29 '21 at 15:57
  • It does that for normal vertical `fadd`, but apparently not for horizontal pairs `faddp`. Does `faddp` have any interaction between low and high half, across the 64-bit boundary? – Peter Cordes Mar 29 '21 at 16:22
  • @PeterCordes yeah its a horizontal add, so lanes 2-3 need to be added together and placed over in lane 1 – Steve Cox Mar 29 '21 at 16:27
  • Yeah, just found the same thing myself: (https://developer.arm.com/documentation/dui0801/h/A64-SIMD-Vector-Instructions/FADDP--vector-?lang=en), so it's just like x86 `haddps` which is also slow on x86. (always 2 shuffles feeding a vertical add). – Peter Cordes Mar 29 '21 at 16:28
  • yeah i get why cross lane stuff is extra hard. but arm64 lets me directly index into the architectural d registers if I want so it seems like I could just write the two separate d-form instructions. There must be some gotcha that keeps me from doing that, otherwise they would have just implemented it that way. Perhaps there's extra bypass delay when switching over from d-domain to q-domain, but I can't find it in the document – Steve Cox Mar 29 '21 at 16:31
  • Oh right, the same output could be obtained with two D-sized `faddp` operations, each one one reading the low and high halves of one Q input. But they'd have to write to two halves of the same Q register at once. And only q0..15 have D registers that alias their halves, but the ARM64 instruction has to work on all of q0..31. IDK if either of those are significant for A72, though; I assume it does register renaming so all the physical registers need to be orthogonal. And for non-pair `fadd` to work the way it does, the only difference is in reading 2 halves of one reg. – Peter Cordes Mar 29 '21 at 16:37
  • 2
    I think you might be confusing A32 and A64 with the description. In A64 all 32 S/D/Q registers are full width, S0 is the low 32-bits of D0 is the low 64-bits of Q0, S1 is the low 32-bits of D1, is the low 64-bits of Q1, etc. – James Greenhalgh Mar 31 '21 at 09:22
  • @JamesGreenhalgh That seems like it has the makings of an answer, as it would entirely explain the gotcha I was missing – Steve Cox Mar 31 '21 at 10:39
  • So the point is that to replicate one Q-form `faddp`, you'd need two D-form `faddp` plus an additional uop to merge the results into a single register, making it less surprising that the Q-form is less than half the speed of D-form? – Nate Eldredge Mar 31 '21 at 18:39
  • 2
    @NateEldredge Yeah I think you would actually need to use EXT, to extract the upper d, and INS to put it back. Totally makes sense that all this extra perm causes another 3 cycles of latency (EXT is exactly 3 cycles of latency) and similarly limits the throughput. I just mistakenly thought the aarch32 register model carried over to aarch64. If no one takes credit for this answer soon, I'll just write it up and answer my own question so that the answer exists in a more permanent format than the comments. – Steve Cox Mar 31 '21 at 18:59

0 Answers0