0

I'm a beginner in Arm Neon, and I'm trying to vectorise this loop

float ans=0.0;
for (i=0; i<numdims; i++)
ans += (pt1[i]-pt2[i]) * (pt1[i]-pt2[i]);

I'm trying to convert this loop to NEON, using prefetch instructions and loop unrolling:

int iter= numdims/4*4;
   float result[3];
   float ans=0.0;

    asm volatile(
            "mov x1, #0\n\t"
            "mov x2, %[pt1]\n\t"
            "mov x3, %[pt2]\n\t"
            "movi v3.4s, #0\n\t"

            ".loop_neon%=:\n\t"
            "prfm PLDL1STRM, [x2, #64]\n\t"
            "prfm PLDL1STRM, [x3, #64]\n\t"
            "ldr q1, [x2, #16]\n\t"
            "ldr q2, [x3, #16]\n\t"

            "fsub v4.4s, v1.4s, v2.4s\n\t"
            "fmla v3.4s, v4.4s, v4.4s\n\t"

            "add x1,x1, #16\n\t"
            "cmp x1, %[iter]\n\t"
            "b.lt .loop_neon%=\n\t"
            "str q3, [%[result]]\n\t"
            :
            : [iter] "r" (iter),[pt1] "r" (pt1),[pt2] "r" (pt2), [result] "r" (result)
            : "x1","x2","x3","memory","v0","v1","v2","v3","v4"
      );

ans = result[0] + result[1] + result[2] + result[3];

      //final iterations of the loop
      for (int i=iter; i<numdims; i++)
          ans += (pt1[i]-pt2[i]) * (pt1[i]-pt2[i]);

This code compiles and runs to completion, but the output is not correct.

Snake91
  • 21
  • 1
  • 4
  • Re “This code works but the output is not correct”: That is not how we define “works.” If the output is not correct, the program does not work. Do you mean the program executes and completes with a success status, rather than aborting with some exception? Show a [mre]. – Eric Postpischil Apr 18 '20 at 00:30
  • why do you add 16 if you process 4 per iteration? – Jake 'Alquimista' LEE Apr 18 '20 at 05:05

1 Answer

1

Short answer: add x1, x1, #4

Your code is far from optimal:

  • there are lots of pipeline hazards; unroll the loop deeper
  • you should always count down the loop counter
  • you should avoid unnecessary memory access (result)
  • you should avoid unnecessary mov operations

Provided iter is a multiple of 16, the code below is suggested:

//-----------------------------------------------------------------------
// extern float sumDiffSquare(float *pA, float *pB, uint32_t length);
//
// Returns the sum over i in [0, length) of (pB[i] - pA[i])^2.
//
// Target: AArch64, GNU as syntax, AAPCS64 calling convention
// In:     x0 = pA, x1 = pB, w2 = length (number of floats)
// Out:    s0 = sum of squared differences
// Clobb:  x0, x1, w2, v0, v16-v23, condition flags
//
// Preconditions:
//   (length & 15) == 0   -- 16 floats are consumed per iteration
//   length == 0 is accepted and returns 0.0; any other length must be >= 16
//-----------------------------------------------------------------------
        .text
        .global sumDiffSquare
        .type   sumDiffSquare, %function

pA      .req    x0
pB      .req    x1
length  .req    w2                      // uint32_t: only the low 32 bits of x2 are defined

sumDiffSquare:
        movi    v0.16b, #0              // v0 = 4-lane accumulator, also the 0.0 result
        cbz     length, 2f              // nothing to sum: return 0.0 without touching memory

        .balign 64
1:
        // Load 16 floats from each input; post-index advances both pointers.
        ldp     q16, q17, [pA], #32
        ldp     q20, q21, [pB], #32
        ldp     q18, q19, [pA], #32
        ldp     q22, q23, [pB], #32

        // Flags set here are consumed by b.hi below; the SIMD instructions
        // in between do not modify them.
        subs    length, length, #16

        fsub    v16.4s, v20.4s, v16.4s  // diff = pB - pA (sign is irrelevant once squared)
        fsub    v17.4s, v21.4s, v17.4s
        fsub    v18.4s, v22.4s, v18.4s
        fsub    v19.4s, v23.4s, v19.4s

        fmla    v0.4s, v16.4s, v16.4s   // v0 += diff * diff; each fmla reads the previous v0
        fmla    v0.4s, v17.4s, v17.4s
        fmla    v0.4s, v18.4s, v18.4s
        fmla    v0.4s, v19.4s, v19.4s

        // Unsigned "higher": continue only if length was > 16 before the
        // subtraction, i.e. elements remain. Unlike signed b.gt this is
        // correct for the whole uint32_t range.
        b.hi    1b

        // Horizontal reduction of the four lanes into s0.
        faddp   v0.4s, v0.4s, v0.4s     // lanes 0,1 = (l0 + l1), (l2 + l3)
        faddp   s0, v0.2s               // s0 = l0 + l1 + l2 + l3
2:
        ret

        .size   sumDiffSquare, . - sumDiffSquare

        .unreq  pA
        .unreq  pB
        .unreq  length
Jake 'Alquimista' LEE
  • 6,197
  • 2
  • 17
  • 25