I'm a beginner with Arm NEON, and I'm trying to vectorise this loop:
float ans=0.0;
for (i=0; i<numdims; i++)
ans += (pt1[i]-pt2[i]) * (pt1[i]-pt2[i]);
I'm trying to convert this loop to NEON inline assembly, using prefetch instructions and loop unrolling:
int iter= numdims/4*4;
float result[3];
float ans=0.0;
asm volatile(
"mov x1, #0\n\t"
"mov x2, %[pt1]\n\t"
"mov x3, %[pt2]\n\t"
"movi v3.4s, #0\n\t"
".loop_neon%=:\n\t"
"prfm PLDL1STRM, [x2, #64]\n\t"
"prfm PLDL1STRM, [x3, #64]\n\t"
"ldr q1, [x2, #16]\n\t"
"ldr q2, [x3, #16]\n\t"
"fsub v4.4s, v1.4s, v2.4s\n\t"
"fmla v3.4s, v4.4s, v4.4s\n\t"
"add x1,x1, #16\n\t"
"cmp x1, %[iter]\n\t"
"b.lt .loop_neon%=\n\t"
"str q3, [%[result]]\n\t"
:
: [iter] "r" (iter),[pt1] "r" (pt1),[pt2] "r" (pt2), [result] "r" (result)
: "x1","x2","x3","memory","v0","v1","v2","v3","v4"
);
ans = result[0] + result[1] + result[2] + result[3];
//final iterations of the loop
for (int i=iter; i<numdims; i++)
ans += (pt1[i]-pt2[i]) * (pt1[i]-pt2[i]);
This code compiles and runs, but the output is not correct.