The -ffast-math
option in gcc allows the compiler to reorder floating-point operations for faster execution.
Because floating-point addition is not associative, this reordering may lead to slight differences in the results depending on the alignment of the pointers involved. For instance, on x86-64, some SIMD instructions (SSE/AVX) are faster on data that is 16-byte (128-bit) aligned, so it makes sense that the compiler is allowed to pick a summation order based on that alignment.
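To illustrate why the reordering matters, here is a tiny standalone example (mine, unrelated to the program below; compile it without fast-math so the two groupings are evaluated as written):
#include <stdio.h>

int main(void) {
    double a = 1e16, b = -1e16, c = 1.0;
    // Same three values, two groupings: with IEEE doubles the results differ.
    printf("(a+b)+c = %g\n", (a + b) + c);   // prints 1
    printf("a+(b+c) = %g\n", a + (b + c));   // prints 0 (b+c rounds back to -1e16)
    return 0;
}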
Here is an example of a simple program that shows such a behavior on a modern CPU:
#include <stdio.h>
#include <stdint.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
double sum(int size, double *a) {
    double r = 0.0;
    for (int k = 0; k < size; k++) {
        r += a[k];
    }
    return r;
}

void init(int size, double *a) {
    srand(0);
    for (int k = 0; k < size; k++) {
        a[k] = ((double) rand()) / RAND_MAX;
    }
}

void test(int size, double *ref, double *arr) {
    init(size, ref);
    memcpy(arr, ref, sizeof *arr * size);
    printf("Alignment: ref:%d arr:%d diff:%g\n",
           (int)((uintptr_t)ref % 32),
           (int)((uintptr_t)arr % 32),
           fabs(sum(size, arr) - sum(size, ref)));
}

int main(int argc, char **argv) {
    int size = argc <= 1 ? 100 : atoi(argv[1]); // (don't do that at home)
    double *array1 = malloc(size * sizeof *array1);
    double *array2 = malloc((size + 4) * sizeof *array2);
    printf("size = %d\n", size);
    if (array1 && array2) {
        for (int k = 0; k < 4; k++) {
            test(size, array1, array2 + k);
        }
    }
}
When compiled with -Ofast, this may output:
$ ./test 100000
size = 100000
Alignment: ref:16 arr:16 diff:0
Alignment: ref:16 arr:24 diff:7.27596e-12
Alignment: ref:16 arr:0 diff:0
Alignment: ref:16 arr:8 diff:7.27596e-12
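The pattern suggests the difference appears exactly when ref and arr end up with different alignments modulo 16. One way to double-check that (a sketch only, using C11 aligned_alloc; I have not wired it into the program above, and alloc_doubles_32 is just a name I made up) would be to force both arrays onto the same 32-byte boundary:
#include <stdlib.h>

// Sketch: allocate n doubles on a 32-byte boundary.
// C11 aligned_alloc wants the size to be a multiple of the alignment.
static double *alloc_doubles_32(size_t n) {
    size_t bytes = n * sizeof(double);
    bytes = (bytes + 31) & ~(size_t)31;   // round up to a multiple of 32
    return aligned_alloc(32, bytes);
}
If both pointers are equally aligned, the peeling decision in sum should be the same for the two calls, so the additions would happen in the same order and the diff should stay at 0. That is a workaround, not what I am asking for.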
Question
Is there a magic flag that politely asks the compiler not to generate code whose results are sensitive to the alignment of pointers, without completely forbidding "fast-math" optimizations?
More Details
Here is the configuration of my gcc:
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/7/lto-wrapper
OFFLOAD_TARGET_NAMES=nvptx-none
OFFLOAD_TARGET_DEFAULT=1
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu 7.4.0-1ubuntu1~18.04.1' --with-bugurl=file:///usr/share/doc/gcc-7/README.Bugs --enable-languages=c,ada,c++,go,brig,d,fortran,objc,obj-c++ --prefix=/usr --with-gcc-major-version-only --program-suffix=-7 --program-prefix=x86_64-linux-gnu- --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-libmpx --enable-plugin --enable-default-pie --with-system-zlib --with-target-system-zlib --enable-objc-gc=auto --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-offload-targets=nvptx-none --without-cuda-driver --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
Thread model: posix
gcc version 7.4.0 (Ubuntu 7.4.0-1ubuntu1~18.04.1)
I simply compiled the above code with nothing more than "gcc -Ofast".
By looking at the generated assembly code, it seems that the compiler is implementing the following pseudo-code:
typedef struct {
    double lower;
    double upper;
} packed_double128;

double sum6(int size, double *a) { /* ... sums an array of at most 6 (and at least 1) elements in an "unrolled" fashion ... */ }

double sum(int size, double *a) {
    if (size <= 0) {
        return 0.0;
    } else if (size - 1 > 5) { // large enough to vectorize
        int delta;
        double tmp;
        if (((uintptr_t) a) % 16) {
            // peel off the first element so that p below is 16-byte aligned
            delta = 1;
            tmp = a[0];
        } else {
            delta = 0;
            tmp = 0.0;
        }
        packed_double128 res = {0., 0.};
        double *p = a + delta;
        assert(((uintptr_t) p) % 16 == 0); // p is aligned !
        for (int k = 0; k < (size - delta) / 2; k++) {
            res.lower += *p++;
            res.upper += *p++;
        }
        double total = res.lower + res.upper + tmp;
        int remain = size - delta - 2 * ((size - delta) / 2);
        if (remain)
            return total + sum6(remain, p);
        else
            return total;
    } else {
        return sum6(size, a);
    }
}
For instance, if a is the array {0, 1, 2, ..., 10}, sum
computes:
((0+2+4+6+8)+(1+3+5+7+9))+10 if a is "aligned"
((1+3+5+7+9)+(2+4+6+8+10))+0 if a is not "aligned" (the peeled-off a[0] is added last)
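With the small integer values above the two groupings happen to give exactly the same total, so here is a standalone snippet (mine, with made-up values, to be compiled without fast-math) that mimics the two groupings on less convenient doubles and usually shows a non-zero difference:
#include <stdio.h>

int main(void) {
    double a[11];
    for (int k = 0; k < 11; k++)
        a[k] = 1.0 / (k + 1);            // arbitrary values with inexact sums

    // "aligned" grouping: pairs start at a[0], a[10] is the leftover element
    double s1 = ((a[0] + a[2] + a[4] + a[6] + a[8])
               + (a[1] + a[3] + a[5] + a[7] + a[9])) + a[10];

    // "unaligned" grouping: a[0] is peeled off, pairs start at a[1]
    double s2 = ((a[1] + a[3] + a[5] + a[7] + a[9])
               + (a[2] + a[4] + a[6] + a[8] + a[10])) + a[0];

    printf("diff = %g\n", s1 - s2);      // typically a few ULPs, not exactly 0
    return 0;
}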
Here is the assembly code (the comments are mine and may be wrong):
sum: // double sum (int size, double *a)
.LFB61:
.cfi_startproc
testl %edi, %edi // size <= 0 ?
jle .L8 // if so, return 0.0
movq %rsi, %rax // rax = a
leal -1(%rdi), %edx // edx = size - 1
shrq $3, %rax // rax = rax >> 3
andl $1, %eax // eax = rax % 2, i.e. ((uintptr_t)a >> 3) & 1
// now eax contains either 1 or 0 depending on the alignment; let's call this value "delta"
cmpl $5, %edx // size - 1 <= 5 ?
jbe .L9
testl %eax, %eax // a % 16 = 0 ? if so, skip the next two mov (and initialize xmm2 to 0)
je .L10
movsd (%rsi), %xmm2 // xmm2 = a[0] (we put aside the first element)
movl $1, %ecx // ecx = 1
.L4:
movl %edi, %r9d // r9d = size
pxor %xmm1, %xmm1 // xmmm1 = 0
subl %eax, %r9d // r9d = size - delta
leaq (%rsi,%rax,8), %rdx // rdx = a + eax * 8 (which is "&a[delta]")
xorl %eax, %eax // eax = 0
movl %r9d, %r8d
shrl %r8d // r8d = (size - delta) / 2
.p2align 4,,10
.p2align 3
.L5: // HERE IS THE "real loop"
addl $1, %eax // eax ++
addpd (%rdx), %xmm1 // xmm1 += *rdx (that's adding 2 doubles into xmm1)
addq $16, %rdx // rdx += 2 * sizeof(double)
cmpl %r8d, %eax // eax < (size - delta) / 2 ?
jb .L5
// Here xmm1 contains two halves of our sum:
// one double is the sum of the odd-indexed elements and the other the sum of the even-indexed ones
movdqa %xmm1, %xmm0 // xmm0 = xmm1
movl %r9d, %edx // edx = size - delta
andl $-2, %edx // edx = r9d - (r9d % 2)
psrldq $8, %xmm0 // xmm0[0] = xmm0[1]; xmm0[1] = 0.0;
addpd %xmm0, %xmm1 // xmm1 += xmm0 (which now means that xmm1[0] contains the final result \o/ )
cmpl %edx, %r9d // r9d == r9d - (r9d % 2) ? i.e. (size - delta) % 2 == 0 ?
leal (%rdx,%rcx), %eax // eax = edx + ecx (= edx + delta, index of the first element not yet summed)
movapd %xmm1, %xmm0
addsd %xmm2, %xmm0 // xmm0[0] = xmm1[0] + xmm2[0]: add back the element skipped in the misaligned case
je .L13
.L3: // BORING UNROLLED LOOP FOR SMALL ARRAYS (size <= 6)
movslq %eax, %rdx
addsd (%rsi,%rdx,8), %xmm0
leal 1(%rax), %edx
cmpl %edx, %edi
jle .L1
movslq %edx, %rdx
addsd (%rsi,%rdx,8), %xmm0
leal 2(%rax), %edx
cmpl %edx, %edi
jle .L1
movslq %edx, %rdx
addsd (%rsi,%rdx,8), %xmm0
leal 3(%rax), %edx
cmpl %edx, %edi
jle .L1
movslq %edx, %rdx
addsd (%rsi,%rdx,8), %xmm0
leal 4(%rax), %edx
cmpl %edx, %edi
jle .L1
addl $5, %eax
movslq %edx, %rdx
cmpl %eax, %edi
addsd (%rsi,%rdx,8), %xmm0
jle .L1
cltq
addsd (%rsi,%rax,8), %xmm0
ret
.p2align 4,,10
.p2align 3
.L8:
pxor %xmm0, %xmm0
.L1:
rep ret
.p2align 4,,10
.p2align 3
.L10:
xorl %ecx, %ecx
pxor %xmm2, %xmm2
jmp .L4
.p2align 4,,10
.p2align 3
.L13:
rep ret
.p2align 4,,10
.p2align 3
.L9:
xorl %eax, %eax
pxor %xmm0, %xmm0
jmp .L3
.cfi_endproc