I have 3 versions of gcc installed on my 64-bit Linux machine:
- gcc 4.9.2
- gcc 5.3.0
- gcc 6 [ a build from an SVN snapshot ]
All 3 compilers give me the same error when I try to explicitly reserve the xmm registers with
-ffixed-xmm0 -ffixed-xmm1 -ffixed-xmm2 -ffixed-xmm3 -ffixed-xmm4 -ffixed-xmm5 -ffixed-xmm6 -ffixed-xmm7 -ffixed-xmm8 -ffixed-xmm9 -ffixed-xmm10 -ffixed-xmm11 -ffixed-xmm12 -ffixed-xmm13 -ffixed-xmm14 -ffixed-xmm15
and the error is an internal compiler error:
internal compiler error: in copy_to_mode_reg, at explow.c:595
return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Please submit a full bug report,
with preprocessed source if appropriate.
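For completeness, the invocation is roughly the following (the file name and -O2 are just placeholders; the -ffixed-xmm* flags are exactly the ones listed above):

gcc -O2 -c foo.c \
    -ffixed-xmm0 -ffixed-xmm1 -ffixed-xmm2 -ffixed-xmm3 \
    -ffixed-xmm4 -ffixed-xmm5 -ffixed-xmm6 -ffixed-xmm7 \
    -ffixed-xmm8 -ffixed-xmm9 -ffixed-xmm10 -ffixed-xmm11 \
    -ffixed-xmm12 -ffixed-xmm13 -ffixed-xmm14 -ffixed-xmm15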
Should I file a bug? I have noticed that clang doesn't support a similar flag to control code generation, so maybe gcc introduced this flag a long time ago and it's simply not worth maintaining anymore?
When I look at the assembly generated from my C function by clang, there are no spills and it looks like all the xmm registers are being used as instructed; gcc, on the other hand, doesn't generate such clean assembly, and I would still like to impose this behaviour.
Is there another way to force a given usage of the SSE and AVX registers? Is it possible to get a warning when a register is misused?
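For example, I wondered whether GCC's local register variable extension could be used for this instead; the sketch below is only an assumption on my part (the function name, the choice of xmm12 and the empty extended asm are purely illustrative), and as far as I understand the binding is only guaranteed to be honoured when the variable is used as an asm operand:

#include <stdint.h>
#include <emmintrin.h>

void bar( int32_t * ptr ) {
    /* GCC extension: ask for this variable to live in xmm12 */
    register __m128i acc __asm__("xmm12") = _mm_load_si128( (__m128i *) ptr );
    acc = _mm_adds_epi16( acc, _mm_load_si128( (__m128i *) ( ptr + 4 ) ) );
    /* empty extended asm, so the register binding is actually honoured */
    __asm__ __volatile__( "" : "+x" (acc) );
    _mm_store_si128( (__m128i *) ptr, acc );
}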
Thanks.
A dummy function for testing purposes:
#include <stdio.h>
#include <stdint.h>
#include <malloc.h>
#include <emmintrin.h>
typedef int32_t T;
void foo( T * ptr ) {
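/* load 16 consecutive 128-bit vectors, one per xmm register; ptr is assumed to be 16-byte aligned */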
__m128i v0 = _mm_load_si128( (__m128i *) ( &ptr[0] ) );
__m128i v1 = _mm_load_si128( (__m128i *) ( &ptr[4] ) );
__m128i v2 = _mm_load_si128( (__m128i *) ( &ptr[8] ) );
__m128i v3 = _mm_load_si128( (__m128i *) ( &ptr[12] ) );
__m128i v4 = _mm_load_si128( (__m128i *) ( &ptr[16] ) );
__m128i v5 = _mm_load_si128( (__m128i *) ( &ptr[20] ) );
__m128i v6 = _mm_load_si128( (__m128i *) ( &ptr[24] ) );
__m128i v7 = _mm_load_si128( (__m128i *) ( &ptr[28] ) );
__m128i v8 = _mm_load_si128( (__m128i *) ( &ptr[32] ) );
__m128i v9 = _mm_load_si128( (__m128i *) ( &ptr[36] ) );
__m128i v10 = _mm_load_si128( (__m128i *) ( &ptr[40] ) );
__m128i v11 = _mm_load_si128( (__m128i *) ( &ptr[44] ) );
__m128i v12 = _mm_load_si128( (__m128i *) ( &ptr[48] ) );
__m128i v13 = _mm_load_si128( (__m128i *) ( &ptr[52] ) );
__m128i v14 = _mm_load_si128( (__m128i *) ( &ptr[56] ) );
__m128i v15 = _mm_load_si128( (__m128i *) ( &ptr[60] ) );
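/* fold everything into v0 with saturating 16-bit adds */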
v0 = _mm_adds_epi16( v0, v1 );
v0 = _mm_adds_epi16( v0, v2 );
v0 = _mm_adds_epi16( v0, v3 );
v0 = _mm_adds_epi16( v0, v4 );
v0 = _mm_adds_epi16( v0, v5 );
v0 = _mm_adds_epi16( v0, v6 );
v0 = _mm_adds_epi16( v0, v7 );
v0 = _mm_adds_epi16( v0, v8 );
v0 = _mm_adds_epi16( v0, v9 );
v0 = _mm_adds_epi16( v0, v10 );
v0 = _mm_adds_epi16( v0, v11 );
v0 = _mm_adds_epi16( v0, v12 );
v0 = _mm_adds_epi16( v0, v13 );
v0 = _mm_adds_epi16( v0, v14 );
v0 = _mm_adds_epi16( v0, v15 );
_mm_store_si128( (__m128i *) ptr, v0 );
}