I'm trying to understand the impact of strict aliasing on performance in C99. My goal is to optimize a vector dot product, which takes up a large amount of time in my program (profiled it!). I thought that aliasing could be the problem, but the following code doesn't show any substantial difference between the standard approach and the strict aliasing version, even with vectors of size 100 million. I've also tried to use local variables to avoid aliasing, with similar results.
What's happening?
I'm using gcc-4.7 on OSX 10.7.4. Results are in microseconds.
$ /usr/local/bin/gcc-4.7 -fstrict-aliasing -Wall -std=c99 -O3 -o restrict restrict.c
$ ./restrict
sum: 100000000 69542
sum2: 100000000 70432
sum3: 100000000 70372
sum4: 100000000 69891
$ /usr/local/bin/gcc-4.7 -Wall -std=c99 -O0 -fno-strict-aliasing -o restrict restrict.c
$ ./restrict
sum: 100000000 258487
sum2: 100000000 261349
sum3: 100000000 258829
sum4: 100000000 258129
restrict.c (note this code will need several hundred MB RAM):
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
/* original */
long sum(int *x, int *y, int n)
{
long i, s = 0;
for(i = 0 ; i < n ; i++)
s += x[i] * y[i];
return s;
}
/* restrict */
long sum2(int *restrict x, int *restrict y, int n)
{
long i, s = 0;
for(i = 0 ; i < n ; i++)
s += x[i] * y[i];
return s;
}
/* local restrict */
long sum3(int *x, int *y, int n)
{
int *restrict xr = x;
int *restrict yr = y;
long i, s = 0;
for(i = 0 ; i < n ; i++)
s += xr[i] * yr[i];
return s;
}
/* use local variables */
long sum4(int *x, int *y, int n)
{
int xr, yr;
long i, s = 0;
for(i = 0 ; i < n ; i++)
{
xr = x[i];
yr = y[i];
s += xr * yr;
}
return s;
}
int main(void)
{
struct timeval tp1, tp2;
struct timezone tzp;
long i, n = 1e8L, s;
int *x = malloc(sizeof(int) * n);
int *y = malloc(sizeof(int) * n);
long elapsed1;
for(i = 0 ; i < n ; i++)
x[i] = y[i] = 1;
gettimeofday(&tp1, &tzp);
s = sum(x, y, n);
gettimeofday(&tp2, &tzp);
elapsed1 = (tp2.tv_sec - tp1.tv_sec) * 1e6
+ (tp2.tv_usec - tp1.tv_usec);
printf("sum:\t%ld\t%ld\n", s, elapsed1);
gettimeofday(&tp1, &tzp);
s = sum2(x, y, n);
gettimeofday(&tp2, &tzp);
elapsed1 = (tp2.tv_sec - tp1.tv_sec) * 1e6
+ (tp2.tv_usec - tp1.tv_usec);
printf("sum2:\t%ld\t%ld\n", s, elapsed1);
gettimeofday(&tp1, &tzp);
s = sum3(x, y, n);
gettimeofday(&tp2, &tzp);
elapsed1 = (tp2.tv_sec - tp1.tv_sec) * 1e6
+ (tp2.tv_usec - tp1.tv_usec);
printf("sum3:\t%ld\t%ld\n", s, elapsed1);
gettimeofday(&tp1, &tzp);
s = sum3(x, y, n);
gettimeofday(&tp2, &tzp);
elapsed1 = (tp2.tv_sec - tp1.tv_sec) * 1e6
+ (tp2.tv_usec - tp1.tv_usec);
printf("sum4:\t%ld\t%ld\n", s, elapsed1);
return EXIT_SUCCESS;
}