I have this code that was working for years (and is still working when using some random compilers).
What we expect is to have the same result in sequential and in parallel execution.
The symptom is that at each execution, the parallel execution produces another result.
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
int main(int argc, char *argv[])
{
int i, N, j, sum;
int ** A;
sum=0;
N=1000;
A=(int**)malloc(N*sizeof(int*));
for (i=0;i<N;i++) {
A[i] = (int*)malloc(N *sizeof(int));
}
for (i=0; i<N; ++i) {
for (j=0; j<N; ++j) {
A[i][j]=i+j;
sum+=A[i][j];
}
}
printf("Total sum = %d \n",sum);
sum=0;
#pragma omp parallel for reduction(+:sum)
for (i=0; i<N; ++i) {
for (j=0; j<N; ++j) {
sum += A[i][j];
}
}
printf("Total sum = %d \n",sum);
for (i=0;i<N;i++){ free(A[i]);}
free(A);
return 0;
}
We compile it like that:
gcc -fopenmp reduction.c
And run it like that:
./a.out
Total sum = 999000000
Total sum = 822136991
It's working with icc.
Edit: if we use optimization -O3 with Gcc it's working also.