I am learning to program with AVX, so I wrote a simple program that multiplies two 4x4 matrices. With no compiler optimizations, the AVX version is slightly faster than the non-AVX version, but with -O3 optimization the non-AVX version becomes almost twice as fast as the AVX version. Any tips on how I can improve the performance of the AVX version? The full code follows.
#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>
#define MAT_SIZE 4
#define USE_AVX
/* Operands and result of the matrix product C = A * B, stored row-major. */
double A[MAT_SIZE][MAT_SIZE];
double B[MAT_SIZE][MAT_SIZE];
/*
 * The result matrix is explicitly aligned to 32 bytes so that each row
 * (4 doubles = 32 bytes) may be read or written as one whole __m256d vector
 * with aligned accesses.  A plain double array carries no such guarantee.
 */
double C[MAT_SIZE][MAT_SIZE] __attribute__((aligned(32)));

/*
 * Second copy of B whose rows can be addressed either as scalars (m) or as
 * 256-bit vectors of four doubles (row).
 */
union {
    double m[4][4];
    __m256d row[4];
} matB;
/*
 * Fill the input matrices with a fixed pattern:
 *     A[r][c] = r + c        B[r][c] = r + c + 1
 * matB.m receives the same values as B.
 */
void init_matrices()
{
    int r = 0;
    while (r < MAT_SIZE) {
        int c = 0;
        while (c < MAT_SIZE) {
            const int base = r + c;

            /* The values are converted through float before being stored as double. */
            A[r][c] = (float)base;
            B[r][c] = (float)(base + 1);
            matB.m[r][c] = B[r][c];
            ++c;
        }
        ++r;
    }
}
/*
 * Print the result matrix C to stdout: one line per row, each element
 * formatted with one decimal place and followed by a tab.
 */
void print_result()
{
    int r = 0;
    while (r < MAT_SIZE) {
        const double *cell = C[r];
        const double *const row_end = cell + MAT_SIZE;

        for (; cell != row_end; ++cell) {
            printf("%.1f\t", *cell);
        }
        printf("\n");
        ++r;
    }
}
/*
 * Scalar reference implementation of C = A * B using the textbook triple
 * loop.  Reads the globals A and B and overwrites every element of C.
 */
void withoutAVX()
{
    for (int row = 0; row < MAT_SIZE; row++) {
        for (int col = 0; col < MAT_SIZE; col++) {
            /*
             * Accumulate in double: the operands and the destination are
             * double, so a float accumulator would round every partial sum
             * to single precision.
             */
            double sum = 0.0;
            for (int e = 0; e < MAT_SIZE; e++) {
                sum += A[row][e] * B[e][col];
            }
            C[row][col] = sum;
        }
    }
}
/*
 * AVX implementation of C = A * matB for 4x4 matrices of double.
 *
 * Each result row is built as a linear combination of the rows of matB:
 *     C[row] = A[row][0] * matB.row[0] + ... + A[row][3] * matB.row[3]
 * where each scalar A[row][i] is broadcast to all four lanes of a 256-bit
 * vector before the multiply.
 */
void withAVX()
{
    for (int row = 0; row < 4; row++) {
        const double *rowA = A[row];

        /* Keep the running sum in a local vector and write it out once. */
        __m256d acc = _mm256_mul_pd(_mm256_broadcast_sd(&rowA[0]), matB.row[0]);
        for (int i = 1; i < 4; i++) {
            acc = _mm256_add_pd(acc,
                                _mm256_mul_pd(_mm256_broadcast_sd(&rowA[i]),
                                              matB.row[i]));
        }

        /* The unaligned store makes no assumption about the alignment of C's rows. */
        _mm256_storeu_pd(C[row], acc);
    }
}
/*
 * Execute the RDTSC instruction and return the two 32-bit halves it produces
 * combined into a single 64-bit value.
 */
static __inline__ unsigned long long rdtsc(void)
{
    unsigned low_half, high_half;

    /* "=a" binds the EAX output to low_half, "=d" binds EDX to high_half. */
    __asm__ __volatile__ ("rdtsc" : "=a"(low_half), "=d"(high_half));

    unsigned long long ticks = high_half;
    ticks <<= 32;
    ticks |= low_half;
    return ticks;
}
/*
 * Initialise the matrices, time one multiplication with rdtsc(), then print
 * the elapsed count and the result matrix.  USE_AVX selects which
 * implementation is timed.
 *
 * NOTE(review): only a single call is timed, so the figure includes any
 * one-off costs of the first call; timing many repetitions would likely
 * give a steadier measurement.
 */
int main()
{
    init_matrices();

    // start timer
    unsigned long long cycles = rdtsc();
#ifdef USE_AVX
    withAVX();
#else
    withoutAVX();
#endif
    // stop timer
    cycles = rdtsc() - cycles;

    /* cycles is unsigned long long, so it must be printed with %llu. */
    printf("\nTotal time elapsed : %llu\n\n", cycles);
    print_result();
    return 0;
}