I have tried to test OpenMP and MPI parallel implementation for inner products of two vectors (element values are computed on the fly) and find out that OpenMP is slower than MPI. The MPI code I am using is as following,
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <mpi.h>
int main(int argc, char* argv[])
{
double ttime = -omp_get_wtime();
int np, my_rank;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &np);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
int n = 10000;
int repeat = 10000;
int sublength = (int)(ceil((double)(n) / (double)(np)));
int nstart = my_rank * sublength;
int nend = nstart + sublength;
if (nend >n )
{
nend = n;
sublength = nend - nstart;
}
double dot = 0;
double sum = 1;
int j, k;
double time = -omp_get_wtime();
for (j = 0; j < repeat; j++)
{
double loc_dot = 0;
for (k = 0; k < sublength; k++)
{
double temp = sin((sum+ nstart +k +j)/(double)(n));
loc_dot += (temp * temp);
}
MPI_Allreduce(&loc_dot, &dot, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
sum += (dot/(double)(n));
}
time += omp_get_wtime();
if (my_rank == 0)
{
ttime += omp_get_wtime();
printf("np = %d sum = %f, loop time = %f sec, total time = %f \n", np, sum, time, ttime);
}
return 0;
}
I have tried several different implementation with OpenMP. Here is the version which not to complicate and close to best performance I can achieve.
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <omp.h>
int main(int argc, char* argv[])
{
int n = 10000;
int repeat = 10000;
int np = 1;
if (argc > 1)
{
np = atoi(argv[1]);
}
omp_set_num_threads(np);
int nstart =0;
int sublength =n;
double loc_dot = 0;
double sum = 1;
#pragma omp parallel
{
int i, j, k;
double time = -omp_get_wtime();
for (j = 0; j < repeat; j++)
{
#pragma omp for reduction(+: loc_dot)
for (k = 0; k < sublength; k++)
{
double temp = sin((sum+ nstart +k +j)/(double)(n));
loc_dot += (temp * temp);
}
#pragma omp single
{
sum += (loc_dot/(double)(n));
loc_dot =0;
}
}
time += omp_get_wtime();
#pragma omp single nowait
printf("sum = %f, time = %f sec, np = %d\n", sum, time, np);
}
return 0;
}
here is my test results:
OMP
sum = 6992.953984, time = 0.409850 sec, np = 1
sum = 6992.953984, time = 0.270875 sec, np = 2
sum = 6992.953984, time = 0.186024 sec, np = 4
sum = 6992.953984, time = 0.144010 sec, np = 8
sum = 6992.953984, time = 0.115188 sec, np = 16
sum = 6992.953984, time = 0.195485 sec, np = 32
MPI
sum = 6992.953984, time = 0.381701 sec, np = 1
sum = 6992.953984, time = 0.243513 sec, np = 2
sum = 6992.953984, time = 0.158326 sec, np = 4
sum = 6992.953984, time = 0.102489 sec, np = 8
sum = 6992.953984, time = 0.063975 sec, np = 16
sum = 6992.953984, time = 0.044748 sec, np = 32
Can anyone tell me what I am missing? thanks!
update: I have written an acceptable reduce function for OMP. the perfomance is close to MPI reduce function now. the code is as following.
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <omp.h>
double darr[2][64];
int nreduce=0;
#pragma omp threadprivate(nreduce)
double OMP_Allreduce_dsum(double loc_dot,int tid,int np)
{
darr[nreduce][tid]=loc_dot;
#pragma omp barrier
double dsum =0;
int i;
for (i=0; i<np; i++)
{
dsum += darr[nreduce][i];
}
nreduce=1-nreduce;
return dsum;
}
int main(int argc, char* argv[])
{
int np = 1;
if (argc > 1)
{
np = atoi(argv[1]);
}
omp_set_num_threads(np);
double ttime = -omp_get_wtime();
int n = 10000;
int repeat = 10000;
#pragma omp parallel
{
int tid = omp_get_thread_num();
int sublength = (int)(ceil((double)(n) / (double)(np)));
int nstart = tid * sublength;
int nend = nstart + sublength;
if (nend >n )
{
nend = n;
sublength = nend - nstart;
}
double sum = 1;
double time = -omp_get_wtime();
int j, k;
for (j = 0; j < repeat; j++)
{
double loc_dot = 0;
for (k = 0; k < sublength; k++)
{
double temp = sin((sum+ nstart +k +j)/(double)(n));
loc_dot += (temp * temp);
}
double dot =OMP_Allreduce_dsum(loc_dot,tid,np);
sum +=(dot/(double)(n));
}
time += omp_get_wtime();
#pragma omp master
{
ttime += omp_get_wtime();
printf("np = %d sum = %f, loop time = %f sec, total time = %f \n", np, sum, time, ttime);
}
}
return 0;
}