Here is my openMP code to solve a tridiagonal system by applying parallel cyclic reduction. I have taken 3 systems of size 256000, 512000 and 1024000, and solve them using the number of threads 1, 2, 4, 8, 16 and 24, respectively, on a Intel Xeon E5-2670 v3 processor.
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<omp.h>
int num_sys = 1;
void data_construct(double *h_b, double *h_c, double *h_d, double *h_y, int size)
{
int i, j;
for(j = 0; j<num_sys; j++)
for(i = 0; i<size; i++)
{
int id = j*size + i;
h_b[id] = 2.0; h_c[id] = 8.0; h_d[id] = 2.0; h_y[id] = 8.0;
if(i == 0)
h_b[id] = 0.0;
if(i == size-1)
h_d[id] = 0.0;
}
}
void PCR_Tri_openMP(double *b, double *c, double *d, double *y, double *x, int steps, int n)
{
double *bb, *cc, *dd, *yy;
bb = (double*)malloc(n*num_sys*sizeof(double)); cc = (double*)malloc(n*num_sys*sizeof(double));
dd = (double*)malloc(n*num_sys*sizeof(double)); yy = (double*)malloc(n*num_sys*sizeof(double));
int i, k, iLeft, iRight;
double alfa, beta; int delta = 1;
for(k = 0; k<steps; k++) // reduction loop
{
if(k != 0) //from lavel 1 updated data is copying back to original memory location before starting the reduction
{
#pragma omp parallel for
for(i = 0; i<n*num_sys; i++) // each thread copying
{
b[i] = bb[i]; c[i] = cc[i]; d[i] = dd[i]; y[i] = yy[i];
}
}
#pragma omp parallel for private(iLeft, iRight, alfa, beta)
for(i = 0; i<n*num_sys; i++) // each thread is performing k-th level reduction
{
iLeft = i - delta; iRight = i + delta;
if(i%n - delta < 0) iLeft = (i/n)*(n);
if(i%n + delta >= n) iRight = (i/n + 1)*n - 1;
alfa = b[i]/c[iLeft]; beta = d[i]/c[iRight];
cc[i] = c[i] - d[iLeft]*alfa - b[iRight]*beta;
yy[i] = y[i] - y[iLeft]*alfa - y[iRight]*beta;
bb[i] = -b[iLeft]*alfa;
dd[i] = -d[iRight]*beta;
}
delta *= 2;
}
#pragma omp parallel for
for(i = 0; i<n*num_sys; i++)
x[i] = yy[i]/cc[i];
}
int main()
{
int syssize[3] = {256000, 512000, 1024000};
int NT[6] = {1, 2, 4, 8, 16, 24};
int i, j, k, size; double t1, t2;
for(k = 0; k<3; k++)
{
size = syssize[k];
double *h_b, *h_c, *h_d, *h_x, *h_y;
h_b = (double*)malloc(num_sys*size*sizeof(double));
h_c = (double*)malloc(num_sys*size*sizeof(double));
h_d = (double*)malloc(num_sys*size*sizeof(double));
h_x = (double*)malloc(num_sys*size*sizeof(double));
h_y = (double*)malloc(num_sys*size*sizeof(double));
int step = ceil(log(size)/log(2.0));
for(i = 0; i<6; i++)
{
double sum = 0.0;
omp_set_num_threads(NT[i]);
#pragma omp parallel
{
#pragma omp single nowait
printf("number of threads = %d\n", omp_get_num_threads());
}
for(j = 0; j<10; j++)
{
data_construct(h_b, h_c, h_d, h_y, size);
t1 = omp_get_wtime();
PCR_Tri_openMP(h_b, h_c, h_d, h_y, h_x, step, size);
t2 = omp_get_wtime();
sum += t2-t1;
}
printf("%d\t %d\t %lf\n", size, NT[i], sum/10.0);
}
free(h_b); free(h_c); free(h_d); free(h_y); free(h_x);
}
return 0;
}
Here is the performance result
I do not why I lost performance for more threads, any help is really appreciated.