I'm trying to do multi-threaded programming on the CPU using OpenMP. I have lots of for loops that are good candidates for parallelization. I have attached part of my code here. When I use the first `#pragma omp parallel for reduction`, my code gets faster, but when I use the same directive to parallelize the other loops, it gets slower. Does anyone have any idea why this happens?
.
.
.
// Turn off dynamic adjustment of the team size so the runtime does not pick
// a different number of threads than the one requested on the next line.
omp_set_dynamic(0);
// Request four threads for the parallel regions that follow.
omp_set_num_threads(4);
// Uninitialized work arrays of nvi and npi floats. h2 is written inside the
// loop below; h1 is not touched anywhere in this excerpt.
// NOTE(review): no matching delete[] is visible here — confirm both arrays
// are released later, or consider std::vector<float>.
float *h1=new float[nvi];
float *h2=new float[npi];
while(tol>0.001)
{
std::fill_n(h2, npi, 0);
int k,i;
float h222=0;
#pragma omp parallel for private(i,k) reduction (+: h222)
for (i=0;i<npi;++i)
{
int p1=ppi[i];
int m = frombus[p1];
for (k=0;k<N;++k)
{
h222 += v[m-1]*v[k]*(G[m-1][k]*cos(del[m-1]-del[k])
+ B[m-1][k]*sin(del[m-1]-del[k]));
}
h2[i]=h222;
}
//*********** h3*****************
std::fill_n(h3, nqi, 0);
float h333=0;
#pragma omp parallel for private(i,k) reduction (+: h333)
for (int i=0;i<nqi;++i)
{
int q1=qi[i];
int m = frombus[q1];
for (int k=0;k<N;++k)
{
h333 += v[m-1]*v[k]*(G[m-1][k]*sin(del[m-1]-del[k])
- B[m-1][k]*cos(del[m-1]-del[k]));
}
h3[i]=h333;
}
.
.
.
}