I am trying to speed up a simple nested loop:
for (int k = 0; k < n; k++)
for (int i = 0; i < n - k; ++i)
c[k] += a[i + k] * b[i];
First I tried OpenMP. Since the loop is not well balanced (row k has n - k terms), I modified it slightly so that each outer iteration handles row k together with row n - 1 - k:
// Load-balanced variant: outer iteration k computes row k (n - k terms) and
// its mirror row n - 1 - k (k + 1 terms), i.e. n + 1 multiply-adds for every k.
// The mirror row runs in its own inner loop instead of behind an `if` inside
// the long loop, so no comparison is evaluated per iteration of the hot loop.
// Each element of c still receives its products in ascending i order.
// Iteration k writes only c[k] and c[n - 1 - k], so different iterations
// update different elements of c.
// The pragma only takes effect when OpenMP is enabled at compile time
// (GCC: -fopenmp).
#pragma omp parallel for
for (int k = 0; k < n / 2; k++) {
    for (int i = 0; i < n - k; ++i) {
        c[k] += a[i + k] * b[i];
    }
    for (int i = 0; i < k + 1; ++i) {
        c[n - 1 - k] += a[i + n - 1 - k] * b[i];
    }
}
// For odd n the middle row k == n / 2 has no mirror partner and is handled
// here; for even n this loop runs zero times.
for (int k = n / 2; k < n - n / 2; k++) {
    for (int i = 0; i < n - k; ++i) {
        c[k] += a[i + k] * b[i];
    }
}
But the problem is that this version is slower than simply adding #pragma omp parallel for to the original loop.
I guessed that this was related to cache reuse, so I then tried unrolling the outer loop by 8:
#pragma omp parallel for
for (k = 0; k < n/2-7; k+=8){
for (int i = 0; i < n - k; ++i){
c[k] += a[i+k] * b[i];
if(i < n-k-1) c[k+1] += a[i+k+1] * b[i];
if(i < n-k-2) c[k+2] += a[i+k+2] * b[i];
if(i < n-k-3) c[k+3] += a[i+k+3] * b[i];
if(i < n-k-4) c[k+4] += a[i+k+4] * b[i];
if(i < n-k-5) c[k+5] += a[i+k+5] * b[i];
if(i < n-k-6) c[k+6] += a[i+k+6] * b[i];
if(i < n-k-7) c[k+7] += a[i+k+7] * b[i];
if(i < k+1) c[n-1-k] += a[i+n-1-k] * b[i];
if(i < k+2) c[n-2-k] += a[i+n-2-k] * b[i];
if(i < k+3) c[n-3-k] += a[i+n-3-k] * b[i];
if(i < k+4) c[n-4-k] += a[i+n-4-k] * b[i];
if(i < k+5) c[n-5-k] += a[i+n-5-k] * b[i];
if(i < k+6) c[n-6-k] += a[i+n-6-k] * b[i];
if(i < k+7) c[n-7-k] += a[i+n-7-k] * b[i];
if(i < k+8) c[n-8-k] += a[i+n-8-k] * b[i];
}
}
// this loop must <= 16 and is well balance
#pragma omp parallel for
for(int j = k; j < n-k; j++)
for(int i = 0; i < n - j; ++i){
c[j] += a[i + j] * b[i];
}
But... it got even worse! I just want to know why.
More information: I compiled it with g++-9 test.cpp -openmp -o test