There are a bunch of questions that ask "why is my parallel loop slower than the sequential one?" to which the answer is "the work done inside the loop is too small; try to give each iteration more work", like this one.
I have a loop in which each iteration takes about half a minute. I thought that this loop is "heavy" enough that any overhead associated with threading would be insignificant. It turns out that the parallel execution is slower. My code is in C and I am using OpenMP to parallelize. The structure of my code is as follows:
int main() {
malloc and calculate group G1 of arrays;
#pragma omp parallel
{
printf("available threads: %i\n", omp_get_num_threads());
malloc group G2 of arrays;
#pragma omp for
for (int i = 0; i < N; i++) {
calculate G2 arrays' elements through numerical integration;
do some linear algebra with G1 and G2 to assemble the system Ax=b;
solve Ax = b;
}
}
return 0;
}
Some clarifications:
- Group G1 of arrays are not modified inside the loop, they are used as helper variables
- The order in which the iterations are executed does not matter
- Group G1 requires about 0.6 GiB of memory
- Group G2 requires about 0.8 GiB of memory per thread
- All linear algebra is done using Intel MKL (with sequential threading layer, see its link advisor)
For N=6, the serial execution takes 3 minutes, while the parallel (3 threads) takes 4 minutes. For N=30, serial is 15 minutes and parallel (3 threads) 17 minutes.
I am trying to formulate a hypothesis as to why this is happening. Maybe it has something to do with the CPU cache and the size of my arrays?
Here is some info of the computer I am using:
Linux 4.15.0-74-generic #84-Ubuntu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 4
On-line CPU(s) list: 0-3
Thread(s) per core: 1
Core(s) per socket: 4
Socket(s): 1
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 58
Model name: Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz
Stepping: 9
CPU MHz: 3392.363
CPU max MHz: 3400,0000
CPU min MHz: 1600,0000
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 8192K
NUMA node0 CPU(s): 0-3
An actual example
The code below demonstrates what I am doing. I tried to make it as simple and small as possible. It depends on Netlib's LAPACK and OpenBLAS (built with USE_OPENMP=1). My actual program has thousands of lines.
To compile:
gcc -O2 -m64 -fopenmp test.c -lopenblas -llapack -lpthread -lm
To run:
export OMP_NUM_THREADS=3
./a.out 30
// test.c
#include <complex.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <omp.h>
// LAPACK/BLAS routines (Fortran interface: every argument is passed by
// pointer and matrices are expected in column-major order).

// zgesv_: solve the complex linear system A*X = B by LU factorization
// with partial pivoting.  On return A holds its LU factors and B the
// solution; info > 0 reports a zero pivot U(info,info) (singular A).
extern void
zgesv_ (int *n, int *nrhs, _Complex double *a, int *lda, int* ipiv,
_Complex double *b, int *ldb, int* info);

// zgemv_: complex matrix-vector product y := alpha*op(A)*x + beta*y,
// where op is selected by trans ('N' = no transpose in this file).
// Note: BLAS forbids x and y referring to the same storage.
extern void
zgemv_ (char *trans, int *m, int *n, _Complex double *alpha, _Complex double *a,
int *lda, _Complex double *x, int *incx, _Complex double *beta,
_Complex double *y, int *incy);
/*
 * Minimal reproduction of the "parallel slower than serial" issue.
 * Builds a symmetric complex matrix (G1, shared/read-only), then each
 * loop iteration assembles and solves a dense system with per-thread
 * scratch arrays (G2).
 */
int
main (int argc, char **argv)
{
  srand(300);
  if (argc != 2) {
    printf("must have 1 argument: number of iterations!\n");
    exit(1);
  }
  const int nf = atoi(argv[1]);
  printf("loop will have %i iterations\n", nf);

  /* BUG FIX: clock() measures CPU time, which on Linux is summed over
   * *all* threads — a 3-thread run reports roughly 3x the real time,
   * making the parallel version appear slower.  Measure wall-clock
   * time with omp_get_wtime() instead. */
  double t_begin = omp_get_wtime();

  int ne = 2296;
  int ne2 = ne * ne;

  /* G1: shared, read-only inside the parallel loop (~0.08 GiB here). */
  _Complex double* restrict pot = malloc(ne2 * sizeof *pot);
  if (pot == NULL) {
    fprintf(stderr, "allocation of pot failed\n");
    exit(1);
  }
  /* Fill the upper triangle with random complex entries and mirror it,
   * producing a symmetric matrix. */
  for (int i = 0; i < ne; i++) {
    for (int k = i; k < ne; k++) {
      pot[i * ne + k] = (double) rand() / RAND_MAX;
      pot[i * ne + k] *= I;
      pot[i * ne + k] += (double) rand() / RAND_MAX;
      pot[k * ne + i] = pot[i * ne + k];
    }
  }

  char trans = 'N';
  _Complex double one = 1.0;
  _Complex double zero = 0.0;

  #pragma omp parallel
  {
    int n = ne;
    int nrhs = 1;
    int inc = 1;   /* BUG FIX: vector stride for zgemv_; the original
                    * passed &nrhs, which only worked because nrhs==1. */
    int info;
    /* ipiv is ~9 KiB per thread; heap-allocate rather than using a VLA
     * to keep per-thread stack usage bounded. */
    int* ipiv = malloc(n * sizeof *ipiv);

    #pragma omp single
    printf("available threads: %i\n", omp_get_num_threads());

    /* G2: private per-thread work arrays. */
    _Complex double* restrict zl = malloc(ne2 * sizeof *zl);
    _Complex double* restrict ie = malloc(ne2 * sizeof *ie);
    /* BUG FIX: the original passed the same array `ie` as both the
     * input x and the output y of zgemv_; BLAS forbids such aliasing,
     * so the product gets its own buffer. */
    _Complex double* restrict ym = malloc(ne * sizeof *ym);
    if (ipiv == NULL || zl == NULL || ie == NULL || ym == NULL) {
      fprintf(stderr, "per-thread allocation failed\n");
      exit(1);
    }

    #pragma omp for
    for (int i = 0; i < nf; i++) {
      printf("i = %i from thread %d\n", i, omp_get_thread_num());

      /* Assemble the symmetric system matrix ZL from pot.
       * NOTE(review): cexp(k - m) overflows to +inf once k - m ≳ 710;
       * presumably a stand-in for the real integration kernel. */
      for (int m = 0; m < ne; m++) {
        for (int k = m; k < ne; k++) {
          _Complex double gpr = cexp(k - m);
          zl[m * ne + k] = gpr * pot[m * ne + k];
          zl[k * ne + m] = zl[m * ne + k];
        }
      }

      /* Right-hand side b = e_1. */
      ie[0] = 1.0;
      for (int m = 1; m < ne; m++) {
        ie[m] = 0.0;
      }

      /* Solve ZL * x = b; zgesv_ overwrites ie with the solution and
       * zl with the LU factors. */
      zgesv_(&n, &nrhs, zl, &n, ipiv, ie, &n, &info);
      // Check for the exact singularity
      if (info > 0) {
        printf("The diagonal element of the triangular factor of ZL,\n");
        printf("U(%i,%i) is zero, so that ZL is singular;\n", info, info);
        printf("the solution could not be computed.\n");
      }

      /* ym := zl * ie (zl holds the LU factors at this point, exactly
       * as in the original code). */
      zgemv_(&trans, &ne, &ne, &one, zl, &ne, ie, &inc, &zero, ym, &inc);

      /* Dummy post-processing load; the result is intentionally unused
       * (the compiler may elide it — it only models extra work). */
      for (int p = 0; p < ne2; p++) {
        _Complex double acc = 0.0;
        for (int m = 0; m < ne; m++) {
          acc += ym[m] * cexp(-m * 5.4) / 4.1;
        }
        (void) acc;
      }
    }

    free(ym);
    free(ie);
    free(zl);
    free(ipiv);
  }
  free(pot);

  double elapsed = omp_get_wtime() - t_begin;
  printf("Success. Elapsed time: %.2f minutes\n", elapsed / 60.0);
  return 0;
}