I wrote a C++ program whose bottleneck is the diagonalization of a possibly large symmetric matrix. The program uses OpenMP together with the CBLAS and LAPACKE C interfaces. However, the call to dsyev runs on a single thread both on my local machine and on an HPC cluster (as seen in htop or equivalent tools). It takes about 800 seconds to diagonalize a 12000x12000 matrix, while NumPy's function eigh takes about 250 seconds. Of course, in both cases the environment variable OMP_NUM_THREADS is set to the number of threads.
Here is an example of a C++ program that calls LAPACK and is essentially what I do in my program (I read the matrix from a binary file):
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <array>
#include <chrono>
#include <omp.h>
#include <cblas.h>
extern "C"
{
#include <lapacke.h>
}

using namespace std;

const char uplo = 'L';
const char jobz = 'V';

int eigen(double* M, double* W, const int& N)
{
    printf("Diagonalizing...\n");
    // Eigenvalues end up in W; the eigenvectors overwrite M.
    int info = LAPACKE_dsyev(LAPACK_COL_MAJOR, jobz, uplo, N, M, N, W);
    if (info < 0)
    {
        printf("FATAL ERROR: Eigensolver (dsyev) failed: illegal argument %d\n", -info);
    }
    else if (info > 0)
    {
        printf("FATAL ERROR: Eigensolver (dsyev) failed: failed to converge %d\n", info);
    }
    return info;
}

int main(int argc, char* argv[])
{
#ifdef _OPENMP
    if (getenv("OMP_NUM_THREADS"))
    {
        omp_set_num_threads(atoi(getenv("OMP_NUM_THREADS")));
        string str = "Running on " + to_string(omp_get_max_threads()) + " threads\n";
        printf("%s", str.c_str());
    }
#endif
    const int N = 12000;
    double* M = new double[N*N]{0.};
    double* W = new double[N]{0.};

    // Read the raw column-major matrix; the file is assumed to hold
    // exactly N*N doubles.
    printf("Reading matrix.dat ...\n");
    ifstream file("matrix.dat", ios::binary);
    file.seekg(0, ios_base::end);
    size_t size = file.tellg();
    file.seekg(0, ios_base::beg);
    file.read((char*) &M[0], size);
    printf("Done.\n");

    double t1 = omp_get_wtime();
    int info = eigen(M, W, N);
    double t2 = omp_get_wtime();
    printf("Exit code: %i\n", info);
    printf("Done in (s): %-10.3f\n", t2 - t1);

    delete [] M;
    delete [] W;
    return 0;
}
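For what it's worth, NumPy's eigh appears to call the divide-and-conquer driver dsyevd rather than dsyev, so part of the performance gap may be algorithmic rather than purely a threading issue. A minimal sketch of the drop-in replacement (LAPACKE_dsyevd takes the same arguments here):

// Variant of eigen() using the divide-and-conquer driver dsyevd.
// Same convention: eigenvalues in W, eigenvectors overwrite M.
int eigen_dc(double* M, double* W, const int& N)
{
    int info = LAPACKE_dsyevd(LAPACK_COL_MAJOR, jobz, uplo, N, M, N, W);
    if (info != 0)
    {
        printf("FATAL ERROR: Eigensolver (dsyevd) failed: info = %d\n", info);
    }
    return info;
}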
Clearly, NumPy seems to be linked against a LAPACK implementation that is multi-threaded, while my program is not. I thought that any LAPACK implementation was multi-threaded in the sense that it calls BLAS, which is (supposedly) always multi-threaded.
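One way to check at runtime what a binary is actually linked against is the following probe (a sketch that assumes OpenBLAS; openblas_get_config and openblas_get_num_threads are OpenBLAS-specific extensions, so this won't link against the reference BLAS):

#include <cstdio>

// OpenBLAS extensions, declared here since a reference cblas.h lacks them
extern "C" char* openblas_get_config(void);
extern "C" int openblas_get_num_threads(void);

int main()
{
    // Reports the build options (e.g. USE_OPENMP, MAX_THREADS)
    // and the thread count OpenBLAS will actually use.
    printf("BLAS config: %s\n", openblas_get_config());
    printf("BLAS threads: %d\n", openblas_get_num_threads());
    return 0;
}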
On my local machine, np.show_config() gives the following:
blas_mkl_info:
    libraries = ['mkl_rt', 'pthread']
    library_dirs = ['/opt/miniconda3/lib']
    define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
    include_dirs = ['/opt/miniconda3/include']
blas_opt_info:
    libraries = ['mkl_rt', 'pthread']
    library_dirs = ['/opt/miniconda3/lib']
    define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
    include_dirs = ['/opt/miniconda3/include']
lapack_mkl_info:
    libraries = ['mkl_rt', 'pthread']
    library_dirs = ['/opt/miniconda3/lib']
    define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
    include_dirs = ['/opt/miniconda3/include']
lapack_opt_info:
    libraries = ['mkl_rt', 'pthread']
    library_dirs = ['/opt/miniconda3/lib']
    define_macros = [('SCIPY_MKL_H', None), ('HAVE_CBLAS', None)]
    include_dirs = ['/opt/miniconda3/include']
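So NumPy is linked against MKL (mkl_rt), which does its threading through MKL's own runtime. If I linked my program against mkl_rt instead, the thread count could be pinned explicitly (a sketch, assuming an MKL installation providing mkl.h, which also supplies the LAPACKE interface):

#include <omp.h>
#include <mkl.h>   // assumed MKL headers; also pull in mkl_lapacke.h

int diag_mkl(double* M, double* W, int N)
{
    // Ask MKL for as many threads as OpenMP reports; MKL otherwise
    // follows MKL_NUM_THREADS / its own defaults.
    mkl_set_num_threads(omp_get_max_threads());
    return LAPACKE_dsyev(LAPACK_COL_MAJOR, 'V', 'L', N, M, N, W);
}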
My C++ program, on the other hand, is linked against the LAPACK and BLAS libraries located in /usr/local/opt/lapack/lib and /usr/local/opt/openblas/lib, etc.
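In case that OpenBLAS build was compiled without OpenMP support (in which case it reads OPENBLAS_NUM_THREADS rather than OMP_NUM_THREADS), the thread count can also be forced from inside the program (a sketch using the OpenBLAS-specific extension openblas_set_num_threads):

#ifdef _OPENMP
#include <omp.h>
#endif

// OpenBLAS extension; not available in the reference BLAS
extern "C" void openblas_set_num_threads(int num_threads);

void set_blas_threads()
{
#ifdef _OPENMP
    // Mirror the OpenMP thread count into OpenBLAS's own thread pool.
    openblas_set_num_threads(omp_get_max_threads());
#endif
}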
How can I fix this problem, i.e., how can I make sure that LAPACK uses all available threads?