1

I am trying to write some c code to find all the eigenvalues of large matrices using the pzheevd routine from scalapack. I have the following simple example which has hard coded a simple 4x4 matrix. Using a single process, 2 processes or 4 I get the correct eigenvalues (-2.0396,-2, 2, 2.0396). However using an incommensurate numbers like 3 the eigenvalues returned are incorrect, even though it looks like all the matrix elements are assigned correctly.

To build the code use:

mpicc -g test.c -llapack -o test -lblacs-openmpi -lblacsCinit-openmpi  -L/usr/local/lib -lscalapack -lgfortran -lm -llapack -lblas

Example that works:

$ mpirun -n 1 ./test
Info: 0
Eigenvalues: -2.039608 -2.000000 2.000000 2.039608

and the one that doesn't:

$ mpirun -n 3 ./test
Info: 0
Eigenvalues: -2.223729 -1.805190 2.003994 2.024926 

And the code:

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include "mpi.h"

typedef struct complex16{double dr,di;} complex16;

extern void Cblacs_get(int context, int what, int *val);
extern void Cblacs_gridinit(int* context, char* order,
                            int nproc_rows, int nproc_cols);
extern void Cblacs_pcoord(int context, int p,
                          int* my_proc_row, int* my_proc_col);
extern void Cblacs_exit(int doneflag);
extern void descinit_(int* descrip, int* m, int* n,
                      int* row_block_size, int* col_block_size,
                      int* first_proc_row, int* first_proc_col,
                      int* blacs_grid, int* leading_dim,
                      int* error_info);
extern int numroc_(int* order, int* block_size, 
                   int* my_process_row_or_col, int* first_process_row_or_col,
                   int* nproc_rows_or_cols);
extern void pzheevd_(char *jobz, char *uplo, int *n, complex16 *a, int *ia, int *ja, int *desca, double *w, complex16 *z, int *iz, int *jz, int *descz, complex16 *work, int *lwork, double *rwork, int *lrwork, int *iwork, int *liwork, int *info);

main(int argc, char** argv) {
  int   my_rank, size, m, n;
  int   row_block_size=1, col_block_size=1;
  int nproc_rows, nproc_cols;
  int my_process_row, my_process_col;
  int   blacs_grid;
  int first_proc_row = 0, first_proc_col = 0;
  int descrip[9], info, nlocal_rows, nlocal_cols;
  int i,j;
  int leading_dim;
  m=4; n=4;

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

  nproc_rows = sqrt(size);
  nproc_cols = size/nproc_rows;

  Cblacs_get(0, 0, &blacs_grid);
  Cblacs_gridinit(&blacs_grid, "R", nproc_rows, nproc_cols);
  Cblacs_pcoord(blacs_grid, my_rank, &my_process_row,&my_process_col);

  nlocal_rows = numroc_(&m, &row_block_size, &my_process_row, &first_proc_row, &nproc_rows);
  nlocal_cols = numroc_(&n, &col_block_size, &my_process_col, &first_proc_col, &nproc_cols);
  leading_dim = numroc_(&m, &col_block_size, &my_process_row, &first_proc_col, &nproc_rows);
  descinit_(descrip, &m, &n, &row_block_size, &col_block_size, &first_proc_row, &first_proc_col, &blacs_grid, &leading_dim, &info);

  complex16 *a, *z;
  double *w;
  a = (complex16*)malloc(nlocal_rows * nlocal_cols * sizeof(complex16));
  z = (complex16*)malloc(nlocal_rows * nlocal_cols * sizeof(complex16));
  w = (double*)malloc(n * sizeof(double));

  double *mat_els;
  mat_els = (double *)malloc(n*m * sizeof(double));
  mat_els[0] = -2.0;mat_els[1]=-0.2; mat_els[2] = -0.2; mat_els[3] = 0.0;
  mat_els[4] = -0.2;mat_els[5]=2.0; mat_els[6] = 0.0; mat_els[7] = -0.2;
  mat_els[8] = -0.2;mat_els[9]=0.0; mat_els[10] = 2.0; mat_els[11] = -0.2;
  mat_els[12] = 0.0;mat_els[13]=-0.2; mat_els[14] = -0.2; mat_els[15] = -2.0;

  int full_row, full_col;
  for(i = 0; i < nlocal_rows; i++)
    {
      for(j = 0; j < nlocal_cols; j++)
        {
          full_row = i * nproc_rows + my_process_row;
          full_col = j * nproc_cols + my_process_col;
          a[(i*nlocal_cols + j)].dr = mat_els[full_row * m + full_col];
          a[(i*nlocal_cols + j)].di = 0.0;
        }
    }
  char jobz = 'V'; // N not implemented yet.
  char uplo = 'U';
  int ai = 1, aj = 1, zi = 1, zj = 1;

  double *rwork;
  complex16 *work;
  int *iwork;
  int lwork, lrwork, liwork;

  rwork = (double*)malloc(2 * sizeof(double));
  work = (complex16*)malloc(2 * sizeof(complex16));
  iwork = (int*)malloc(2 * sizeof(int));

  lwork = -1; lrwork = -1; liwork = -1;
  pzheevd_(&jobz, &uplo, &n, a, &ai, &aj, descrip, w, z, &zi, &zj, descrip, work, &lwork, rwork, &lrwork, iwork, &liwork, &info);
  lwork = work[0].dr; lrwork = rwork[0]; liwork = iwork[0];
  free(work); free(rwork); free(iwork);

  rwork = (double*)malloc(lrwork * sizeof(double));
  work = (complex16*)malloc(lwork * sizeof(complex16));
  iwork = (int*)malloc(liwork * sizeof(int));
  pzheevd_(&jobz, &uplo, &n, a, &ai, &aj, descrip, w, z, &zi, &zj, descrip, work, &lwork, rwork, &lrwork, iwork, &liwork, &info);

  if ( my_rank == 0)
    {
      printf("Info: %d\n", info);
      printf("Eigenvalues: ");
      for(i = 0; i < n;i++)
          {
          printf("%lf ", w[i]);
          }
      printf("\n");
    }
  free(w);free(z);free(a);
  free(work);free(iwork);free(rwork);
  Cblacs_exit(1);
  MPI_Finalize();
}

1 Answers1

2

I found the solution which I should have guessed sooner. The matrix elements should be provided using the Fortran column ordered format rather than the C style row ordered format. Changing the element assignment within the for loops to the following fixes the problem and now the same eigenvalues are found for all numbers of processes (that are can form a valid grid).

a[(j*nlocal_rows + i)].dr = mat_els[full_row * m + full_col];
a[(j*nlocal_rows + i)].di = 0.0;