I am translating code from legacy CBLAS/LAPACKE to cuBLAS/cuSOLVER and having some issues. I made a test program to get to the bottom of this. Attached is the code that I am using:
#include <stdio.h>
#include <assert.h>
#include <string.h>
#ifdef __CUDA
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cusolverDn.h"
#else
#include <complex.h>
#include "cblas.h"
#include "lapacke.h"
#endif /* __CUDA */
int main()
{
    /* Diagonalize the 3x3 Hermitian test matrix
     *
     *   0    1-i  0
     *   1+i  0    1-i
     *   0    1+i  0
     *
     * once with cuSOLVER (__CUDA defined, output: with_cuda.txt) and once
     * with LAPACKE (output: without_cuda.txt).
     *
     * NOTE(review) on comparing the two dumps:
     *  - cusolverDnZheevd works in COLUMN-major order: on exit, eigenvector j
     *    occupies column j, i.e. the contiguous slice [j*row .. j*row+row-1]
     *    of the linear buffer. LAPACKE_zheev with LAPACK_ROW_MAJOR also puts
     *    eigenvector j in column j, but of a ROW-major matrix, so its linear
     *    dump is the transpose of the cuSOLVER dump.
     *  - Eigenvectors are only unique up to a complex phase factor of
     *    modulus 1, so matching components may still differ by such a factor.
     */
    int row = 3;
    int col = 3;
#ifdef __CUDA
    cuDoubleComplex input[9];
    input[0] = make_cuDoubleComplex(0.0, 0.0);
    input[1] = make_cuDoubleComplex(1.0, -1.0);
    input[2] = make_cuDoubleComplex(0.0, 0.0);
    input[3] = make_cuDoubleComplex(1.0, 1.0);
    input[4] = make_cuDoubleComplex(0.0, 0.0);
    input[5] = make_cuDoubleComplex(1.0, -1.0);
    input[6] = make_cuDoubleComplex(0.0, 0.0);
    input[7] = make_cuDoubleComplex(1.0, 1.0);
    input[8] = make_cuDoubleComplex(0.0, 0.0);

    /* Library handles -- check creation status on both. */
    cublasHandle_t blasHandle;
    cublasStatus_t blasStatus = cublasCreate(&blasHandle);
    assert(CUBLAS_STATUS_SUCCESS == blasStatus);
    cusolverDnHandle_t solverHandle;
    cusolverStatus_t statusSolver = cusolverDnCreate(&solverHandle);
    assert(CUSOLVER_STATUS_SUCCESS == statusSolver);

    const cuDoubleComplex cmplx_1 = make_cuDoubleComplex(1.0, 0.0);
    const cuDoubleComplex cmplx_0 = make_cuDoubleComplex(0.0, 0.0);

    /* Managed buffer that will hold the column-major input (and, on exit,
     * the eigenvectors). */
    cuDoubleComplex *inputMatrix;
    cudaError_t cudaStat = cudaMallocManaged((void**)&inputMatrix, sizeof(cuDoubleComplex)*row*col, cudaMemAttachGlobal);
    assert(cudaSuccess == cudaStat);

    /* Row-major -> column-major conversion:
     * 'input' is row-major; read column-major it is the transpose, so
     * C = 1 * A^T + 0 * B with cublasZgeam recovers the intended matrix. */
    cuDoubleComplex *inputMatrixClone;
    cudaStat = cudaMallocManaged((void**)&inputMatrixClone, sizeof(cuDoubleComplex)*row*col, cudaMemAttachGlobal);
    assert(cudaSuccess == cudaStat);  // was unchecked in the original
    memcpy(inputMatrixClone, input, sizeof(cuDoubleComplex)*row*col);
    cublasStatus_t status = cublasZgeam(blasHandle, CUBLAS_OP_T, CUBLAS_OP_N, row, col, &cmplx_1, inputMatrixClone, row, &cmplx_0, inputMatrixClone, row, inputMatrix, row);
    assert(CUBLAS_STATUS_SUCCESS == status);
    /* cublasZgeam is asynchronous; make sure it has finished reading the
     * clone before the buffer is released (cudaFree would also implicitly
     * synchronize, but be explicit about the dependency). */
    cudaStat = cudaDeviceSynchronize();
    assert(cudaSuccess == cudaStat);
    cudaFree(inputMatrixClone);

    /* Eigen-decomposition workspace and outputs. */
    int lwork = 0;
    int *devInfo = NULL;
    double *eig;
    cuDoubleComplex *d_work = NULL;
    cudaStat = cudaMallocManaged((void**)&eig, sizeof(double)*row, cudaMemAttachGlobal);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMallocManaged((void**)&devInfo, sizeof(int), cudaMemAttachGlobal);
    assert(cudaSuccess == cudaStat);
    /* Leading dimension is the number of ROWS of the column-major matrix;
     * the original passed 'col', which only worked because the matrix is
     * square. */
    cusolverStatus_t cusolver_status = cusolverDnZheevd_bufferSize(solverHandle, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, row, inputMatrix, row, eig, &lwork);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
    cudaStat = cudaMallocManaged((void**)&d_work, sizeof(cuDoubleComplex)*lwork, cudaMemAttachGlobal);
    assert(cudaSuccess == cudaStat);

    cusolver_status = cusolverDnZheevd(solverHandle, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, row, inputMatrix, row, eig, d_work, lwork, devInfo);
    cudaStat = cudaDeviceSynchronize();
    assert(cudaSuccess == cudaStat);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
    /* devInfo: 0 = success, <0 = bad argument, >0 = did not converge.
     * The original allocated it but never looked at it. */
    assert(0 == *devInfo);

    FILE *fp = fopen("with_cuda.txt", "w+");
    assert(NULL != fp);
    /* Eigenvalues (ascending order). */
    for (int i = 0; i < row; i++)
        fprintf(fp, "eig[%d] = %0.16f\n", i, eig[i]);
    /* Eigenvectors: column-major, eigenvector j is the slice [j*row, (j+1)*row). */
    for (int i = 0; i < row*col; i++)
        fprintf(fp, "eiv[%d] = %0.16f, %0.16f\n", i, cuCreal(inputMatrix[i]), cuCimag(inputMatrix[i]));
    fclose(fp);

    cudaFree(inputMatrix);
    cudaFree(eig);
    cudaFree(devInfo);
    cudaFree(d_work);
    cusolverDnDestroy(solverHandle);
    cublasDestroy(blasHandle);
    cudaDeviceReset();
#else
    double complex input[9];
    input[0] = 0;
    input[1] = 1 - _Complex_I;
    input[2] = 0;
    input[3] = 1 + _Complex_I;
    input[4] = 0;
    input[5] = 1 - _Complex_I;
    input[6] = 0;
    input[7] = 1 + _Complex_I;
    input[8] = 0;
    double eig[3];
    /* 'V': compute eigenvalues and eigenvectors; 'U': upper triangle holds
     * the input. Return code was ignored in the original: 0 = success,
     * <0 = bad argument, >0 = failed to converge. */
    int info = LAPACKE_zheev(LAPACK_ROW_MAJOR, 'V', 'U', row, input, col, eig);
    assert(0 == info);

    FILE *fp = fopen("without_cuda.txt", "w+");
    assert(NULL != fp);
    /* Eigenvalues (ascending order). */
    for (int i = 0; i < row; i++)
        fprintf(fp, "eig[%d] = %0.16f\n", i, eig[i]);
    /* Eigenvectors: row-major, eigenvector j is column j -> its components
     * appear strided through the linear dump. */
    for (int i = 0; i < row*col; i++)
        fprintf(fp, "eiv[%d] = %0.16f, %0.16f\n", i, creal(input[i]), cimag(input[i]));
    fclose(fp);
#endif
    return 0;
}
The program can be compiled with `#define __CUDA` to get a CUDA build, or without it to get a non-CUDA build. The non-CUDA build gives me the following output:
eig[0] = -2.0000000000000000
eig[1] = -0.0000000000000002
eig[2] = 2.0000000000000000
eiv[0] = 0.0000000000000001, 0.5000000000000000
eiv[1] = 0.0000000000000002, 0.7071067811865476
eiv[2] = -0.0000000000000001, -0.5000000000000000
eiv[3] = 0.4999999999999998, -0.4999999999999999
eiv[4] = 0.0000000000000000, -0.0000000000000001
eiv[5] = 0.4999999999999998, -0.4999999999999999
eiv[6] = -0.4999999999999999, 0.0000000000000000
eiv[7] = 0.7071067811865475, 0.0000000000000000
eiv[8] = 0.4999999999999999, 0.0000000000000000
The CUDA build gives me the following output:
eig[0] = -2.0000000000000000
eig[1] = 0.0000000000000002
eig[2] = 2.0000000000000000
eiv[0] = 0.0000000000000001, 0.5000000000000000
eiv[1] = 0.4999999999999997, -0.4999999999999998
eiv[2] = -0.4999999999999999, 0.0000000000000000
eiv[3] = 0.0000000000000001, 0.7071067811865476
eiv[4] = -0.0000000000000000, 0.0000000000000000
eiv[5] = 0.7071067811865475, 0.0000000000000000
eiv[6] = 0.0000000000000001, 0.5000000000000000
eiv[7] = -0.4999999999999998, 0.4999999999999999
eiv[8] = -0.4999999999999999, 0.0000000000000000
Can anybody shed some light on why I am getting different results, mainly for the eigenvectors? The ordering of the eigenvector components also seems reversed. Why is that?