I am translating code from legacy CBLAS/LAPACKE to cuBLAS/cuSOLVER and having some issues. I made a test program to get to the bottom of this. Attached is the code that I am using:
#include <stdio.h>
#include <assert.h>
#include <string.h>
#ifdef __CUDA
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cusolverDn.h"
#else
#include <complex.h>
#include "cblas.h"
#include "lapacke.h"
#endif /* __CUDA */
int main()
{
    /* Diagonalize the 3x3 Hermitian test matrix
     *
     *   0    1-i  0
     *   1+i  0    1-i
     *   0    1+i  0
     *
     * once with cuSOLVER (__CUDA defined, output: with_cuda.txt) and once
     * with LAPACKE (output: without_cuda.txt).
     *
     * NOTE(review) on comparing the two dumps:
     *  - cusolverDnZheevd works in COLUMN-major order: on exit, eigenvector j
     *    occupies column j, i.e. the contiguous slice [j*row .. j*row+row-1]
     *    of the linear buffer. LAPACKE_zheev with LAPACK_ROW_MAJOR also puts
     *    eigenvector j in column j, but of a ROW-major matrix, so its linear
     *    dump is the transpose of the cuSOLVER dump.
     *  - Eigenvectors are only unique up to a complex phase factor of
     *    modulus 1, so matching components may still differ by such a factor.
     */
    int row = 3;
    int col = 3;
#ifdef __CUDA
    cuDoubleComplex input[9];
    input[0] = make_cuDoubleComplex(0.0, 0.0);
    input[1] = make_cuDoubleComplex(1.0, -1.0);
    input[2] = make_cuDoubleComplex(0.0, 0.0);
    input[3] = make_cuDoubleComplex(1.0, 1.0);
    input[4] = make_cuDoubleComplex(0.0, 0.0);
    input[5] = make_cuDoubleComplex(1.0, -1.0);
    input[6] = make_cuDoubleComplex(0.0, 0.0);
    input[7] = make_cuDoubleComplex(1.0, 1.0);
    input[8] = make_cuDoubleComplex(0.0, 0.0);

    /* Library handles -- check creation status on both. */
    cublasHandle_t blasHandle;
    cublasStatus_t blasStatus = cublasCreate(&blasHandle);
    assert(CUBLAS_STATUS_SUCCESS == blasStatus);
    cusolverDnHandle_t solverHandle;
    cusolverStatus_t statusSolver = cusolverDnCreate(&solverHandle);
    assert(CUSOLVER_STATUS_SUCCESS == statusSolver);

    const cuDoubleComplex cmplx_1 = make_cuDoubleComplex(1.0, 0.0);
    const cuDoubleComplex cmplx_0 = make_cuDoubleComplex(0.0, 0.0);

    /* Managed buffer that will hold the column-major input (and, on exit,
     * the eigenvectors). */
    cuDoubleComplex *inputMatrix;
    cudaError_t cudaStat = cudaMallocManaged((void**)&inputMatrix, sizeof(cuDoubleComplex)*row*col, cudaMemAttachGlobal);
    assert(cudaSuccess == cudaStat);

    /* Row-major -> column-major conversion:
     * 'input' is row-major; read column-major it is the transpose, so
     * C = 1 * A^T + 0 * B with cublasZgeam recovers the intended matrix. */
    cuDoubleComplex *inputMatrixClone;
    cudaStat = cudaMallocManaged((void**)&inputMatrixClone, sizeof(cuDoubleComplex)*row*col, cudaMemAttachGlobal);
    assert(cudaSuccess == cudaStat);  // was unchecked in the original
    memcpy(inputMatrixClone, input, sizeof(cuDoubleComplex)*row*col);
    cublasStatus_t status = cublasZgeam(blasHandle, CUBLAS_OP_T, CUBLAS_OP_N, row, col, &cmplx_1, inputMatrixClone, row, &cmplx_0, inputMatrixClone, row, inputMatrix, row);
    assert(CUBLAS_STATUS_SUCCESS == status);
    /* cublasZgeam is asynchronous; make sure it has finished reading the
     * clone before the buffer is released (cudaFree would also implicitly
     * synchronize, but be explicit about the dependency). */
    cudaStat = cudaDeviceSynchronize();
    assert(cudaSuccess == cudaStat);
    cudaFree(inputMatrixClone);

    /* Eigen-decomposition workspace and outputs. */
    int lwork = 0;
    int *devInfo = NULL;
    double *eig;
    cuDoubleComplex *d_work = NULL;
    cudaStat = cudaMallocManaged((void**)&eig, sizeof(double)*row, cudaMemAttachGlobal);
    assert(cudaSuccess == cudaStat);
    cudaStat = cudaMallocManaged((void**)&devInfo, sizeof(int), cudaMemAttachGlobal);
    assert(cudaSuccess == cudaStat);
    /* Leading dimension is the number of ROWS of the column-major matrix;
     * the original passed 'col', which only worked because the matrix is
     * square. */
    cusolverStatus_t cusolver_status = cusolverDnZheevd_bufferSize(solverHandle, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, row, inputMatrix, row, eig, &lwork);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
    cudaStat = cudaMallocManaged((void**)&d_work, sizeof(cuDoubleComplex)*lwork, cudaMemAttachGlobal);
    assert(cudaSuccess == cudaStat);

    cusolver_status = cusolverDnZheevd(solverHandle, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, row, inputMatrix, row, eig, d_work, lwork, devInfo);
    cudaStat = cudaDeviceSynchronize();
    assert(cudaSuccess == cudaStat);
    assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
    /* devInfo: 0 = success, <0 = bad argument, >0 = did not converge.
     * The original allocated it but never looked at it. */
    assert(0 == *devInfo);

    FILE *fp = fopen("with_cuda.txt", "w+");
    assert(NULL != fp);
    /* Eigenvalues (ascending order). */
    for (int i = 0; i < row; i++)
        fprintf(fp, "eig[%d] = %0.16f\n", i, eig[i]);
    /* Eigenvectors: column-major, eigenvector j is the slice [j*row, (j+1)*row). */
    for (int i = 0; i < row*col; i++)
        fprintf(fp, "eiv[%d] = %0.16f, %0.16f\n", i, cuCreal(inputMatrix[i]), cuCimag(inputMatrix[i]));
    fclose(fp);

    cudaFree(inputMatrix);
    cudaFree(eig);
    cudaFree(devInfo);
    cudaFree(d_work);
    cusolverDnDestroy(solverHandle);
    cublasDestroy(blasHandle);
    cudaDeviceReset();
#else
    double complex input[9];
    input[0] = 0;
    input[1] = 1 - _Complex_I;
    input[2] = 0;
    input[3] = 1 + _Complex_I;
    input[4] = 0;
    input[5] = 1 - _Complex_I;
    input[6] = 0;
    input[7] = 1 + _Complex_I;
    input[8] = 0;
    double eig[3];
    /* 'V': compute eigenvalues and eigenvectors; 'U': upper triangle holds
     * the input. Return code was ignored in the original: 0 = success,
     * <0 = bad argument, >0 = failed to converge. */
    int info = LAPACKE_zheev(LAPACK_ROW_MAJOR, 'V', 'U', row, input, col, eig);
    assert(0 == info);

    FILE *fp = fopen("without_cuda.txt", "w+");
    assert(NULL != fp);
    /* Eigenvalues (ascending order). */
    for (int i = 0; i < row; i++)
        fprintf(fp, "eig[%d] = %0.16f\n", i, eig[i]);
    /* Eigenvectors: row-major, eigenvector j is column j -> its components
     * appear strided through the linear dump. */
    for (int i = 0; i < row*col; i++)
        fprintf(fp, "eiv[%d] = %0.16f, %0.16f\n", i, creal(input[i]), cimag(input[i]));
    fclose(fp);
#endif
    return 0;
}
The program can be compiled with `#define __CUDA` to get a CUDA build, or without it to get a non-CUDA build. The non-CUDA build gives me the following output:
eig[0] = -2.0000000000000000
eig[1] = -0.0000000000000002
eig[2] = 2.0000000000000000
eiv[0] = 0.0000000000000001, 0.5000000000000000
eiv[1] = 0.0000000000000002, 0.7071067811865476
eiv[2] = -0.0000000000000001, -0.5000000000000000
eiv[3] = 0.4999999999999998, -0.4999999999999999
eiv[4] = 0.0000000000000000, -0.0000000000000001
eiv[5] = 0.4999999999999998, -0.4999999999999999
eiv[6] = -0.4999999999999999, 0.0000000000000000
eiv[7] = 0.7071067811865475, 0.0000000000000000
eiv[8] = 0.4999999999999999, 0.0000000000000000
The CUDA build gives me the following output:
eig[0] = -2.0000000000000000
eig[1] = 0.0000000000000002
eig[2] = 2.0000000000000000
eiv[0] = 0.0000000000000001, 0.5000000000000000
eiv[1] = 0.4999999999999997, -0.4999999999999998
eiv[2] = -0.4999999999999999, 0.0000000000000000
eiv[3] = 0.0000000000000001, 0.7071067811865476
eiv[4] = -0.0000000000000000, 0.0000000000000000
eiv[5] = 0.7071067811865475, 0.0000000000000000
eiv[6] = 0.0000000000000001, 0.5000000000000000
eiv[7] = -0.4999999999999998, 0.4999999999999999
eiv[8] = -0.4999999999999999, 0.0000000000000000
Can anybody shed some light on why I am getting different results, mainly for the eigenvectors? The ordering of the eigenvector components also seems reversed. Why is that?