LU factorization receives different results between LAPACK and cuBLAS/cuSOLVER

Question

I am testing out some scenarios where the function dgetrf is returned differently when used with cuBLAS/cuSOLVER compared to writing for LAPACK. For example, I am looking at LU factorization of the following matrix:

2.0 4.0 1.0 -3.0 0.0

-1.0 -2.0 2.0 4.0 0.0

4.0 2.0 -3.0 5.0 0.0

5.0 -4.0 -3.0 1.0 0.0

0.0 0.0 0.0 0.0 0.0

I first try to call dgetrf from cuBLAS/cuSOLVER as followed (warning, ugly testing code ahead!)

    #include <cblas.h>
    #include <time.h>
    #include <stdio.h>
    #include <string.h>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>
    #include <cusolverDn.h>

    int main(int argc, char** argv){

        const int matrixSize = 5;

        int i, j;
        double arrA[matrixSize][matrixSize] = {
            {2.0, 4.0, 1.0, -3.0, 0.0},
            {-1.0, -2.0, 2.0, 4.0, 0.0},
            {4.0, 2.0, -3.0, 5.0, 0.0},
            {5.0, -4.0, -3.0, 1.0, 0.0},
            {0.0, 0.0, 0.0, 0.0, 0.0}
        };

        double *arrADev, *workArray;
        double **matrixArray;
        int *pivotArray;
        int *infoArray;
        double flat[matrixSize*matrixSize] = {0};
        cublasHandle_t cublasHandle;
        cublasStatus_t cublasStatus;
        cudaError_t error;

        cudaError cudaStatus;
        cusolverStatus_t cusolverStatus;
        cusolverDnHandle_t cusolverHandle;

        double *matrices[2];


        error = cudaMalloc(&arrADev,  sizeof(double) * matrixSize*matrixSize);
        if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

        error = cudaMalloc(&matrixArray,  sizeof(double*) * 2);
        if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

        error = cudaMalloc(&pivotArray,  sizeof(int) * matrixSize*matrixSize);
        if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

        error = cudaMalloc(&infoArray,  sizeof(int) * matrixSize*matrixSize);
        if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

        cublasStatus = cublasCreate(&cublasHandle);
        if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);

        //maps matrix to flat vector
        for(i=0; i<matrixSize; i++){
            for(j=0; j<matrixSize; j++){
                flat[i+j*matrixSize] = arrA[i][j];
            }
        }

        //copy matrix A to device
        cublasStatus = cublasSetMatrix(matrixSize, matrixSize, sizeof(double), flat, matrixSize, arrADev, matrixSize);
        if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);

        //save matrix address
        matrices[0] = arrADev;

        //copy matrices references to device
        error = cudaMemcpy(matrixArray, matrices, sizeof(double*)*1, cudaMemcpyHostToDevice);
        if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

        int Lwork;
        // calculate buffer size for cuSOLVER LU factorization
        cusolverStatus = cusolverDnDgetrf_bufferSize(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, &Lwork);
        cudaStatus = cudaMalloc((void**)&workArray, Lwork*sizeof(double));

        // cuBLAS LU factorization
        cublasStatus = cublasDgetrfBatched(cublasHandle, matrixSize, matrixArray, matrixSize, pivotArray, infoArray, 1);
        if (cublasStatus == CUBLAS_STATUS_SUCCESS)
            printf("cuBLAS DGETRF SUCCESSFUL! \n");
        else
            printf("cuBLAS DGETRF UNSUCCESSFUL! \n");

        // cuSOLVER LU factorization
        cusolverStatus = cusolverDnCreate(&cusolverHandle);
        cusolverStatus = cusolverDnDgetrf(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, workArray, pivotArray, infoArray);
        if (cusolverStatus == CUSOLVER_STATUS_SUCCESS)
            printf("cuSOLVER DGETRF SUCCESSFUL! \n");
        else
            printf("cuSOLVER DGETRF UNSUCCESSFUL! \n");

        return 0;
    }

The output from the code above is

    cuBLAS DGETRF SUCCESSFUL!
    cuSOLVER DGETRF SUCCESSFUL!

When I try to do the same with LAPACK (warning: more ugly code!):

    #include <iostream>
    #include <vector>

    using namespace std;

    extern "C" void dgetrf_(int* dim1, int* dim2, double* a, int* lda, int* ipiv, int* info);
    extern "C" void dgetrs_(char *TRANS, int *N, int *NRHS, double *A, int *LDA, int *IPIV, double *B, int *LDB, int *INFO );

    int main()
    {
       char trans = 'N';
       int dim = 5;
       int LDA = dim;
       int info;

       vector<double> a,b;

       a.push_back(2.0); a.push_back(4.0); a.push_back(1.0); a.push_back(-3.0); a.push_back(0.0);
       a.push_back(-1.0); a.push_back(-2.0); a.push_back(2.0); a.push_back(4.0); a.push_back(0.0);
       a.push_back(4.0); a.push_back(2.0); a.push_back(-3.0); a.push_back(5.0); a.push_back(0.0);
       a.push_back(5.0); a.push_back(-4.0); a.push_back(-3.0); a.push_back(1.0); a.push_back(0.0);
       a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0);

       int ipiv[5];
       dgetrf_(&dim, &dim, &*a.begin(), &LDA, ipiv, &info);
       if (info == 0)
           printf("dgetrf successful\n");
       else
           printf("dgetrf unsuccessful\n");

       return 0;
    }

The output I get is

    dgetrf unsuccessful

I understand that they are different libraries, but is this behaviour expected?

@talonmies how is that the case? did I accidentally transpose it? — Quang Thinh Ha, Nov 11 '19 at 13:41
I presume you accidentally did not transpose it in the lapack case. In the CUDA code, you write the matrix into a linear array in column major order, which is correct for blas/lapack. But in the CPU code, you push the matrix into the vector in row major order. So the CPU code is using the transpose of the GPU code, I think — talonmies, Nov 11 '19 at 13:47
Thanks for the suggestion. Even after trying to transpose it I still get the same unsuccessful error. `info` return error code 5. — Quang Thinh Ha, Nov 11 '19 at 13:51
Well you have a zero diagonal element which will probably blow up things — talonmies, Nov 11 '19 at 15:37
Which should be the case for cuBLAS/cuSOLVER as well... or so I expected... — Quang Thinh Ha, Nov 11 '19 at 15:38

score 2 · Accepted Answer · answered Nov 11 '19 at 15:48

When I compile your CUDA code, I get a warning that the cusolver handle is being used before its value is set. You shouldn't ignore such warnings, because your usage in the sizing function is not correct. However that is not the problem here.

I don't think there's any difference between your two test cases. You seem to be interpreting the results incorrectly.

Looking at the netlib documentation, we see that an info value of 5 mean U(5,5) is zero, which would be problematic for future use. That doesn't mean the dgetrf factorization was successful or unsuccessful as you are printing out, but instead it means something about your input data. In fact the factorization was completed, as clearly indicated in the docs.

Likewise, we get no information about that condition simply by looking at the function return value for the cusolver function. In order to discover information similar to what is being reported by lapack, its necessary to look at the infoArray values.

With those changes, your codes are reporting the same thing (info value of 5):

$ cat t1556.cu
    #include <time.h>
    #include <stdio.h>
    #include <string.h>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>
    #include <cusolverDn.h>

    int main(int argc, char** argv){

        const int matrixSize = 5;

        int i, j;
        double arrA[matrixSize][matrixSize] = {
            {2.0, 4.0, 1.0, -3.0, 0.0},
            {-1.0, -2.0, 2.0, 4.0, 0.0},
            {4.0, 2.0, -3.0, 5.0, 0.0},
            {5.0, -4.0, -3.0, 1.0, 0.0},
            {0.0, 0.0, 0.0, 0.0, 0.0}
        };

        double *arrADev, *workArray;
        double **matrixArray;
        int *pivotArray;
        int *infoArray;
        double flat[matrixSize*matrixSize] = {0};
        cublasHandle_t cublasHandle;
        cublasStatus_t cublasStatus;
        cudaError_t error;

        cudaError cudaStatus;
        cusolverStatus_t cusolverStatus;
        cusolverDnHandle_t cusolverHandle;

        double *matrices[2];


        error = cudaMalloc(&arrADev,  sizeof(double) * matrixSize*matrixSize);
        if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

        error = cudaMalloc(&matrixArray,  sizeof(double*) * 2);
        if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

        error = cudaMalloc(&pivotArray,  sizeof(int) * matrixSize*matrixSize);
        if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

        error = cudaMalloc(&infoArray,  sizeof(int) * matrixSize*matrixSize);
        if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

        cublasStatus = cublasCreate(&cublasHandle);
        if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);

        //maps matrix to flat vector
        for(i=0; i<matrixSize; i++){
            for(j=0; j<matrixSize; j++){
                flat[i+j*matrixSize] = arrA[i][j];
            }
        }

        //copy matrix A to device
        cublasStatus = cublasSetMatrix(matrixSize, matrixSize, sizeof(double), flat, matrixSize, arrADev, matrixSize);
        if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);

        //save matrix address
        matrices[0] = arrADev;

        //copy matrices references to device
        error = cudaMemcpy(matrixArray, matrices, sizeof(double*)*1, cudaMemcpyHostToDevice);
        if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));

        int Lwork;
        // calculate buffer size for cuSOLVER LU factorization
        cusolverStatus = cusolverDnCreate(&cusolverHandle);
        cusolverStatus = cusolverDnDgetrf_bufferSize(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, &Lwork);
        cudaStatus = cudaMalloc((void**)&workArray, Lwork*sizeof(double));

        // cuBLAS LU factorization
        cublasStatus = cublasDgetrfBatched(cublasHandle, matrixSize, matrixArray, matrixSize, pivotArray, infoArray, 1);
        if (cublasStatus == CUBLAS_STATUS_SUCCESS)
            printf("cuBLAS DGETRF SUCCESSFUL! \n");
        else
            printf("cuBLAS DGETRF UNSUCCESSFUL! \n");

        // cuSOLVER LU factorization
        cusolverStatus = cusolverDnDgetrf(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, workArray, pivotArray, infoArray);
        if (cusolverStatus == CUSOLVER_STATUS_SUCCESS)
            printf("cuSOLVER DGETRF SUCCESSFUL! \n");
        else
            printf("cuSOLVER DGETRF UNSUCCESSFUL! \n");
        int *hinfoArray = (int *)malloc(matrixSize*matrixSize*sizeof(int));
        cudaMemcpy(hinfoArray, infoArray, matrixSize*matrixSize*sizeof(int), cudaMemcpyDeviceToHost);
        for (int i = 0; i < matrixSize*matrixSize; i++) printf("%d,", hinfoArray[i]);
        printf("\n");
        return 0;
    }
$ nvcc -o t1556 t1556.cu -lcublas -lcusolver
t1556.cu(30): warning: variable "cudaStatus" was set but never used

$ ./t1556
cuBLAS DGETRF SUCCESSFUL!
cuSOLVER DGETRF SUCCESSFUL!
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
$ cat t1557.cpp
    #include <iostream>
    #include <vector>
    #include <lapacke/lapacke.h>
    using namespace std;

//    extern "C" void dgetrf_(int* dim1, int* dim2, double* a, int* lda, int* ipiv, int* info);
//    extern "C" void dgetrs_(char *TRANS, int *N, int *NRHS, double *A, int *LDA, int *IPIV, double *B, int *LDB, int *INFO );

    int main()
    {
       char trans = 'N';
       int dim = 5;
       int LDA = dim;
       int info;

       vector<double> a,b;

       a.push_back(2.0); a.push_back(4.0); a.push_back(1.0); a.push_back(-3.0); a.push_back(0.0);
       a.push_back(-1.0); a.push_back(-2.0); a.push_back(2.0); a.push_back(4.0); a.push_back(0.0);
       a.push_back(4.0); a.push_back(2.0); a.push_back(-3.0); a.push_back(5.0); a.push_back(0.0);
       a.push_back(5.0); a.push_back(-4.0); a.push_back(-3.0); a.push_back(1.0); a.push_back(0.0);
       a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0); a.push_back(0.0);

       int ipiv[5];
       LAPACK_dgetrf(&dim, &dim, &*a.begin(), &LDA, ipiv, &info);
       printf("info = %d\n", info);
       if (info == 0)
           printf("dgetrf successful\n");
       else
           printf("dgetrf unsuccessful\n");

       return 0;
    }
$ g++ t1557.cpp -o t1557 -llapack
$ ./t1557
info = 5
dgetrf unsuccessful
$

I'm using the lapack installed by centOS.

centOS 7, CUDA 10.1.243, Tesla V100.

LU factorization receives different results between LAPACK and cuBLAS/cuSOLVER

1 Answers1