0

I'm currently trying to get cublasSgelsBatched (https://docs.nvidia.com/cuda/cublas/index.html) to work. I started by making a small test case to see exactly which parameters are needed and how they need to be passed. However, after much trial and error I still can't get it to work: I get a status return of 13, which corresponds to CUBLAS_STATUS_EXECUTION_FAILED — a very vague error. Other cuBLAS test cases I tried seem to work fine, and I also tested the input matrix in MATLAB, which confirms it has a least-squares solution.

#include "stdafx.h"
#include "device_launch_parameters.h"

#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"

#include <algorithm>
#include <cmath>
#include <Windows.h>

int main()
{
    // NOTE(review): cudaGetDevice returns a cudaError_t, not the device
    // ordinal -- this overwrites id with the status code, and id is read
    // uninitialized as the argument. Harmless here, but wrong.
    int id = cudaGetDevice(&id);
    cublasHandle_t m_cuBLAS;
    cublasStatus_t stat;

    // create handle (NOTE(review): stat is never checked afterwards)
    stat = cublasCreate(&m_cuBLAS);

    // params: A is a C x M (3 x 2) matrix, column-major; Y has C entries
    const int C = 3; 
    const int M = 2;
    long lda = C;
    long ldb = M;  // NOTE(review): declared but never passed to cuBLAS below

    //init variables
    float *Amat, *Ymat, *Xmat;
    float *gAmat, *gYmat;

    // allocate host buffers
    // NOTE(review): Xmat holds only M = 2 floats, but C = 3 floats are
    // copied into it and Xmat[2] is printed below -- heap buffer overflow.
    Amat = (float*) malloc(M * C * sizeof(float));
    Ymat = (float*) malloc(C *  sizeof(float));
    Xmat = (float*) malloc(M *  sizeof(float));

    srand(100);

    // fill A with random values in [1, 10]
    for (int i = 0; i < C * M; i++) {
        Amat[i] = rand() % 10 + 1;
        Amat[i] = (float)Amat[i];
    }

    // fill Y with random values in [1, 10]
    for (int i = 0; i < C; i++) {
        Ymat[i] =  rand() % 10 + 1;
        Ymat[i] = (float)Ymat[i];
    }

    // allocate device buffers (NOTE(review): return codes unchecked)
    cudaMalloc( &gAmat, M * C * sizeof(float));
    cudaMalloc( &gYmat, C * sizeof(float));

    // copy host data to device (NOTE(review): return codes unchecked)
    cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);

    //init info params
    int info = 0;
    int devInfoArray[1] = { 0 };

    // Synchronize (not necessary here, but harmless for testing)
    cudaDeviceSynchronize();
    

    // run cublas
    // BUG (the cause of CUBLAS_STATUS_EXECUTION_FAILED): the Aarray/Carray
    // parameters must be *device*-resident arrays of pointers to the
    // matrices. &gAmat / &gYmat are host addresses of the pointer variables,
    // so the GPU dereferences host memory. See the answer below for the fix.
    cublasStatus_t status = cublasSgelsBatched(m_cuBLAS,
        CUBLAS_OP_N,
        C,          // m: rows of each A
        M,          // n: cols of each A
        1,          // nrhs
        &gAmat,     // BUG: host pointer; needs a device array of device pointers
        lda, //or 1
        &gYmat,     // BUG: same problem
        lda,        // NOTE(review): ldb was presumably intended here
        &info,
        NULL,       // per-matrix device info not requested
        1);         // batch size
    
    // Output info
    // NOTE(review): devInfoArray is never passed to cuBLAS (NULL above), so
    // this always prints the host initializer 0.
    std::cout << "status = " << status << std::endl;
    std::cout << "info = " << info << std::endl;
    std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;

    // copy solution back (overflows the 2-float Xmat buffer -- see above)
    cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);

    //Output printed
    std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;

    //free memory
    free(Amat);
    free(Ymat);
    free(Xmat);

    cudaFree(gAmat);
    cudaFree(gYmat);

    // destroy handle
    cublasDestroy(m_cuBLAS);

    return 0;
}

I'm on Windows 10, building with Microsoft Visual Studio and CUDA 9.0.

I'd really appreciate some help

Taliebram
  • 91
  • 6
  • 1
    https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-gelsBatched -- "Aarray is an array of pointers to matrices stored in column-major format". I don't see any array of pointers to matrices in your code, do you? – talonmies May 03 '21 at 13:15

1 Answer

1

As pointed out in the comments, you are not creating a proper array of pointers on the device. The batched function works with an array of pointers that lives in device memory, for the data parameters, for example:

Aarray device input/output array of pointers to array, with each array of dim. m x n with lda>=max(1,m). Matrices Aarray[i] should not overlap; otherwise, undefined behavior is expected.

Passing for example &gAmat seems to satisfy the type requirement, but that pointer does not point to device memory.

The following modifications to your code focused on proper handling of gAmat and gYmat seem to run without error for me:

$ cat t130.cu
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#include <algorithm>
#include <cmath>

int main()
{
    // Solves the least-squares problem min ||A*x - y|| for a single C x M
    // system with cublasSgelsBatched (batch size 1). A is column-major.
    //
    // Key point (the original bug): the Aarray/Carray parameters are arrays
    // of pointers that must themselves live in DEVICE memory. Passing the
    // host address of a device pointer (&gAmat) makes the GPU dereference
    // host memory and fail with CUBLAS_STATUS_EXECUTION_FAILED.

    // Report-and-abort helper so CUDA runtime failures are not silent.
    auto cudaCheck = [](cudaError_t e, const char *what) {
        if (e != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(e) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    int id = 0;
    cudaCheck(cudaGetDevice(&id), "cudaGetDevice");
    (void)id;  // only queried to confirm a device is present

    cublasHandle_t m_cuBLAS;
    cublasStatus_t stat = cublasCreate(&m_cuBLAS);
    if (stat != CUBLAS_STATUS_SUCCESS) {
        std::cerr << "cublasCreate failed: " << stat << std::endl;
        return EXIT_FAILURE;
    }

    // Problem dimensions: A is C x M (rows x cols), Y has C entries.
    const int C = 3;   // m: rows
    const int M = 2;   // n: cols
    long lda = C;      // leading dimension of A
    long ldb = C;      // leading dimension of B/Y; must be >= max(M, C)

    // Host buffers. The solve overwrites the RHS in place and we copy the
    // whole C-entry column back, so Xmat must hold C floats (the original
    // allocated only M and overflowed).
    float *Amat = (float*) malloc(M * C * sizeof(float));
    float *Ymat = (float*) malloc(C * sizeof(float));
    float *Xmat = (float*) malloc(C * sizeof(float));
    if (!Amat || !Ymat || !Xmat) {
        std::cerr << "host allocation failed" << std::endl;
        return EXIT_FAILURE;
    }

    srand(100);

    // Random test data in [1, 10].
    for (int i = 0; i < C * M; i++)
        Amat[i] = (float)(rand() % 10 + 1);
    for (int i = 0; i < C; i++)
        Ymat[i] = (float)(rand() % 10 + 1);

    // Device copies of the matrix and right-hand side.
    float *gAmat = NULL, *gYmat = NULL;
    cudaCheck(cudaMalloc(&gAmat, M * C * sizeof(float)), "cudaMalloc gAmat");
    cudaCheck(cudaMalloc(&gYmat, C * sizeof(float)), "cudaMalloc gYmat");
    cudaCheck(cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy gAmat");
    cudaCheck(cudaMemcpy(gYmat, Ymat, C * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy gYmat");

    // Device-resident arrays of pointers (batch size 1) -- this is what the
    // batched API actually dereferences on the GPU.
    float **ggAmat = NULL, **ggYmat = NULL;
    cudaCheck(cudaMalloc(&ggAmat, sizeof(float*)), "cudaMalloc ggAmat");
    cudaCheck(cudaMalloc(&ggYmat, sizeof(float*)), "cudaMalloc ggYmat");
    cudaCheck(cudaMemcpy(ggAmat, &gAmat, sizeof(float*), cudaMemcpyHostToDevice), "cudaMemcpy ggAmat");
    cudaCheck(cudaMemcpy(ggYmat, &gYmat, sizeof(float*), cudaMemcpyHostToDevice), "cudaMemcpy ggYmat");

    int info = 0;                 // host-side parameter-validation status
    int devInfoArray[1] = { 0 };  // not requested from cuBLAS (NULL below)

    cudaCheck(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    cublasStatus_t status = cublasSgelsBatched(m_cuBLAS,
        CUBLAS_OP_N,
        C,          // m: rows of each A
        M,          // n: cols of each A
        1,          // nrhs
        ggAmat,     // device array of device pointers to A
        lda,
        ggYmat,     // device array of device pointers to B/Y
        ldb,
        &info,
        NULL,       // per-matrix device info not requested
        1);         // batch size

    std::cout << "status = " << status << std::endl;
    std::cout << "info = " << info << std::endl;
    std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;

    // The solve overwrote the RHS in place; the first M entries are the
    // least-squares solution.
    cudaCheck(cudaMemcpy(Xmat, gYmat, C * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy Xmat");

    std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;

    free(Amat);
    free(Ymat);
    free(Xmat);

    cudaFree(gAmat);
    cudaFree(gYmat);
    // Free the device pointer arrays too (leaked in the original).
    cudaFree(ggAmat);
    cudaFree(ggYmat);

    cublasDestroy(m_cuBLAS);

    return 0;
}
$ nvcc -o t130 t130.cu -lcublas
t130.cu(15): warning: variable "stat" was set but never used

t130.cu(24): warning: variable "ldb" was declared but never referenced

$ cuda-memcheck ./t130
========= CUDA-MEMCHECK
status = 0
info = 0
devInfoArray = 0
-0.0226168, 0.514827, -4.29722
========= ERROR SUMMARY: 0 errors
$

Your code only shows a single array. If you had a batch of arrays, you would pass an actual array of device-allocated pointers, for each of A and Y.

Based on comments below, here is a version of the code using non-random input:

$ cat t130.cu
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#include <algorithm>
#include <cmath>

int main()
{
    // Same single-batch cublasSgelsBatched solve, but with fixed inputs so
    // the result can be checked against MATLAB:
    //   A (column-major [6 7 6 5 5 5]) = [6 5; 7 5; 6 5],  y = [9; 3; 10]
    //   expected least-squares solution: x = [-6.5, 9.7]

    // Report-and-abort helper so CUDA runtime failures are not silent.
    auto cudaCheck = [](cudaError_t e, const char *what) {
        if (e != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(e) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    int id = 0;
    cudaCheck(cudaGetDevice(&id), "cudaGetDevice");
    (void)id;  // only queried to confirm a device is present

    cublasHandle_t m_cuBLAS;
    cublasStatus_t status = cublasCreate(&m_cuBLAS);
    std::cout << "status = " << status << std::endl;
    if (status != CUBLAS_STATUS_SUCCESS)
        return EXIT_FAILURE;

    // Problem dimensions: A is C x M (rows x cols), Y has C entries.
    const int C = 3;   // m: rows
    const int M = 2;   // n: cols
    long lda = C;      // leading dimension of A (also used for B/Y: >= max(M, C))

    // Host buffers. Xmat receives the full C-entry RHS column after the
    // solve (first M entries are the solution), so it must hold C floats --
    // the original allocated only M and overflowed.
    float *Amat = (float*) malloc(M * C * sizeof(float));
    float *Ymat = (float*) malloc(C * sizeof(float));
    float *Xmat = (float*) malloc(C * sizeof(float));
    if (!Amat || !Ymat || !Xmat) {
        std::cerr << "host allocation failed" << std::endl;
        return EXIT_FAILURE;
    }

    // Fixed, non-random test data (column-major A).
    Amat[0] = 6;
    Amat[1] = 7;
    Amat[2] = 6;
    Amat[3] = 5;
    Amat[4] = 5;
    Amat[5] = 5;
    Ymat[0] = 9;
    Ymat[1] = 3;
    Ymat[2] = 10;

    // Device copies of the matrix and right-hand side.
    float *gAmat = NULL, *gYmat = NULL;
    cudaCheck(cudaMalloc(&gAmat, M * C * sizeof(float)), "cudaMalloc gAmat");
    cudaCheck(cudaMalloc(&gYmat, C * sizeof(float)), "cudaMalloc gYmat");
    cudaCheck(cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy gAmat");
    cudaCheck(cudaMemcpy(gYmat, Ymat, C * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy gYmat");

    // Device-resident arrays of pointers (batch size 1) -- the batched API
    // dereferences these on the GPU, so they cannot be host addresses.
    float **ggAmat = NULL, **ggYmat = NULL;
    cudaCheck(cudaMalloc(&ggAmat, sizeof(float*)), "cudaMalloc ggAmat");
    cudaCheck(cudaMalloc(&ggYmat, sizeof(float*)), "cudaMalloc ggYmat");
    cudaCheck(cudaMemcpy(ggAmat, &gAmat, sizeof(float*), cudaMemcpyHostToDevice), "cudaMemcpy ggAmat");
    cudaCheck(cudaMemcpy(ggYmat, &gYmat, sizeof(float*), cudaMemcpyHostToDevice), "cudaMemcpy ggYmat");

    int info = 0;                 // host-side parameter-validation status
    int devInfoArray[1] = { 0 };  // not requested from cuBLAS (NULL below)

    cudaCheck(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    status = cublasSgelsBatched(m_cuBLAS,
        CUBLAS_OP_N,
        C,          // m: rows of each A
        M,          // n: cols of each A
        1,          // nrhs
        ggAmat,     // device array of device pointers to A
        lda,
        ggYmat,     // device array of device pointers to B/Y
        lda,        // ldb: C satisfies ldb >= max(M, C)
        &info,
        NULL,       // per-matrix device info not requested
        1);         // batch size

    std::cout << "status = " << status << std::endl;
    std::cout << "info = " << info << std::endl;
    std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;

    // The solve overwrote the RHS in place; the first M entries are the
    // least-squares solution.
    cudaCheck(cudaMemcpy(Xmat, gYmat, C * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy Xmat");

    std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;

    free(Amat);
    free(Ymat);
    free(Xmat);

    cudaFree(gAmat);
    cudaFree(gYmat);
    // Free the device pointer arrays too (leaked in the original).
    cudaFree(ggAmat);
    cudaFree(ggYmat);

    cublasDestroy(m_cuBLAS);

    return 0;
}
$ nvcc -o t130 t130.cu -lcublas
$ cuda-memcheck ./t130
========= CUDA-MEMCHECK
status = 0
status = 0
info = 0
devInfoArray = 0
-6.5, 9.7, 0.707106
========= ERROR SUMMARY: 0 errors
$
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • Thank you for your reaction, it runs now, which is a lot better than before, but when I run the same code as you I get a status 13 response, plus I get the wrong answer. Matlab tells me that the result should be [-6.5, 9.7]. With `A = [6, 7, 6, 5, 5, 5]` and `Y = [9, 3, 10]; ` Do you think this has to do with the installation then? – Taliebram May 04 '21 at 08:34
  • When I run with the A and Y values you suggest, the first 2 output values I get are indeed -6.5 and 9.7 and the reported status is zero. So if you are getting an error then I think you may have a problem with your setup. If you have a new CUDA install, it's usually a good idea to verify operation using one or more of the sample codes. I've updated my answer with this test using non-random values. – Robert Crovella May 04 '21 at 13:29