1

I am trying to use Unified Memory with cudaMallocManaged() with the cuBLAS library. I am performing a simple matrix to vector multiplication as a simple example, and storing the result in an array results. However when printing the results array, I get back all 0's, instead of the results of multiplying the matrix mat by the vector vec.
The flow I am using is:

  1. allocating memory with cudaMallocManaged()
  2. Initializing the arrays with data
  3. Allocating the cuBLAS handle
  4. Calling cublasDgemv to perform the multiplication storing the results in results

When using new and then cublasSetMatrix() or cublasSetVector() this works fine.

How do I use Unified Memory with cuBLAS?

Here are minimum working examples:

Unified Memory Attempt (this gives back all 0's in results):

#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <ctime>
#include "cublas_v2.h"

#define cudaErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

static const char *cublasErrChk(cublasStatus_t error)
{
    switch (error)
    {
        case CUBLAS_STATUS_SUCCESS:
            return "CUBLAS_STATUS_SUCCESS";

        case CUBLAS_STATUS_NOT_INITIALIZED:
            return "CUBLAS_STATUS_NOT_INITIALIZED";

        case CUBLAS_STATUS_ALLOC_FAILED:
            return "CUBLAS_STATUS_ALLOC_FAILED";

        case CUBLAS_STATUS_INVALID_VALUE:
            return "CUBLAS_STATUS_INVALID_VALUE";

        case CUBLAS_STATUS_ARCH_MISMATCH:
            return "CUBLAS_STATUS_ARCH_MISMATCH";

        case CUBLAS_STATUS_MAPPING_ERROR:
            return "CUBLAS_STATUS_MAPPING_ERROR";

        case CUBLAS_STATUS_EXECUTION_FAILED:
            return "CUBLAS_STATUS_EXECUTION_FAILED";

        case CUBLAS_STATUS_INTERNAL_ERROR:
            return "CUBLAS_STATUS_INTERNAL_ERROR";
    }

    return "<unknown>";
}

int main() {

    size_t dims = 4;

    double *vec, *mat, *results;

    cudaErrChk( cudaMallocManaged(&vec, dims * sizeof(double)) );
    cudaErrChk( cudaMallocManaged(&mat, dims * dims * sizeof(double)) );
    cudaErrChk( cudaMallocManaged(&results, dims * sizeof(double)) );

    printf("Vector:\n");
    for (int i = 1; i < dims + 1; i++) {
        vec[i] = 0.5 * i;
        printf("%.2lf ", vec[i]);
    } 
    printf("\n\nMatrix:\n");

    for (int i = 1; i < dims * dims + 1; i++) {
        mat[i] = 1.0 * i;
        printf("%.2lf ", mat[i]);

        if (i % dims == 0)
            printf("\n");
    }
    printf("\n");

    cublasHandle_t handle;
    cublasErrChk( cublasCreate(&handle) );

    double alpha = 1.f, beta = 1.f;

    // multiply mat by vec to get results
    cublasErrChk(
        cublasDgemv(
            handle, CUBLAS_OP_N,
            dims, dims,
            &alpha,
            mat, dims,
            vec, 1,
            &beta,
            results, 1
        )
    );

    for (int i = 0; i < dims; i++)
        printf("%.2lf ", results[i]);
    printf("\n");

    cudaErrChk( cudaFree(vec) );
    cudaErrChk( cudaFree(mat) );
    cudaErrChk( cudaFree(results) );

    return 0;
}

Regular malloc/setMatrix() Attempt:

#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <ctime>
#include "cublas_v2.h"

#define cudaErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

static const char *cublasErrChk(cublasStatus_t error)
{
    switch (error)
    {
        case CUBLAS_STATUS_SUCCESS:
            return "CUBLAS_STATUS_SUCCESS";

        case CUBLAS_STATUS_NOT_INITIALIZED:
            return "CUBLAS_STATUS_NOT_INITIALIZED";

        case CUBLAS_STATUS_ALLOC_FAILED:
            return "CUBLAS_STATUS_ALLOC_FAILED";

        case CUBLAS_STATUS_INVALID_VALUE:
            return "CUBLAS_STATUS_INVALID_VALUE";

        case CUBLAS_STATUS_ARCH_MISMATCH:
            return "CUBLAS_STATUS_ARCH_MISMATCH";

        case CUBLAS_STATUS_MAPPING_ERROR:
            return "CUBLAS_STATUS_MAPPING_ERROR";

        case CUBLAS_STATUS_EXECUTION_FAILED:
            return "CUBLAS_STATUS_EXECUTION_FAILED";

        case CUBLAS_STATUS_INTERNAL_ERROR:
            return "CUBLAS_STATUS_INTERNAL_ERROR";
    }

    return "<unknown>";
}

int main() {

    size_t dims = 4;

    double *h_vec, *h_mat, *h_results;

    h_vec = new double[dims];
    h_mat = new double[dims * dims];
    h_results = new double[dims];

    printf("Vector:\n");
    for (int i = 1; i < dims + 1; i++) {
        h_vec[i] = 0.5 * i;
        printf("%.2lf ", h_vec[i]);
    } 
    printf("\n\nMatrix:\n");

    for (int i = 1; i < dims * dims + 1; i++) {
        h_mat[i] = 1.0 * i;
        printf("%.2lf ", h_mat[i]);

        if (i % dims == 0)
            printf("\n");
    }
    printf("\n");

    double *d_vec, *d_mat, *d_results;

    cudaErrChk( cudaMalloc(&d_vec, dims * sizeof(double)) );
    cudaErrChk( cudaMalloc(&d_mat, dims * dims * sizeof(double)) );
    cudaErrChk( cudaMalloc(&d_results, dims * sizeof(double)) );

    cublasHandle_t handle;
    cublasErrChk( cublasCreate(&handle) );

    // copy the data manually to the GPUs
    cublasErrChk( cublasSetVector(dims, sizeof(*d_vec), h_vec, 1, d_vec, 1) );
    cublasErrChk( cublasSetMatrix(dims, dims, sizeof(double), h_mat, dims, d_mat, dims) );

    double alpha = 1.f, beta = 1.f;

    // // multiply mat by vec to get results
    cublasErrChk(
        cublasDgemv(
            handle, CUBLAS_OP_N,
            dims, dims,
            &alpha,
            d_mat, dims,
            d_vec, 1,
            &beta,
            d_results, 1
        )
    );

    cublasErrChk( cublasGetVector(dims, sizeof(*h_results), d_results, 1, h_results, 1) );

    for (int i = 0; i < dims; i++)
        printf("%.2lf ", h_results[i]);
    printf("\n");

    cudaErrChk( cudaFree(d_vec) );
    cudaErrChk( cudaFree(d_mat) );
    cudaErrChk( cudaFree(d_results) );

    delete [] h_vec;
    delete [] h_mat;
    delete [] h_results;

    return 0;
}

Compile with
nvcc -o main main.cu -lcublas

talonmies
  • 70,661
  • 34
  • 192
  • 269
  • 1
    CUBLAS calls like GEMV are asynchronous. You need a synchronization call after the GEMV call before trying to print results. You are printing before the GEMV has finished – talonmies Dec 30 '20 at 05:59
  • 1
    Please add your solution as a short answer for future visitors. Later your will be able to accept your answer and that marks it as answered – talonmies Dec 30 '20 at 14:39

1 Answers1

3

As @talonmies pointed out, the problem was that I was using an asynchronous call and not getting the results back in time. This is fixed by adding cudaDeviceSynchronize() after the cublasDgemv() call:

#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <ctime>
#include "cublas_v2.h"

#define cudaErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

static const char *cublasErrChk(cublasStatus_t error)
{
    switch (error)
    {
        case CUBLAS_STATUS_SUCCESS:
            return "CUBLAS_STATUS_SUCCESS";

        case CUBLAS_STATUS_NOT_INITIALIZED:
            return "CUBLAS_STATUS_NOT_INITIALIZED";

        case CUBLAS_STATUS_ALLOC_FAILED:
            return "CUBLAS_STATUS_ALLOC_FAILED";

        case CUBLAS_STATUS_INVALID_VALUE:
            return "CUBLAS_STATUS_INVALID_VALUE";

        case CUBLAS_STATUS_ARCH_MISMATCH:
            return "CUBLAS_STATUS_ARCH_MISMATCH";

        case CUBLAS_STATUS_MAPPING_ERROR:
            return "CUBLAS_STATUS_MAPPING_ERROR";

        case CUBLAS_STATUS_EXECUTION_FAILED:
            return "CUBLAS_STATUS_EXECUTION_FAILED";

        case CUBLAS_STATUS_INTERNAL_ERROR:
            return "CUBLAS_STATUS_INTERNAL_ERROR";
    }

    return "<unknown>";
}

int main() {

    size_t dims = 4;

    double *vec, *mat, *results;

    cudaErrChk( cudaMallocManaged(&vec, dims * sizeof(double)) );
    cudaErrChk( cudaMallocManaged(&mat, dims * dims * sizeof(double)) );
    cudaErrChk( cudaMallocManaged(&results, dims * sizeof(double)) );

    printf("Vector:\n");
    for (int i = 1; i < dims + 1; i++) {
        vec[i] = 0.5 * i;
        printf("%.2lf ", vec[i]);
    } 
    printf("\n\nMatrix:\n");

    for (int i = 1; i < dims * dims + 1; i++) {
        mat[i] = 1.0 * i;
        printf("%.2lf ", mat[i]);

        if (i % dims == 0)
            printf("\n");
    }
    printf("\n");

    cublasHandle_t handle;
    cublasErrChk( cublasCreate(&handle) );

    double alpha = 1.f, beta = 1.f;

    // multiply mat by vec to get results
    cublasErrChk(
        cublasDgemv(
            handle, CUBLAS_OP_N,
            dims, dims,
            &alpha,
            mat, dims,
            vec, 1,
            &beta,
            results, 1
        )
    );
    cudaDeviceSynchronize();

    for (int i = 0; i < dims; i++)
        printf("%.2lf ", results[i]);
    printf("\n");

    cudaErrChk( cudaFree(vec) );
    cudaErrChk( cudaFree(mat) );
    cudaErrChk( cudaFree(results) );

    return 0;
}