1

I am trying to use CHOLMOD with CUDA acceleration in SuiteSparse 4.4.4. I compiled it according to the user guide and could run gpu.sh in the Demo folder successfully, which showed that the GPU was doing part of the work. However, when I ran my own code using CHOLMOD, the number of GPU calls was always 0. I set Common->useGPU to 1, and the environment variable CHOLMOD_USE_GPU is also set to 1. My Makefile is shown below; the library paths are correct. Any suggestions?

I should have mentioned that I am just running the simplest possible test case: solving a single linear system.

I tried several matrices from UF Sparse Matrix Collection, but nvprof showed that no CUDA application was profiled.

Some of the matrices I tried:

bmw7st_1: http://www.cise.ufl.edu/research/sparse/matrices/GHS_psdef/bmw7st_1.html

nd6k: http://www.cise.ufl.edu/research/sparse/matrices/ND/nd6k.html

nd24k: http://www.cise.ufl.edu/research/sparse/matrices/ND/nd24k.html

Code:

#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <assert.h>
#include <sys/time.h>
#include "cholmod.h"

int main (void)
{
    struct timeval t1, t2;
    double elapsedTime;

    const char* matFile = "../bmw7st_1.mtx";
    FILE* fp = fopen(matFile, "r");
    assert(fp != NULL);

    cholmod_sparse *A ;
    cholmod_dense *x, *b;
    cholmod_factor *L ;

    cholmod_common* c = (cholmod_common*)malloc(sizeof(cholmod_common));
    cholmod_start (c) ; /* start CHOLMOD */
    c->useGPU = 1;
    c->supernodal = CHOLMOD_SUPERNODAL;

    A = cholmod_read_sparse (fp, c) ; /* read in a matrix */
    cholmod_print_sparse (A, "A", c) ; /* print the matrix */
    fclose(fp);

    if (A == NULL || A->stype == 0) /* A must be symmetric */
    {
        cholmod_free_sparse (&A, c) ;
        cholmod_finish (c) ;
        return (0) ;
    }

    b = cholmod_ones (A->nrow, 1, A->xtype, c) ; /* b = ones(n,1) */

    gettimeofday(&t1, NULL);
    L = cholmod_analyze (A, c) ; /* analyze */
    cholmod_factorize (A, L, c) ; /* factorize */
    x = cholmod_solve (CHOLMOD_A, L, b, c) ; /* solve Ax=b */
    gettimeofday(&t2, NULL);
    elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0;
    elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0;
    printf("Time: %.4f ms\n", elapsedTime);

    cholmod_free_factor (&L, c) ; /* free matrices */
    cholmod_free_sparse (&A, c) ;
    cholmod_free_dense (&x, c) ;
    cholmod_free_dense (&b, c) ;
    cholmod_finish (c) ; /* finish CHOLMOD */
    return (0) ;
}

Makefile:

# Builds the CHOLMOD test program from every .c file in this directory.
#
# CHOLMOD, AMD, COLAMD, CAMD, CCOLAMD, METIS, LAPACK, OPENBLAS, XERBLA,
# SUITESPARSE, CUDART_LIB and CUBLAS_LIB are not defined here; they must be
# supplied by the environment or on the make command line.

CC = gcc

# Compiler flags only. Libraries do not belong here: anything in CFLAGS is
# placed before the object files on the link line, where the linker cannot
# use it to resolve symbols from the archives that follow.
CFLAGS = -g -Wall -O2 \
-gdwarf-2

LIBS = $(CHOLMOD)/Lib/libcholmod.a \
$(AMD)/Lib/libamd.a \
$(COLAMD)/Lib/libcolamd.a \
$(LAPACK)/liblapack.a \
$(OPENBLAS)/lib/libopenblas.so \
$(XERBLA)/libcerbla.a \
$(METIS)/libmetis.a \
$(CAMD)/Lib/libcamd.a \
$(CCOLAMD)/Lib/libccolamd.a \
$(SUITESPARSE)/SuiteSparse_config/libsuitesparseconfig.a \
$(CUDART_LIB) \
$(CUBLAS_LIB)

# System libraries, listed after the static archives that depend on them.
LDLIBS = -lgfortran -lrt -lm

HEADER_DIR = $(CHOLMOD)/Include
CONFIG_HEADER_DIR = $(SUITESPARSE)/SuiteSparse_config

OBJ_DIR = .

BIN_DIR = .

INCLUDES = -I$(HEADER_DIR) \
-I$(CONFIG_HEADER_DIR)

# Use make's own globbing instead of spawning a shell to run ls.
SRCS = $(wildcard *.c)

# One object per source file, placed in OBJ_DIR.
OBJS = $(addprefix $(OBJ_DIR)/,$(SRCS:.c=.o))

APP = prog

RM = rm -f

.PHONY: all clean

all: $(BIN_DIR)/$(APP)

# Link exactly the objects built from SRCS ($^), not every .o that happens
# to be lying around in OBJ_DIR.
$(BIN_DIR)/$(APP): $(OBJS)
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -o $@ $^ $(LIBS) $(LDLIBS)

# Rebuild an object when its source or any CHOLMOD/SuiteSparse_config
# header changes.
$(OBJ_DIR)/%.o: %.c $(HEADER_DIR)/*.h $(CONFIG_HEADER_DIR)/*.h
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@

clean:
	$(RM) $(OBJS) $(BIN_DIR)/$(APP)
timrau
  • 22,578
  • 4
  • 51
  • 64
Shenghan Gao
  • 71
  • 1
  • 8
  • 1
    You should actually provide a complete test case, not just a makefile. – Robert Crovella Aug 14 '15 at 18:24
  • @RobertCrovella Sorry about that, the test case is added. – Shenghan Gao Aug 14 '15 at 18:44
  • 2
    Well, I guess nobody could try your test case without the matrix file. Can you use a standard matrix file that someone could grab? How big is the matrix? I'm pretty sure CHOLMOD (i.e. SuiteSparse) makes intelligent decisions about whether to use the GPU. Small test cases may not use the GPU. – Robert Crovella Aug 14 '15 at 18:52
  • @RobertCrovella Thank you. Actually I tried several matrices from UF Sparse Matrix Collection, but none of them used the GPU. – Shenghan Gao Aug 14 '15 at 19:10

1 Answer

1

Referring to section 7, p34 of the CHOLMOD UserGuide.pdf that ships with SuiteSparse 4.4.4:

Only the long integer version of CHOLMOD can leverage GPU acceleration.

The long integer version is distinguished by API calls such as cholmod_l_start instead of cholmod_start.

With the following modifications to your program:

#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <assert.h>
#include <sys/time.h>
#include "cholmod.h"

int main (void)
{
    struct timeval t1, t2;
    double elapsedTime;

    const char* matFile = "../Matrix/nd6k/nd6k.mtx";
    FILE* fp = fopen(matFile, "r");
    assert(fp != NULL);

    cholmod_sparse *A ;
    cholmod_dense *x, *b;
    cholmod_factor *L ;

    cholmod_common* c = (cholmod_common*)malloc(sizeof(cholmod_common));
    cholmod_l_start (c) ; /* start CHOLMOD */
    c->useGPU = 1;
    c->supernodal = CHOLMOD_SUPERNODAL;

    A = cholmod_l_read_sparse (fp, c) ; /* read in a matrix */
    cholmod_l_print_sparse (A, "A", c) ; /* print the matrix */
    fclose(fp);

    if (A == NULL || A->stype == 0) /* A must be symmetric */
    {
        cholmod_l_free_sparse (&A, c) ;
        cholmod_l_finish (c) ;
        return (0) ;
    }

    b = cholmod_l_ones (A->nrow, 1, A->xtype, c) ; /* b = ones(n,1) */

    gettimeofday(&t1, NULL);
    L = cholmod_l_analyze (A, c) ; /* analyze */
    cholmod_l_factorize (A, L, c) ; /* factorize */
    x = cholmod_l_solve (CHOLMOD_A, L, b, c) ; /* solve Ax=b */
    gettimeofday(&t2, NULL);
    elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0;
    elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0;
    printf("Time: %.4f ms\n", elapsedTime);
    cholmod_l_gpu_stats(c);
    cholmod_l_free_factor (&L, c) ; /* free matrices */
    cholmod_l_free_sparse (&A, c) ;
    cholmod_l_free_dense (&x, c) ;
    cholmod_l_free_dense (&b, c) ;
    cholmod_l_finish (c) ; /* finish CHOLMOD */
    return (0) ;
}

I get output like this:

$ ./prog
CHOLMOD sparse:  A:  18000-by-18000, nz 3457658, upper.  OK
Time: 14570.3950 ms

CHOLMOD GPU/CPU statistics:
SYRK  CPU calls          888 time   1.0637e-01
      GPU calls          213 time   8.9194e-02
GEMM  CPU calls          711 time   1.1511e-01
      GPU calls          213 time   1.9351e-03
POTRF CPU calls          217 time   3.2180e-02
      GPU calls            5 time   1.5788e-01
TRSM  CPU calls          217 time   6.0409e-01
      GPU calls            4 time   5.6943e-02
time in the BLAS: CPU   8.5774e-01 GPU   3.0595e-01 total:   1.1637e+00
assembly time   0.0000e+00    0.0000e+00
$

indicating the GPU is being used.

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257