
I have source code that parallelizes a model using a GPU library, and I am trying to compile it on a UNIX system. However, the computation time does not speed up when I use more than one GPU node (5, 10, 20, 30, ...), so I suspect I am not using the right compiler. 'solve.cu' is the source file that uses the CUDA library. The makefile I wrote is the following:

    # SHELL=/bin/ksh
    SOURCELOC = 
    UTILITYLOC = 
    NEWMOD = 
    PROGRAM = mf2k
    INCDIR= .

    # Define the Fortran compile flags

    F77FLAGS=
    F90FLAGS=
    F77= mpif77
    F90= mpif90

    # 
    # Define the C compile flags
    # -D_UF defines UNIX naming conventions for mixed language compilation.
    # 
    CFLAGS= -D_UF -O3
    CC= mpicc

    # Define GMG objects
    #
    GMG = r_vector.o\
    solvers.o\
    ccfd.o\
    mf2kgmg.o\
    gmg1.o

    # 
    # Define the Cuda compile flags
    # -D_UF defines UNIX naming conventions for mixed language compilation.
    # 
    CUDAFLAGS= 
    CUDACC= nvcc

    CUDA_INC= -I /opt/apps/cuda/4.1.28/include
    VT_MPI_INC= -I /opt/apps/intel13_1/openmpi/1.6.4/include
    CUDA_LIB64= /opt/apps/cuda/4.1.28/lib64 
    VT_MPI_LIB= /opt/apps/intel13_1/openmpi/1.6.4/lib

    LFLAGS = -L$(VT_MPI_LIB) -lmpi -L$(CUDA_LIB64) -lcuda -lcudart

    CUSPINCDIR= -I /home/zhangmj/MF2K_JIXIAOHUI_MAKE


    # Define CUSP objects
    #
    CUSP = solve.o

    # Define the libraries

    #SYSLIBS= -lmisalign -ldgc -lm 
    SYSLIBS= -lc
    USRLIB  = 

    # Define all object files which make up Modtools

    OBJECTS = \
    outputA_b.o \
    mf2k.o \
    mhc1.o \
    ctime.o \
    daf1.o \
    de45.o \
    glo1bas6.o \
    gutsdaf.o \
    gwf1bas6.o \
    gwf1bcf6.o \
    gwf1chd6.o \
    gwf1drn6.o \
    gwf1drt1.o \
    gwf1ets1.o \
    gwf1evt6.o \
    gwf1fhb1.o \
    gwf1gag5.o \
    gwf1ghb6.o \
    gwf1hfb6.o \
    gwf1huf2.o \
    gwf1ibs6.o \
    gwf1lak3.o \
    gwf1lpf1.o \
    gwf1mnw1.o \
    gwf1mnw2.o \
    gwf1mnwi.o \
    gwf1rch6.o \
    gwf1res1.o \
    gwf1riv6.o \
    gwf1sfr2.o \
    gwf1str6.o \
    gwf1sub1.o \
    gwf1swt1.o \
    gwf1wel6.o \
    hufutl2.o \
    hydmod.o \
    lmg1.o \
    lmt6.o \
    memchk.o \
    obs1adv2.o \
    obs1bas6.o \
    obs1drn6.o \
    obs1drt1.o \
    obs1ghb6.o \
    obs1riv6.o \
    obs1str6.o \
    parutl1.o \
    pcg2.o \
    pes1bas6.o \
    pes1gau1.o \
    rtedaf.o \
    sen1bas6.o \
    sen1chd6.o \
    sen1drn6.o \
    sen1drt1.o \
    sen1ets1.o \
    sen1evt6.o \
    sen1ghb6.o \
    sen1hfb6.o \
    sen1huf2.o \
    sen1lpf1.o \
    sen1rch6.o \
    sen1riv6.o \
    sen1str6.o \
    sen1wel6.o \
    sip5.o \
    sor5.o \
    utl6.o \
    para-non.o

    install: mf2k

    # Define Task Function Program Modtools

    all: mf2k

    # Define what Modtools is

    mf2k: $(OBJECTS) $(GMG) $(CUSP)
    	-$(F77) $(F77FLAGS) -o mf2k -L /opt/apps/cuda/4.1.28/lib64 -lcudart $(OBJECTS) $(GMG) $(CUSP) $(USRLIB) $(SYSLIBS)

    # Modtools_Object codes

    mf2k.o: mf2k.f
    	$(F77) $(F77FLAGS) -c mf2k.f


    para-non.o: serial/para-non.f
    	$(F77) $(F77FLAGS) -I$(INCDIR) -c serial/para-non.f


    # Pth_Object codes of Modtools

    .f.o:
    	$(F77) $(F77FLAGS) -c $<

    mhc1.o:
    	$(F90) $(F90FLAGS) -c mhc1.f90

    .c.o:
    	$(CC) $(CFLAGS) -c $<

    solve.o: solve.cu
    	$(CUDACC) -c -arch sm_13 $(LFLAGS) $(VT_MPI_INC) $(CUDA_INC) $(CUSPINCDIR) solve.cu

    #  end

I would greatly appreciate any help. Thank you so much.

1 Answer

There is only one compiler you can use to compile that CUSP sparse linear algebra code in your application: nvcc, and you are already using it.

I don't understand how you reached the conclusion that the GPU compiler has anything to do with MPI scalability of your code, but it clearly isn't the source of whatever problem you are having.
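That said, a very common reason an MPI + CUDA code shows no speed-up as you add ranks is that every rank ends up computing on the same device. Purely as an illustration (a generic sketch with made-up names, not code taken from your solve.cu), the usual pattern is to bind each rank to a GPU right after MPI_Init and print out the mapping:

    #include <mpi.h>
    #include <cuda_runtime.h>
    #include <stdio.h>

    /* Round-robin each MPI rank onto one of the GPUs visible on its node.
       Assumes ranks on the same node get consecutive rank numbers, so
       rank % device_count is a crude but workable mapping. */
    static void bind_rank_to_gpu(void)
    {
        int rank = 0, ndev = 0, dev = 0;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        cudaGetDeviceCount(&ndev);
        if (ndev > 0) {
            dev = rank % ndev;
            cudaSetDevice(dev);
        }
        printf("rank %d -> GPU %d of %d\n", rank, dev, ndev);
    }

If that printout shows every rank sitting on GPU 0, the problem is in how devices are assigned to ranks, not in which compiler you used.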

talonmies
  • @user2639490: if this answered your question, please consider accepting the answer. This will mark the question as answered and make it easier for other people with the same problem to find it through search in the future. – talonmies Oct 04 '13 at 04:33