
Following the same steps as the CUDA samples to launch a kernel and synchronize across the grid using cooperative_groups::this_grid().sync() causes every CUDA API call to fail, while using cooperative_groups::this_thread_block().sync() works fine and gives correct results.

I used the following code and CMakeLists.txt (cmake version 3.11.1) to test this with CUDA 10 on a TITAN V GPU (driver version 410.73) running Ubuntu 16.04.5 LTS. The code is also available on github to make the error easy to reproduce.

The code reads an array and then reverses it (from [0 1 2 ... 9] to [9 8 7 ... 0]). To do this, each thread reads a single element from the array, synchronizes, and then writes its element to the mirrored destination. The code can easily be modified to confirm that this_thread_block().sync() works fine: simply change arr_size to be less than 1024 and use cg::thread_block barrier = cg::this_thread_block(); instead, as sketched just below.
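For reference, the block-wide variant amounts to these two changes; everything else stays as in the listing below (512 is one safe choice here, since the launch below uses 512-thread blocks):

//in execute_test(): shrink the array so a single block covers it
const uint32_t arr_size = 512; //was 1 << 20

//in the kernel: block-wide barrier instead of the grid-wide one
cg::thread_block barrier = cg::this_thread_block();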

test_cg.cu

#include <cuda_runtime_api.h>
#include <cuda.h>
#include <cooperative_groups.h>
#include <stdio.h>
#include <cstdint>
#include <numeric>
namespace cg = cooperative_groups;

//********************** CUDA_ERROR
inline void HandleError(cudaError_t err, const char *file, int line) {
    //Error-handling macro; wrap it around CUDA calls whenever possible
    if (err != cudaSuccess) {
        printf("\n%s in %s at line %d\n", cudaGetErrorString(err), file, line);

#ifdef _WIN32
        system("pause");
#else
        exit(EXIT_FAILURE);
#endif
    }
}
#define CUDA_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
//******************************************************************************


//********************** cg kernel 
__global__ void testing_cg_grid_sync(const uint32_t num_elements,
    uint32_t *d_arr){
    uint32_t tid = threadIdx.x + blockDim.x*blockIdx.x;

    if (tid < num_elements){

        uint32_t my_element = d_arr[tid];

        //to sync across the whole grid 
        cg::grid_group barrier = cg::this_grid();

        //to sync within a single block 
        //cg::thread_block barrier = cg::this_thread_block();

        //wait for all reads 
        barrier.sync();

        uint32_t tar_id = num_elements - tid - 1;

        d_arr[tar_id] = my_element;
    }
}
//******************************************************************************


//********************** execute  
void execute_test(const int sm_count){

    //host array 
    const uint32_t arr_size = 1 << 20; //1M 
    uint32_t* h_arr = (uint32_t*)malloc(arr_size * sizeof(uint32_t));
    //fill with sequential numbers
    std::iota(h_arr, h_arr + arr_size, 0);

    //device array 
    uint32_t* d_arr;
    CUDA_ERROR(cudaMalloc((void**)&d_arr, arr_size*sizeof(uint32_t)));
    CUDA_ERROR(cudaMemcpy(d_arr, h_arr, arr_size*sizeof(uint32_t),
        cudaMemcpyHostToDevice));

    //launch config
    const int threads = 512;

    //following the same steps done in conjugateGradientMultiBlockCG.cu 
    //cuda sample to launch kernel that sync across grid 
    //https://github.com/NVIDIA/cuda-samples/blob/master/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG.cu#L436

    int num_blocks_per_sm = 0;
    CUDA_ERROR(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm,
        (void*)testing_cg_grid_sync, threads, 0));

    dim3 grid_dim(sm_count * num_blocks_per_sm, 1, 1), block_dim(threads, 1, 1);

    if(arr_size > grid_dim.x*block_dim.x){
         printf("\n The grid size (numBlocks*numThreads) is less than array size.\n");
         exit(EXIT_FAILURE);
    }
    printf("\n Launching %d blocks, each containing %d threads", grid_dim.x,
        block_dim.x);

    //argument passed to the kernel     
    void *kernel_args[] = {
        (void *)&arr_size,
        (void *)&d_arr, };


    //finally launch the kernel 
    CUDA_ERROR(cudaLaunchCooperativeKernel((void*)testing_cg_grid_sync,
        grid_dim, block_dim, kernel_args));


    //make sure everything went okay
    CUDA_ERROR(cudaGetLastError());
    CUDA_ERROR(cudaDeviceSynchronize());


    //get results on the host 
    CUDA_ERROR(cudaMemcpy(h_arr, d_arr, arr_size*sizeof(uint32_t),
        cudaMemcpyDeviceToHost));

    //validate 
    for (uint32_t i = 0; i < arr_size; i++){
        if (h_arr[i] != arr_size - i - 1){
            printf("\n Result mismatch in h_arr[%u] = %u\n", i, h_arr[i]);
            exit(EXIT_FAILURE);
        }
    }
    //cleanup
    CUDA_ERROR(cudaFree(d_arr));
    free(h_arr);
}
//******************************************************************************

int main(int argc, char**argv) {

    //select the TITAN V (device 0 in this setup)
    int device_id = 0;
    CUDA_ERROR(cudaSetDevice(device_id));

    //get sm count 
    cudaDeviceProp devProp;
    CUDA_ERROR(cudaGetDeviceProperties(&devProp, device_id));
    int sm_count = devProp.multiProcessorCount;

    //execute 
    execute_test(sm_count);

    printf("\n Mission accomplished \n");
    return 0;
}

CMakeLists.txt

cmake_minimum_required(VERSION 3.8 FATAL_ERROR)

set(PROJECT_NAME "test_cg")
project(${PROJECT_NAME} LANGUAGES CXX CUDA)  

#default build type is Release
if (CMAKE_BUILD_TYPE STREQUAL "")
    set(CMAKE_BUILD_TYPE Release)
endif ()

SET(CUDA_SEPARABLE_COMPILATION ON)

########## Libraries/flags Starts Here ######################
find_package(CUDA REQUIRED)
include_directories("${CUDA_INCLUDE_DIRS}")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -lineinfo; -std=c++11; -expt-extended-lambda; -O3; -use_fast_math; -rdc=true;)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode=arch=compute_70,code=sm_70) #for TITAN V
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -Wall -std=c++11")
########## Libraries/flags Ends Here ######################


########## inc/libs/exe/features Starts Here ######################
set(CMAKE_INCLUDE_CURRENT_DIR ON)
CUDA_ADD_EXECUTABLE(${PROJECT_NAME} test_cg.cu)
target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_11)
set_target_properties(${PROJECT_NAME} PROPERTIES POSITION_INDEPENDENT_CODE  ON)
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)    
target_link_libraries(${PROJECT_NAME} ${CUDA_LIBRARIES} ${CUDA_cudadevrt_LIBRARY})
########## inc/libs/exe/features Ends Here ######################

Running this code gives:

unknown error in /home/ahdhn/test_cg/test_cg.cu at line 67

This is the first line that uses cudaMalloc. I made sure that the code is compiled for the correct architecture by querying __CUDA_ARCH__ from the device, and the result is 700 (a sketch of such a check is below). Kindly let me know if you can spot anything wrong in the code or the CMakeLists.txt file.
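(The architecture check mentioned above is along these lines; a hypothetical sketch, not the exact kernel used:)

__global__ void report_arch(){
#ifdef __CUDA_ARCH__
    printf("__CUDA_ARCH__ = %d\n", __CUDA_ARCH__);
#endif
}
//launched as report_arch<<<1, 1>>>(), followed by cudaDeviceSynchronize()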

ahmed
  • The cudaMalloc call should be the call which triggers context initialization. If that is failing, you have probably found a bug in the CUDA runtime. – talonmies Nov 27 '18 at 08:23
  • Maybe your CUDA install is broken. Maybe your cmake setup is not creating the correct compilation settings. In any event, this code could not possibly work, because you are attempting to use an array size of `1<<20` and there isn't any current CUDA GPU that has an instantaneous thread capacity that large. Volta has an instantaneous capacity of `2048*80`. I would suggest compiling from the command line to remove CMake from the equation. When I do that on a functional volta setup, I get the error "The grid size (numBlocks*numThreads) is less than array size." – Robert Crovella Nov 27 '18 at 14:52
  • Thank you for the helpful comments. With external help, I was able to get the code running using CMake and with array size of `1<<20`. The trick is to add `string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_70,code=sm_70 --cudart shared")` after the second `set(CUDA_NVCC_FLAGS.....`. The reason is that I only have `libcudadevrt.a` under my `/usr/local/cuda-10.0/lib64/`, so I have to signal CUDA to link the shared/dynamic runtime library, since the default is to link to the static one. – ahmed Nov 27 '18 at 20:22
  • Your code cannot work correctly for an array size of `1<<20`. The occupancy API call could not be returning a `num_blocks_per_sm` value larger than 4. With 4 blocks per SM, the largest grid size is 2048*80 on volta. This is less than `1<<20` – Robert Crovella Nov 27 '18 at 22:12
  • @RobertCrovella agreed. I had to reduce the array size to get the code working and produce some meaningful results. – ahmed Nov 28 '18 at 15:50
  • The kernel here will deadlock. The grid synchronization needs to be done by all threads in all blocks, so the if statement needs to be split into two pieces, with the sync happening in between (see the sketch after these comments). – dlasalle Aug 28 '19 at 21:36
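A minimal sketch of the restructuring dlasalle describes, splitting the bounds check around the barrier so that every thread in the grid reaches the sync (an editorial illustration, not code from the thread):

__global__ void testing_cg_grid_sync_fixed(const uint32_t num_elements,
    uint32_t *d_arr){
    uint32_t tid = threadIdx.x + blockDim.x*blockIdx.x;

    //read only in-range elements, but let every thread fall through
    uint32_t my_element = 0;
    if (tid < num_elements){
        my_element = d_arr[tid];
    }

    //every thread in every block must reach this point, or the sync deadlocks
    cg::grid_group barrier = cg::this_grid();
    barrier.sync();

    //write only in-range elements
    if (tid < num_elements){
        d_arr[num_elements - tid - 1] = my_element;
    }
}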

1 Answer


With external help, the fix that originally got the code working was to add string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_70,code=sm_70 --cudart shared") after the second set(CUDA_NVCC_FLAGS...... My reasoning was that I only have libcudadevrt.a under /usr/local/cuda-10.0/lib64/, so CUDA had to be told to link the shared/dynamic runtime library instead of the static default. As the comment below points out, however, that reasoning was wrong; the part that actually matters is string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_70,code=sm_70"), because the sm_70 flag was not being passed to the linker properly.

Additionally, using only CUDA_NVCC_FLAGS passes the sm_70 information to the compiler but not to the linker, while using only CMAKE_CUDA_FLAGS produces error: namespace "cooperative_groups" has no member "grid_group". The relevant portion of the corrected CMakeLists.txt is sketched below.
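For reference, this is roughly how the flags section of the CMakeLists.txt above looks with the fix applied (a sketch for the same CUDA 10 / sm_70 setup as in the question):

set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -lineinfo; -std=c++11; -expt-extended-lambda; -O3; -use_fast_math; -rdc=true;)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode=arch=compute_70,code=sm_70) #for TITAN V
#also pass the architecture to the link step, which CUDA_NVCC_FLAGS alone does not do
string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_70,code=sm_70")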

ahmed
  • There is no dynamic version of `libcudadevrt.a`, and the `--cudart shared` flag does not affect linking to libcudadevrt, which is the device runtime. It affects linking to the ordinary runtime, libcudart. – Robert Crovella Nov 27 '18 at 22:13
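As Robert Crovella suggests in the question comments, building directly with nvcc is a good way to take CMake out of the equation. A rough command-line equivalent of the flags above (an editorial sketch, not taken from the thread) would be:

nvcc -std=c++11 -O3 -lineinfo -arch=sm_70 -rdc=true test_cg.cu -o test_cg

The -rdc=true flag enables relocatable device code and a device-link step, which grid-wide cooperative launch requires; in this mode nvcc should pull in the cudadevrt device runtime automatically.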