CMake CUDA: static link with cublas

Question

I want to compile the CUDALibrarySamples. The cuFFT samples use CMake, and I want to compile and link the 1d_c2c application against the static version of the cuFFT library (-lcufft_static). With a Makefile this is trivial; I simply added -lcufft_static to the following command:

nvcc -x cu $(FLAGS) $(INC) 1d_c2c_example.cpp -o 1d_c2c_example $(LIBS)

However, I am not sure how to do the same using CMake. I have noticed that CMake has static-library variables such as CUDA_cublasLt_static_LIBRARY, CUDA_cufft_static_LIBRARY, etc. So my question is: how can I enable them? Thank you in advance!

I have tried to

target_link_libraries(${ROUTINE}_example PRIVATE ${CUDA_cufft_static_LIBRARY})

but it does not seem to work.

Following @paleonix's suggestion, I did the following:

target_link_libraries(${ROUTINE}_example PRIVATE CUDA::cufft_static CUDA::cudart)

But I get the following error:

/usr/bin/ld: /opt/cuda/lib64/libcufft_static.a(cbdouble_32bit_prime_callback_RT_SM35_plus.o): in function `__sti____cudaRegisterAll()': cbdouble_32bit_prime_callback_RT_SM35_plus.compute_86.cudafe1.cpp:(.text.startup+0x1d): undefined reference to `__cudaRegisterLinkedBinary_61_cbdouble_32bit_prime_callback_RT_SM35_plus_compute_86_cpp1_ii_dc5d5345'

I am trying to build the following example from the CUDA Library Samples:

1d_c2c_example.cpp

#include <complex>
#include <iostream>
#include <random>
#include <vector>    
#include <cuda_runtime.h>
#include <cufftXt.h>
#include "cufft_utils.h"
int main(int argc, char *argv[]) {
    cufftHandle plan;
    cudaStream_t stream = NULL;
    
    int n = 8;
    int batch_size = 2;
    int fft_size = batch_size * n;
    
    using scalar_type = float;
    using data_type = std::complex<scalar_type>;
    std::vector<data_type> data(fft_size);
    for (int i = 0; i < fft_size; i++) {
        data[i] = data_type(i, -i);
    }
    
    std::printf("Input array:\n");
    for (auto &i : data) {
        std::printf("%f + %fj\n", i.real(), i.imag());
    }
    std::printf("=====\n");
    
    cufftComplex *d_data = nullptr;
    
    CUFFT_CALL(cufftCreate(&plan));
    CUFFT_CALL(cufftPlan1d(&plan, data.size(), CUFFT_C2C, batch_size));
    
    CUDA_RT_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUFFT_CALL(cufftSetStream(plan, stream));
    
    // Create device data arrays
    CUDA_RT_CALL(cudaMalloc(reinterpret_cast<void **>(&d_data), sizeof(data_type) * data.size()));
    CUDA_RT_CALL(cudaMemcpyAsync(d_data, data.data(), sizeof(data_type) * data.size(), cudaMemcpyHostToDevice, stream));
    
    CUFFT_CALL(cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD));
    CUFFT_CALL(cufftExecC2C(plan, d_data, d_data, CUFFT_INVERSE));
    
    CUDA_RT_CALL(cudaMemcpyAsync(data.data(), d_data, sizeof(data_type) * data.size(), cudaMemcpyDeviceToHost, stream));
    
    CUDA_RT_CALL(cudaStreamSynchronize(stream));
    
    /* free resources */
    CUDA_RT_CALL(cudaFree(d_data))
    CUFFT_CALL(cufftDestroy(plan));
    CUDA_RT_CALL(cudaStreamDestroy(stream));
    CUDA_RT_CALL(cudaDeviceReset());
    return EXIT_SUCCESS;
}

CMakeLists.txt

# Build script for the cuFFT 1d_c2c sample as posted in the question. It is the
# configuration that fails to link; see the note on target_link_libraries below.
cmake_minimum_required(VERSION 3.18)

# Base name shared by the project, the executable target and its source file.
set(ROUTINE 1d_c2c)

project(
  "${ROUTINE}_example"
  DESCRIPTION "GPU-Accelerated Fast Fourier Transforms"
  HOMEPAGE_URL "https://docs.nvidia.com/cuda/cufft/index.html"
  LANGUAGES CXX CUDA)

# NOTE(review): this exact assignment appears two more times further down.
set(CMAKE_CUDA_ARCHITECTURES 80)
# Supplies the CUDA:: imported targets (e.g. CUDA::cufft_static) linked below.
find_package(CUDAToolkit REQUIRED)

# Require C++11 for the targets defined after this point.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Fall back to a Release build when no build type was given.
if("${CMAKE_BUILD_TYPE}" STREQUAL "")
  set(CMAKE_BUILD_TYPE Release)
endif()

set(CMAKE_CUDA_ARCHITECTURES 80)
#if(CMAKE_CUDA_ARCHITECTURES LESS 60)
    #set(CMAKE_CUDA_ARCHITECTURES 60 70 75 80 86)
    #endif()
# NOTE(review): no add_library() call is visible in this file, so this setting
# presumably has no effect on how the CUDA libraries get linked -- confirm.
set(BUILD_SHARED_LIBS OFF)
# Place built executables in <build>/bin.
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
# Not referenced anywhere else in the lines shown here.
set(CUFFT_LIBRARIES ${CUDA_cufft_LIBRARY} ${CUDA_culibos_LIBRARY} ${CUDA_cudart_LIBRARY})


# Declare the executable first; its sources are attached via target_sources().
add_executable(${ROUTINE}_example)

# Header search paths: the CUDA toolkit headers plus the samples' shared utils
# directory one level above the top-level source directory.
target_include_directories(${ROUTINE}_example
                           PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} 
                           ${CMAKE_SOURCE_DIR}/../utils)

target_sources(${ROUTINE}_example
               PRIVATE ${PROJECT_SOURCE_DIR}/${ROUTINE}_example.cpp)

set(CMAKE_CUDA_ARCHITECTURES 80)
#target_link_libraries(${ROUTINE}_example PRIVATE ${CUDA_cufft_static_LIBRARY} CUDA::cufft CUDA::cudart)
# This is the link line that yields the undefined
# __cudaRegisterLinkedBinary_* reference reported in the question: the target
# links static cuFFT but does not set CUDA_RESOLVE_DEVICE_SYMBOLS.
target_link_libraries(${ROUTINE}_example PRIVATE CUDA::cufft_static CUDA::cudart)

When I removed

find_package(CUDAToolkit REQUIRED)

cmake shows me the following error:

CMake Error at CMakeLists.txt:82 (target_link_libraries):
   Target "1d_c2c_example" links to:

     CUDA::cufft_static

   but the target was not found.

`target_link_libraries(${ROUTINE}_example PRIVATE cufft_static)`. When you use [`FindCUDAToolkit()`](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html) instead of the language, add `CUDA::` in front of the library target. See the linked docs. — paleonix, Apr 20 '23 at 08:17
You are trying to use deprecated functionality from the old [`FindCUDA`](https://cmake.org/cmake/help/latest/module/FindCUDA.html). I would recommend against that if it is possible to use a CMake version that supports the newer API, 3.18 is a good start. — paleonix, Apr 20 '23 at 08:35
I will remove find_package and retry. Regarding .cu I do not think that this is an issue because with the Makefile it compiles. Do you think that there is an issue with .cu and cmake? — MANOS, Apr 20 '23 at 12:31

paleonix · Accepted Answer · 2023-04-25T12:13:47.473

The main reason you still had linker problems after using CUDA::cufft_static is that static cuFFT needs relocatable device code enabled. For compilation, CMake enables this via the CUDA_SEPARABLE_COMPILATION property; here we need it for linking, which is achieved via the CUDA_RESOLVE_DEVICE_SYMBOLS property.

I will retract my statement that one should not use both the CUDA language and find_package(CUDAToolkit REQUIRED). While the cufft_static target is available when using just the language, it does not automatically link culibos. So the more elegant solution seems to be to use CUDA::cufft_static from the package.

The CUDA architecture and build type should be set during the first configuration, either through command-line arguments (cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES=80) or by using ccmake to get a nice console UI.

# Builds the cuFFT 1d_c2c sample and links it against the static cuFFT library.
#
# Build type and CUDA architecture are deliberately not set here; pass them at
# configure time, e.g.
#   cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES=80 <source-dir>
cmake_minimum_required(VERSION 3.18)

# Base name shared by the project, the executable target and its source file.
set(ROUTINE 1d_c2c)

project(
  "${ROUTINE}_example"
  DESCRIPTION "GPU-Accelerated Fast Fourier Transforms"
  HOMEPAGE_URL "https://docs.nvidia.com/cuda/cufft/index.html"
  LANGUAGES CXX CUDA)

# Provides the CUDA:: imported targets linked below.
find_package(CUDAToolkit REQUIRED)

add_executable(${ROUTINE}_example)

set_target_properties(${ROUTINE}_example
  PROPERTIES
    # Static cuFFT contains relocatable device code, so a device-link step is
    # required even though the only source file is a plain .cpp file.
    CUDA_RESOLVE_DEVICE_SYMBOLS ON
    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")

target_compile_features(${ROUTINE}_example
  PRIVATE cxx_std_11)

target_sources(${ROUTINE}_example
  PRIVATE "${PROJECT_SOURCE_DIR}/${ROUTINE}_example.cpp")

# The shared sample headers (cufft_utils.h) live next to this project's
# directory. PROJECT_SOURCE_DIR keeps the path correct even when this project
# is pulled in via add_subdirectory().
target_include_directories(${ROUTINE}_example
  PRIVATE "${PROJECT_SOURCE_DIR}/../utils")

target_link_libraries(${ROUTINE}_example
  PRIVATE
    CUDA::cufft_static
    CUDA::cudart_static)

Thanks to @RobertJMaynard who helped me figure out how to do the linking right without changing the source file to .cu (i.e. use CUDA_RESOLVE_DEVICE_SYMBOLS and add CUDA::cudart_static) over at the CMake Discourse.

I am having the same issue with bazel and tensorflow. What is the equivalent for CUDA_RESOLVE_DEVICE_SYMBOLS in bazel? — MANOS, Apr 25 '23 at 12:59
I don't know bazel. Please open a new question. Googling the problem, [this bazel sample](https://github.com/bazel-contrib/rules_cuda/blob/main/examples/rdc/BUILD.bazel#L17) seems like it could help, i.e. `rdc = 1`. — paleonix, Apr 25 '23 at 13:12

CMake CUDA: static link with cublas

1d_c2c_example.cpp

CMakeLists.txt

1 Answers1