
I'm working with a SLAM system. I've installed DSO, whose code can be seen here:

https://github.com/JakobEngel/dso

Everything works fine: I managed to compile and run it without errors. But now I want to parallelize the code using CUDA, and I'm having a lot of trouble adapting its CMakeLists.txt so that CUDA can be used. The original CMakeLists.txt from DSO is available here:

dso CMakeLists.txt

I'm trying to adapt it, basing my changes on another author's implementation for a different SLAM system:

ORB SLAM 2 CMakeLists.txt using CUDA

Right now my CMakeLists.txt, with my changes (not working), looks like this:

SET(PROJECT_NAME DSO)

PROJECT(${PROJECT_NAME})
CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
#set(CMAKE_VERBOSE_MAKEFILE ON)

set(BUILD_TYPE Release)
#set(BUILD_TYPE RelWithDebInfo) 

set(EXECUTABLE_OUTPUT_PATH bin)
set(LIBRARY_OUTPUT_PATH lib)
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)

# required libraries
#SET(CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH} "/usr/include")
find_package(SuiteParse REQUIRED)
find_package(Eigen3 REQUIRED)
find_package(Boost)

# optional libraries
find_package(LibZip QUIET)
find_package(Pangolin 0.2 QUIET)
find_package(OpenCV QUIET)
#find_package(OpenACC)

# flags
add_definitions("-DENABLE_SSE")
set(CMAKE_CXX_FLAGS
   "${SSE_FLAGS} -O3 -g -std=c++11"
)

set(CMAKE_C_FLAGS
    "${SSE_FLAGS} -O3 -g -std=c++11"
)

#LIST(APPEND CMAKE_C_FLAGS "-Wall -Wextra -DUSE_NVTX") <<<< Error: doesn't recognize -Wall -Wextra
#LIST(APPEND CMAKE_CXX_FLAGS "-Wall -Wextra -DUSE_NVTX") << Error: doesn't recognize -Wall -Wextra

find_package(CUDA REQUIRED)
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
SET(CUDA_HOST_COMPILER /usr/bin/g++)
LIST(APPEND CUDA_NVCC_FLAGS "--compiler-options -fno-strict-aliasing -use_fast_math -ccbin gcc-5")

set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11")

if (MSVC)
     set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
endif (MSVC)

set(CMAKE_LIBRARY_OUTPUT_DIRECTORY lib)

cuda_include_directories(
  ${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc
)



# Sources files
set(dso_SOURCE_FILES
  ${PROJECT_SOURCE_DIR}/src/FullSystem/FullSystem.cpp
  ${PROJECT_SOURCE_DIR}/src/FullSystem/FullSystemOptimize.cpp
  ${PROJECT_SOURCE_DIR}/src/FullSystem/FullSystemOptPoint.cpp
  ${PROJECT_SOURCE_DIR}/src/FullSystem/FullSystemDebugStuff.cpp
  ${PROJECT_SOURCE_DIR}/src/FullSystem/FullSystemMarginalize.cpp
  ${PROJECT_SOURCE_DIR}/src/FullSystem/Residuals.cpp
  ${PROJECT_SOURCE_DIR}/src/FullSystem/CoarseTracker.cpp
  ${PROJECT_SOURCE_DIR}/src/FullSystem/CoarseInitializer.cpp
  ${PROJECT_SOURCE_DIR}/src/FullSystem/ImmaturePoint.cpp
  ${PROJECT_SOURCE_DIR}/src/FullSystem/HessianBlocks.cpp
  ${PROJECT_SOURCE_DIR}/src/FullSystem/PixelSelector2.cpp
  ${PROJECT_SOURCE_DIR}/src/OptimizationBackend/EnergyFunctional.cpp
  ${PROJECT_SOURCE_DIR}/src/OptimizationBackend/AccumulatedTopHessian.cpp
  ${PROJECT_SOURCE_DIR}/src/OptimizationBackend/AccumulatedSCHessian.cpp
  ${PROJECT_SOURCE_DIR}/src/OptimizationBackend/EnergyFunctionalStructs.cpp
  ${PROJECT_SOURCE_DIR}/src/util/settings.cpp
  ${PROJECT_SOURCE_DIR}/src/util/Undistort.cpp
  ${PROJECT_SOURCE_DIR}/src/util/globalCalib.cpp
)


include_directories(
  ${PROJECT_SOURCE_DIR}/src
  ${PROJECT_SOURCE_DIR}/thirdparty/Sophus
  ${PROJECT_SOURCE_DIR}/thirdparty/sse2neon
  ${EIGEN3_INCLUDE_DIR}
) 


# decide if we have pangolin
if (Pangolin_FOUND)
    message("--- found PANGOLIN, compiling dso_pangolin library.")
    include_directories( ${Pangolin_INCLUDE_DIRS} ) 
    set(dso_pangolin_SOURCE_FILES 
      ${PROJECT_SOURCE_DIR}/src/IOWrapper/Pangolin/KeyFrameDisplay.cpp
      ${PROJECT_SOURCE_DIR}/src/IOWrapper/Pangolin/PangolinDSOViewer.cpp)
    set(HAS_PANGOLIN 1)
else ()
    message("--- could not find PANGOLIN, not compiling dso_pangolin library.")
    message("    this means there will be no 3D display / GUI available for dso_dataset.")
    set(dso_pangolin_SOURCE_FILES )
    set(HAS_PANGOLIN 0)
endif ()

# decide if we have openCV
if (OpenCV_FOUND)
    message("--- found OpenCV, compiling dso_opencv library.")
    include_directories( ${OpenCV_INCLUDE_DIRS} )
    set(dso_opencv_SOURCE_FILES 
      ${PROJECT_SOURCE_DIR}/src/IOWrapper/OpenCV/ImageDisplay_OpenCV.cpp
      ${PROJECT_SOURCE_DIR}/src/IOWrapper/OpenCV/ImageRW_OpenCV.cpp)
    set(HAS_OPENCV 1)
else ()
    message("--- could not find OpenCV, not compiling dso_opencv library.")
    message("    this means there will be no image display, and image read / load functionality.")
    set(dso_opencv_SOURCE_FILES 
      ${PROJECT_SOURCE_DIR}/src/IOWrapper/ImageDisplay_dummy.cpp
      ${PROJECT_SOURCE_DIR}/src/IOWrapper/ImageRW_dummy.cpp)
    set(HAS_OPENCV 0)
endif ()

# decide if we have ziplib.
if (LIBZIP_LIBRARY)
    message("--- found ziplib (${LIBZIP_VERSION}), compiling with zip capability.")
    add_definitions(-DHAS_ZIPLIB=1)
    include_directories( ${LIBZIP_INCLUDE_DIR_ZIP} ${LIBZIP_INCLUDE_DIR_ZIPCONF} ) 
else()
    message("--- not found ziplib (${LIBZIP_LIBRARY}), compiling without zip capability.")
    set(LIBZIP_LIBRARY "")
endif()


# compile main library.
include_directories( ${CSPARSE_INCLUDE_DIR} ${CHOLMOD_INCLUDE_DIR}) 
cuda_add_library(dso SHARED ${dso_SOURCE_FILES} ${dso_opencv_SOURCE_FILES} ${dso_pangolin_SOURCE_FILES} 
${PROJECT_SOURCE_DIR}/src/teste.cu
)

#set_property( TARGET dso APPEND_STRING PROPERTY COMPILE_FLAGS -Wall )


if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") # OSX
    set(BOOST_THREAD_LIBRARY boost_thread-mt)
else()
    set(BOOST_THREAD_LIBRARY boost_thread)
endif()

# build main executable (only if we have both OpenCV and Pangolin)
if (OpenCV_FOUND AND Pangolin_FOUND)
    message("--- compiling dso_dataset.")
    add_executable(dso_dataset ${PROJECT_SOURCE_DIR}/src/main_dso_pangolin.cpp)
    target_link_libraries(dso_dataset dso boost_system cxsparse ${BOOST_THREAD_LIBRARY} ${LIBZIP_LIBRARY} ${Pangolin_LIBRARIES} ${OpenCV_LIBS})
else()
    message("--- not building dso_dataset, since either don't have openCV or Pangolin.")
endif()

unset(CMAKE_RUNTIME_OUTPUT_DIRECTORY)

So, 'main_dso_pangolin.cpp' is my main file. At this point, with only these changes, the code compiles. But I wanted to check whether I could write some CUDA code, so I created a 'teste.cu' file that has the same code as one of the CUDA samples, like this:

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

// CUDA runtime
#include </usr/local/cuda-9.0/include/cuda_runtime.h>
#include <cuda.h>

// helper functions and utilities to work with CUDA
#include </usr/local/cuda-9.0/samples/common/inc/helper_functions.h>
#include </usr/local/cuda-9.0/samples/common/inc/helper_cuda.h>

__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
{
    // __shared__ float shared[2 * blockDim.x];
    extern __shared__ float shared[];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    if (tid == 0) timer[bid] = clock();

    // Copy input.
    shared[tid] = input[tid];
    shared[tid + blockDim.x] = input[tid + blockDim.x];

    // Perform reduction to find minimum.
    for (int d = blockDim.x; d > 0; d /= 2)
    {
        __syncthreads();

        if (tid < d)
        {
            float f0 = shared[tid];
            float f1 = shared[tid + d];

            if (f1 < f0)
            {
                shared[tid] = f1;
            }
        }
    }

    // Write result.
    if (tid == 0) output[bid] = shared[0];

    __syncthreads();

    if (tid == 0) timer[bid+gridDim.x] = clock();
}

#define NUM_BLOCKS    64
#define NUM_THREADS   256

void xx(int argc, char** argv){

    printf("CUDA Clock sample\n");

    // This will pick the best possible CUDA capable device
    int dev = findCudaDevice(argc, (const char **)argv);

    float *dinput = NULL;
    float *doutput = NULL;
    clock_t *dtimer = NULL;

    clock_t timer[NUM_BLOCKS * 2];
    float input[NUM_THREADS * 2];

    for (int i = 0; i < NUM_THREADS * 2; i++)
    {
        input[i] = (float)i;
    }

    checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
    checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
    checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));

    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));

    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 *NUM_THREADS>>>(dinput, doutput, dtimer);

    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));

    checkCudaErrors(cudaFree(dinput));
    checkCudaErrors(cudaFree(doutput));
    checkCudaErrors(cudaFree(dtimer));

    long double avgElapsedClocks = 0;

    for (int i = 0; i < NUM_BLOCKS; i++)
    {
        avgElapsedClocks += (long double) (timer[i + NUM_BLOCKS] - timer[i]);
    }

    avgElapsedClocks = avgElapsedClocks/NUM_BLOCKS;
    printf("Average clocks/block = %Lf\n", avgElapsedClocks);

}

And in my main, the first thing I do is call this function. This time, when I run 'cmake' and 'make' I get errors like:

/home/cesar/Documents/dso/src/teste.cu:18:21: error: ‘threadIdx’ was not declared in this scope
     const int tid = threadIdx.x;

/home/cesar/Documents/dso/src/teste.cu:19:21: error: ‘blockIdx’ was not declared in this scope
     const int bid = blockIdx.x;

I've installed the CUDA Toolkit correctly, but here is the version:

cesar@cesar-X550JX:/usr/local/cuda/bin$ /usr/local/cuda/bin/nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2017 NVIDIA Corporation
Built on Fri_Sep__1_21:08:03_CDT_2017
Cuda compilation tools, release 9.0, V9.0.176

What do you think I'm doing wrong or missing? I'm having a lot of difficulty adapting the CMakeLists.txt due to its complexity and well-defined structure.

--- EDIT ---

Running with make -j VERBOSE=1 I get these messages, which tell me that a regular C++ compiler is being used:

/usr/bin/c++  -fPIC  -O3 -g -std=c++11 -D_FORCE_INLINES  -shared -Wl,-soname,libdso.so -o lib/libdso.so CMakeFiles/dso.dir/src/FullSystem/FullSystem.cpp.o CMakeFiles/dso.dir/src/FullSystem/FullSystemOptimize.cpp.o CMakeFiles/dso.dir/src/FullSystem/FullSystemOptPoint.cpp.o CMakeFiles/dso.dir/src/FullSystem/FullSystemDebugStuff.cpp.o CMakeFiles/dso.dir/src/FullSystem/FullSystemMarginalize.cpp.o CMakeFiles/dso.dir/src/FullSystem/Residuals.cpp.o CMakeFiles/dso.dir/src/FullSystem/CoarseTracker.cpp.o CMakeFiles/dso.dir/src/FullSystem/CoarseInitializer.cpp.o CMakeFiles/dso.dir/src/FullSystem/ImmaturePoint.cpp.o CMakeFiles/dso.dir/src/FullSystem/HessianBlocks.cpp.o CMakeFiles/dso.dir/src/FullSystem/PixelSelector2.cpp.o CMakeFiles/dso.dir/src/OptimizationBackend/EnergyFunctional.cpp.o CMakeFiles/dso.dir/src/OptimizationBackend/AccumulatedTopHessian.cpp.o CMakeFiles/dso.dir/src/OptimizationBackend/AccumulatedSCHessian.cpp.o CMakeFiles/dso.dir/src/OptimizationBackend/EnergyFunctionalStructs.cpp.o CMakeFiles/dso.dir/src/util/settings.cpp.o CMakeFiles/dso.dir/src/util/Undistort.cpp.o CMakeFiles/dso.dir/src/util/globalCalib.cpp.o CMakeFiles/dso.dir/src/IOWrapper/OpenCV/ImageDisplay_OpenCV.cpp.o CMakeFiles/dso.dir/src/IOWrapper/OpenCV/ImageRW_OpenCV.cpp.o CMakeFiles/dso.dir/src/IOWrapper/Pangolin/KeyFrameDisplay.cpp.o CMakeFiles/dso.dir/src/IOWrapper/Pangolin/PangolinDSOViewer.cpp.o CMakeFiles/dso.dir/src/dso_generated_teste.cu.o /usr/local/cuda/lib64/libcudart_static.a -lpthread -ldl -lrt

[ 96%] Building CXX object CMakeFiles/dso_dataset.dir/src/main_dso_pangolin.cpp.o
/usr/bin/c++   -DENABLE_SSE -DHAS_ZIPLIB=1 -I/usr/include/opencv -I/home/cesar/Documents/dso/src -I/home/cesar/Documents/dso/thirdparty/Sophus -I/home/cesar/Documents/dso/thirdparty/sse2neon -I/usr/include/eigen3 -I/home/cesar/Documents/Pangolin/include -I/home/cesar/Documents/Pangolin/build/src/include -I/usr/local/include -I/usr/include/suitesparse -I/usr/local/cuda/include  -O3 -g -std=c++11 -D_FORCE_INLINES   -o CMakeFiles/dso_dataset.dir/src/main_dso_pangolin.cpp.o -c /home/cesar/Documents/dso/src/main_dso_pangolin.cpp

I also tried to separate the .cpp files from the .cu files, using add_library for the .cpp files and cuda_add_library for the .cu files, like this:

add_library(dso ${dso_SOURCE_FILES} ${dso_opencv_SOURCE_FILES} ${dso_pangolin_SOURCE_FILES})
cuda_add_library(my_cuda_lib ${PROJECT_SOURCE_DIR}/src/teste.cu)

And then used my_cuda_lib in target_link_libraries, like this:

target_link_libraries(dso_dataset dso boost_system cxsparse ${BOOST_THREAD_LIBRARY} ${LIBZIP_LIBRARY} ${Pangolin_LIBRARIES} ${OpenCV_LIBS} ${CUDA_LIBRARIES} my_cuda_lib)

But I still got the same errors.

--- EDIT: MCVE ---

To demonstrate my error I created a simple example. I have two simple files: my main, which is a .cpp, and my CUDA file, which is a .cu. My main just calls the function in the other file and looks like this:

#include <iostream>
#include "hello_world.cu"

using namespace std;

int main()
{

    teste();

    return 0;

}

And my .cu file looks like this:

#include <stdio.h>
#include <iostream>
// CUDA runtime
#include </usr/local/cuda-9.0/include/cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include </usr/local/cuda-9.0/samples/common/inc/helper_functions.h>
#include </usr/local/cuda-9.0/samples/common/inc/helper_cuda.h>

__global__ void kernel (void){
  extern __shared__ float shared[];

  const int tid = threadIdx.x;
  const int bid = blockIdx.x;
}

int teste( void ) {
  kernel<<<1,1>>>();
  printf( "Hello, World!\n" ); 
  return 0;
}

The CMakeLists.txt that I made to compile this looks like this:

cmake_minimum_required(VERSION 2.8)
set(CUDA_HOST_COMPILER /usr/bin/g++-5)
find_package(CUDA QUIET REQUIRED)

# Pass options to NVCC
set(
    CUDA_NVCC_FLAGS
    ${CUDA_NVCC_FLAGS};
    -O3
    )

# For compilation ...
# Specify target & source files to compile it from
cuda_add_executable(
    helloworld
    hello_world.cu
    teste.cpp
)

After running cmake and building with "cmake --build ." (I don't know why it has to be this command; normally I just do make -j, but in this example only this works), I get the same errors as in my project: ‘threadIdx’ was not declared in this scope, and the same for 'blockIdx', etc.

César Pereira
  • You should try to provide us with an [MCVE](https://stackoverflow.com/help/mcve). Basically, try to create a simple CMakeLists.txt file that will just build your CUDA file. There's a good chance you'll figure out what the problem is by yourself while trying to do that. – gflegar Apr 26 '18 at 23:38
  • In any case, how do you add your CUDA executable to the CMakeLists file? It seems to me that CMake might be trying to use a regular C++ compiler to compile your CUDA files. You can check this by running make with the verbose option (`make VERBOSE=1`), then you'll see the exact commands CMake is trying to use to compile your file. – gflegar Apr 26 '18 at 23:40
  • Another important note is that there are 2 ways to enable CUDA in a project: using the FindCUDA module (that's what you're using), or with the built-in support added in CMake 3.8. It's easy to get confused and mix the two approaches, which leads to problems. See [this question](https://stackoverflow.com/questions/49767485/cuda-cmake-3-10-cmakelists-txt/49836033#49836033) and my answer there for more details. – gflegar Apr 26 '18 at 23:44
  • Thanks for the comments @GoranFlegar. I'm a real newbie at CMake (I've never used it), and I'm trying, as you said, to write a simple CMakeLists.txt for a simple CUDA file, and I'm not even managing that. I was looking at this: [link](https://gist.github.com/ashwin/6547060), only setting the flags and using add_executable and target_link_libraries, but it's not working. I'm a zero at CMake :/ – César Pereira Apr 27 '18 at 00:16
  • When I run with make VERBOSE=1 I do in fact see that the compiler being used is the regular g++. I edited the post with the result. – César Pereira Apr 27 '18 at 00:17
  • I managed to get the simple example working, but that CMakeLists only uses find_package, sets flags, and calls cuda_add_executable. I still don't get what is wrong in the SLAM CMakeLists.txt. I'm setting the flags, including directories, linking libraries and adding the executable. – César Pereira Apr 27 '18 at 01:24
  • [mcve] should be **in the question post**, not linked. – Tsyvarev Apr 27 '18 at 08:49
  • Hello, I've added an MCVE to the post. Please check it. Thanks. – César Pereira Apr 27 '18 at 13:11

1 Answer


Since you are including the hello_world.cu file in your main code, you need that main file to be compiled with the nvcc compiler as well. To achieve this, rename teste.cpp to teste.cu (otherwise g++ will be used, and g++ does not understand CUDA built-ins like threadIdx or the <<< >>> kernel launch syntax).

Also remove 'hello_world.cu' from CMakeLists.txt (it is already included in the teste file), so that you have something like this:

cuda_add_executable(
    helloworld
    teste.cu
)

Then it should work.
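After the rename, the main file itself does not need to change at all; a sketch of what it contains (identical to the teste.cpp from the question, only the extension differs so that cuda_add_executable routes it through nvcc):

// teste.cu (formerly teste.cpp) - contents unchanged
#include <iostream>
#include "hello_world.cu"

using namespace std;

int main()
{
    teste();

    return 0;
}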

--- EDIT: Additional question ---

If you want to keep your .cpp file, then you need some kind of separation between what g++ can do for you and what nvcc should do. So you can introduce an additional hello_world.h file to your project:

#ifndef HELLO_WORLD_H
#define HELLO_WORLD_H

int teste();

#endif

include it in your teste.cpp:

#include <iostream>
#include "hello_world.h"

using namespace std;

int main()
{

    teste();

    return 0;

}

and then your CMakeLists.txt looks like it did in your original example:

...
cuda_add_executable(
    helloworld
    teste.cpp
    hello_world.cu
)

In such a case hello_world.cu will be compiled with nvcc, and then the compiling and linking of teste.cpp will be done by g++ (which is possible in that case since there is no CUDA code left in teste.cpp).
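For completeness, hello_world.cu itself stays essentially as in the question. A minimal sketch, assuming the CUDA include directory is already on the include path (so the hard-coded /usr/local/cuda-9.0/... paths are not needed), and including the new header so the declaration and the definition stay in sync:

// hello_world.cu - compiled by nvcc
#include <stdio.h>
#include <cuda_runtime.h>
#include "hello_world.h"

__global__ void kernel(void)
{
    // toy kernel that only reads the CUDA built-ins
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    (void)tid;
    (void)bid;
}

int teste(void)
{
    kernel<<<1, 1>>>();
    cudaDeviceSynchronize();  // wait for the kernel before printing
    printf("Hello, World!\n");
    return 0;
}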

Krzysztof
  • You are right! It works that way. But what if I want my main file to be a .cpp file and use the other .cu files? How can I accomplish that? – César Pereira Apr 27 '18 at 14:07
  • @CésarPereira: I have edited my answer to address your question. – Krzysztof Apr 27 '18 at 14:15
  • That's perfect, exactly what I wanted. I managed to adapt it in my project and I am able to compile it with CUDA now :) Thank you so much. – César Pereira Apr 27 '18 at 14:35
  • One more thing that doesn't have to do with this: in my test I'm using cudaMalloc and it returns an Unknown Error, code 30. Do you know why? I've searched, and it could be because the nvidia_uvm module was not loaded, but mine is. – César Pereira Apr 27 '18 at 14:36
  • @CésarPereira: I have no idea; it seems that similar problems are caused by incompatible CUDA vs GPU driver versions. I think the best approach here would be to try to compile some of the examples that are provided with CUDA (in the samples dir), for example deviceQuery. If that does not work, you probably want to first re-check and/or update your drivers and CUDA version. – Krzysztof Apr 27 '18 at 20:48
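A quick way to run that driver/runtime check without building the whole samples tree is a tiny standalone program along these lines (a sketch in the spirit of deviceQuery, using only the basic runtime API):

// cuda_check.cu - minimal sanity check for driver/runtime compatibility
#include <stdio.h>
#include <cuda_runtime.h>

int main()
{
    int driverVersion = 0, runtimeVersion = 0, deviceCount = 0;
    cudaDriverGetVersion(&driverVersion);
    cudaRuntimeGetVersion(&runtimeVersion);
    cudaError_t err = cudaGetDeviceCount(&deviceCount);

    printf("driver version: %d, runtime version: %d\n", driverVersion, runtimeVersion);
    if (err != cudaSuccess)
        printf("cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
    else
        printf("found %d CUDA device(s)\n", deviceCount);

    return 0;
}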