While investigating some issues with relocatable device code, I stumbled upon something I don't quite understand.
This is a use case for what is pictured on slide 6. I used an answer of Robert Crovella as a basis for a repro code. The idea is that we have some relocatable device code compiled into a static library (e.g. some math/toolbox library), and we want to use some functions of that precompiled library into another device library of our program:
libutil.a ---> libtest.so ---> test_pgm
Let's say that this external library contains the following function:
__device__ int my_square (int a);
libutil.a
was for instance generated as follow (in another project):
nvcc ${NVCC_FLAGS} -dc util.cu
nvcc ${NVCC_FLAGS} -dlink util.o -o util_dlink.o
nvcc ${NVCC_FLAGS} -lib util_dlink.o util.o -o libutil.a
Then, in our project, to generate libtest.so
:
nvcc ${NVCC_FLAGS} -dc test.cu
nvcc ${NVCC_FLAGS} -dlink test.o libutil.a -o test_dlink.o
g++ -shared -Wl,-soname,libtest.so -o libtest.so test.o test_dlink.o libutil.a -L${CUDA_LIBDIR} -lcudart
But I get the following error when generating test_dlink.o
:
nvlink error : Undefined reference to '_Z9my_squarei' in 'test.o'
The linker does not find our dummy my_square(int)
function. If we instead use (assuming we had access to util.o
):
nvcc ${NVCC_FLAGS} -dlink test.o util.o -o test_dlink.o
The linker succeeds and everything works fine after.
Investigating further:
$ nm -C libutil.a
util_dlink.o:
U atexit
U __cudaRegisterFatBinary
0000000000000015 T __cudaRegisterLinkedBinary_39_tmpxft_0000106a_00000000_6_util_cpp1_ii_843d693d
...
util.o:
U __cudaInitModule
U __cudaRegisterLinkedBinary_39_tmpxft_0000106a_00000000_6_util_cpp1_ii_843d693d
...
0000000000000015 T my_square(int)
...
The symbol is there in the archived util.o
, but nvlink
(called by nvcc
) does not seem to find it. Why is that? According to the official documentation:
The device linker has the ability to read the static host library formats (.a on Linux and Mac, .lib on Windows).
We could of course extract the object file and link with it:
ar x libutil.a `ar t libutil.a | grep -v "dlink"`
nvcc ${NVCC_FLAGS} -dlink test.o util.o -o test_dlink.o
But this does not feel like the expected solution... So what am I missing here? Another nvcc
option that solves that? Is there an error when generating libutil.a
and/or libtest.so
?
Note that this was tested with CUDA 6.5 on Arch Linux.
EDIT: fixed repro code with commented lines
Makefile
NVCC_FLAGS=-m64 -arch=sm_20 -Xcompiler '-fPIC'
CUDA_LIBDIR=${CUDA_HOME}/lib64
testmain : main.cpp libtest.so
g++ -c main.cpp
g++ -o testmain -L. -ldl -Wl,-rpath,. -ltest -L${CUDA_LIBDIR} -lcudart main.o
libutil.a : util.cu util.cuh
nvcc ${NVCC_FLAGS} -dc util.cu
# ---> FOLLOWING LINES THAT WERE WRONG <---
# nvcc ${NVCC_FLAGS} -dlink util.o -o util_dlink.o
# nvcc ${NVCC_FLAGS} -lib util.o util_dlink.o -o libutil.a
# INSTEAD:
nvcc ${NVCC_FLAGS} -lib util.o -o libutil.a
# Assuming util is an external library, so util.o is not available
rm util.o
libtest.so : test.cu test.h libutil.a util.cuh
nvcc ${NVCC_FLAGS} -dc test.cu
# Use NVCC for device linking + G++
nvcc -v ${NVCC_FLAGS} -dlink test.o libutil.a -o test_dlink.o
g++ -shared -o libtest.so test.o test_dlink.o libutil.a -L${CUDA_LIBDIR} -lcudart
# Or let NVCC generate the shared library
#nvcc -v ${NVCC_FLAGS} -shared -L. -lutil test.o -o libtest.so
clean :
rm -f testmain *.o *.a *.so
test.h
#ifndef TEST_H
# define TEST_H
int my_test_func();
#endif //! TEST_H
test.cu
#include <stdio.h>
#include "test.h"
#include "util.cuh"
#define DSIZE 1024
#define DVAL 10
#define SQVAL 3
#define nTPB 256
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void my_kernel(int *data){
int idx = threadIdx.x + (blockDim.x *blockIdx.x);
if (idx < DSIZE) data[idx] =+ DVAL + my_square (SQVAL);
}
int my_test_func()
{
int *d_data, *h_data;
h_data = (int *) malloc(DSIZE * sizeof(int));
if (h_data == 0) {printf("malloc fail\n"); exit(1);}
cudaMalloc((void **)&d_data, DSIZE * sizeof(int));
cudaCheckErrors("cudaMalloc fail");
for (int i = 0; i < DSIZE; i++) h_data[i] = 0;
cudaMemcpy(d_data, h_data, DSIZE * sizeof(int), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy fail");
my_kernel<<<((DSIZE+nTPB-1)/nTPB), nTPB>>>(d_data);
cudaDeviceSynchronize();
cudaCheckErrors("kernel");
cudaMemcpy(h_data, d_data, DSIZE * sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2");
for (int i = 0; i < DSIZE; i++)
if (h_data[i] != DVAL + SQVAL*SQVAL)
{
printf("Results check failed at offset %d, data was: %d, should be %d\n",
i, h_data[i], DVAL);
exit(1);
}
printf("Results check passed!\n");
return 0;
}
util.cuh
#ifndef UTIL_CUH
# define UTIL_CUH
__device__ int my_square (int a);
#endif //! UTIL_CUH
util.cu
#include "util.cuh"
__device__ int my_square (int a)
{
return a * a;
}
main.cpp
#include "test.h"
int main()
{
my_test_func();
return 0;
}