I have an OpenMP code segment (parallel region). I want to add inline PTX assembly at the indicated line (shown below)
I am using Nvidia gtx1070 gpu and ubuntu Linux and clang compiler. I tried asm volatile()
method, but clang compiler throws compile error (shown below)
For example, If I call the following function warp_id() inside a CUDA kernel, we can extract the warp_id of the current thread execution:
__forceinline__ __device__ unsigned warp_id()
{
unsigned ret;
asm volatile ("mov.u32 %0, %warpid;" : "=r"(ret));
return ret;
}
I would like to do the same but inside an OpenMP parallel region.
#pragma omp target teams distribute num_teams(15)
for(int i=0; i<R; i++)
{
#pragma omp parallel for default(shared) schedule(auto)
for(int j = 0; j < C; j++)
{
// extract warpid of this current thread iteration
}
}
Clang Error:
"/usr/local/cuda-9.2/bin/ptxas" -m64 -O3 -v --gpu-name sm_61 --output-file /tmp/openmp_ptxasm-1fb0b2.cubin /tmp/openmp_ptxasm-07f1f7.s -c
ptxas info : 59 bytes gmem
ptxas info : Compiling entry function '__omp_offloading_817_6600b8_main_l32' for 'sm_61'
ptxas info : Function properties for __omp_offloading_817_6600b8_main_l32
56 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 72 registers, 416 bytes cmem[0], 4 bytes cmem[2]
ptxas info : Function properties for _ZN27omptarget_nvptx_LoopSupportIllE13dispatch_nextEPiPlS2_S2_
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
"/usr/local/cuda-9.2/bin/nvlink" -o /tmp/openmp_ptxasm-0376ca.out -v -arch sm_61 -L. -L/home/mglab1/Desktop/llvm_new/install/lib -L/usr/local/cuda-9.2/lib64 -L/home/mglab1/Desktop/llvm_new/build/lib -lomptarget-nvptx /tmp/openmp_ptxasm-1fb0b2.cubin
nvlink info : 703720068 bytes gmem
nvlink info : Function properties for '__omp_offloading_817_6600b8_main_l32':
nvlink info : used 72 registers, 56 stack, 1084 bytes smem, 416 bytes cmem[0], 4 bytes cmem[2], 0 bytes lmem
"/home/mglab1/Desktop/llvm_new/build/bin/clang-7" -cc1 -triple x86_64-unknown-linux-gnu -emit-obj -disable-free -disable-llvm-verifier -discard-value-names -main-file-name openmp_ptxasm.cpp -mrelocation-model static -mthread-model posix -fmath-errno -masm-verbose -mconstructor-aliases -munwind-tables -fuse-init-array -target-cpu x86-64 -dwarf-column-info -debugger-tuning=gdb -momit-leaf-frame-pointer -v -resource-dir /home/mglab1/Desktop/llvm_new/build/lib/clang/7.0.0 -O3 -Wall -fdebug-compilation-dir /home/mglab1/Codes/cuda -ferror-limit 19 -fmessage-length 80 -fopenmp -fobjc-runtime=gcc -fdiagnostics-show-option -fcolor-diagnostics -vectorize-loops -vectorize-slp -o /tmp/openmp_ptxasm-34c034.o -x ir /tmp/openmp_ptxasm-8ccc6a.bc -fopenmp-targets=nvptx64-nvidia-cuda -faddrsig
clang -cc1 version 7.0.0 based upon LLVM 7.0.0 default target x86_64-unknown-linux-gnu
<inline asm>:1:10: error: invalid register name
mov.u32 %0, %smid;
^~
error: cannot compile inline asm
1 error generated.
This error was generated if I use inline PTX in the inner loop like this:
#pragma omp parallel for default(shared) schedule(auto)
for(int j = 0; j <C; j++)
{
int thread_id = omp_get_thread_num();
unsigned int wid;
asm volatile("mov.u32 %0, %%warpid;" : "=r"(wid));
printf("Iteration= c[ %d ][ %d ], Team=%d, Thread=%d, warp=%d\n",n, j, team_id, thread_id, wid);
S[n][j] = A[n][j] * B[n][j];
}
Compile cmd: clang++ -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_61 -Wall -O3 openmp.cpp