I use CUDA 6.5 and 4 Kepler GPUs.
I use multithreading with the CUDA runtime API and access the CUDA contexts from different CPU threads (using OpenMP, but that does not really matter).
When I call
cudaDeviceSynchronize();
will it wait only for the kernel(s) launched in the current CUDA context, i.e. the one selected by the latest call to cudaSetDevice(), or in all CUDA contexts? And if it waits for kernel(s) in all CUDA contexts, does that mean all CUDA contexts used by the current CPU thread (for example, CPU thread_0 would wait for GPUs 0 and 1), or all CUDA contexts in general (CPU thread_0 would wait for GPUs 0, 1, 2 and 3)?
Here is the code:
// Using OpenMP requires the compiler flags:
// MSVS option: -Xcompiler "/openmp"
// GCC option:  -Xcompiler -fopenmp
#include <omp.h>
#include <cuda_runtime.h>

int main() {
    // Execute two CPU threads: omp_get_thread_num() = 0 and 1
    #pragma omp parallel num_threads(2)
    {
        int omp_threadId = omp_get_thread_num();

        // CPU thread 0
        if (omp_threadId == 0) {
            cudaSetDevice(0);
            kernel_0<<<...>>>(...);
            cudaSetDevice(1);
            kernel_1<<<...>>>(...);
            cudaDeviceSynchronize(); // which kernel(s) will this wait for?

        // CPU thread 1
        } else if (omp_threadId == 1) {
            cudaSetDevice(2);
            kernel_2<<<...>>>(...);
            cudaSetDevice(3);
            kernel_3<<<...>>>(...);
            cudaDeviceSynchronize(); // which kernel(s) will this wait for?
        }
    }
    return 0;
}
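In case cudaDeviceSynchronize() only waits for the currently selected device, would the correct pattern be to re-select each device this thread launched work on and synchronize it explicitly? A minimal sketch of that fallback is below; dummyKernel and launchAndSyncPerDevice are placeholder names of mine, standing in for the real kernels and launch configurations above:

// Sketch: explicitly synchronize every device used by this CPU thread,
// assuming cudaDeviceSynchronize() only affects the device selected by cudaSetDevice().
#include <cuda_runtime.h>

__global__ void dummyKernel() { } // placeholder for kernel_0 / kernel_1 / ...

void launchAndSyncPerDevice(const int* devices, int numDevices) {
    // Launch one kernel on each device used by this CPU thread.
    for (int i = 0; i < numDevices; ++i) {
        cudaSetDevice(devices[i]);
        dummyKernel<<<1, 1>>>();
    }
    // Re-select and synchronize each device so no kernel on another device is missed.
    for (int i = 0; i < numDevices; ++i) {
        cudaSetDevice(devices[i]);
        cudaDeviceSynchronize();
    }
}

int main() {
    const int devicesThread0[] = { 0, 1 }; // devices used by CPU thread 0 in the question
    launchAndSyncPerDevice(devicesThread0, 2);
    return 0;
}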