I am having a performance problem with CUDA. When I run a simple sample code on a Titan X and a Titan V card, the running times are as expected:
Titan X: 0.269299 ms
Titan V: 0.111766 ms
Now, when I add a second kernel to the code, one that uses dynamic parallelism, but never call or use it at all, performance on the Volta GPU drops drastically, while the other card is unaffected:
Titan X: 0.270602 ms
Titan V: 1.999299 ms
It is important to emphasize that this second kernel is never used at all; it just sits next to the rest of the code, i.e., it is only compiled together with it. One can also comment out the recursive kernel launch along with the stream creation (the three lines marked below) and see that the running times on Volta become good again. I suspect that the mere presence of dynamic parallelism has a negative effect on the code, even when it is never used at runtime. Any ideas on how to approach this problem?
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
__global__ void MetNormal(int *a_d,const int N );
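// This kernel is never launched from the host; its mere presence in the
// compiled binary is what seems to change the timing on the Titan V.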
__global__ void rec(int *a_d, int Nm, int xi, int yi, int zi, const int BlockSize){
    int x = blockIdx.x*blockDim.x + threadIdx.x + xi;
    int y = blockIdx.y*blockDim.y + threadIdx.y + yi;
    int z = blockIdx.z*blockDim.z + threadIdx.z + zi;
    int Nbloques = (Nm + BlockSize - 1)/BlockSize;
    dim3 b(BlockSize, BlockSize, BlockSize);
    dim3 g(Nbloques, Nbloques, Nbloques);
    cudaStream_t s1;                                        // line of code to comment
    cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);  // line of code to comment
    rec<<<g,b,0,s1>>>(a_d, Nm, xi, yi, zi, BlockSize);      // line of code to comment
}
__global__ void MetNormal(int *a_d, const int N){
    int x = blockIdx.x*blockDim.x + threadIdx.x;
    int y = blockIdx.y*blockDim.y + threadIdx.y;
    int z = blockIdx.z*blockDim.z + threadIdx.z;
    int ind = z*N*N + y*N + x; // linear index into the N^3 array
    a_d[ind] = 1;
}
int main(int argc, char **argv){
    if (argc != 4){
        fprintf(stderr, "Error, run program as ./prog N rep device\n");
        exit(EXIT_FAILURE);
    }
    unsigned long N = atoi(argv[1]);
    unsigned long rep = atoi(argv[2]);
    cudaSetDevice(atoi(argv[3]));
    int *a, *a_d;
    int xi = 0, yi = 0, zi = 0; // unused here; they are the offset arguments of the rec kernel
    int BSize = 8;
    a = (int*)malloc(sizeof(int)*N*N*N);
    cudaMalloc((void **)&a_d, N*N*N*sizeof(int));
    dim3 Bloque(BSize, BSize, BSize);
    dim3 GridBruto((N+BSize-1)/BSize, (N+BSize-1)/BSize, (N+BSize-1)/BSize);
    // initialize the host array to zero
    for(int i = 0; i < N; i++){
        for(int j = 0; j < N; j++){
            for(int k = 0; k < N; k++){
                a[N*N*k + i*N + j] = 0;
            }
        }
    }
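    // Time rep launches of MetNormal with CUDA events and report the average per launch.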
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaMemcpy(a_d, a, N*N*N*sizeof(int), cudaMemcpyHostToDevice);
    cudaEventRecord(start);
    for(int i = 0; i < rep; i++){
        MetNormal<<<GridBruto,Bloque>>>(a_d, N);
        cudaDeviceSynchronize();
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("Time %f ms\n", milliseconds/rep);
    fflush(stdout);
    cudaMemcpy(a, a_d, N*N*N*sizeof(int), cudaMemcpyDeviceToHost);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(a_d);
    free(a);
    return 0;
}
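For reference, the "good again" variant simply comments out the three marked lines inside rec:

//cudaStream_t s1;
//cudaStreamCreateWithFlags(&s1,cudaStreamNonBlocking);
//rec<<<g,b,0,s1>>>(a_d,Nm,xi,yi,zi,BlockSize);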
compilation line:
nvcc -O3 -std=c++11 -lm -arch sm_60 -rdc=true -lcudadevrt prog.cu -o prog
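run line (N, rep and device are placeholders; the values below are just an example, not the ones from my timings):
./prog 128 100 0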