I am having a performance problem with CUDA. When I run a simple sample code on a Titan X and a Titan V card, the running times are as expected:
Titan X: 0.269299 ms
Titan V: 0.111766 ms
Now, when I add a second kernel to the code, one that uses dynamic parallelism, but never call or use it at all, performance on the Volta GPU drops drastically, while the other card is unaffected:
Titan X: 0.270602 ms
Titan V: 1.999299 ms
It is important to emphasize that this second kernel is never used at all; it just sits next to the rest of the code, i.e., it is only compiled together with it. One can also comment out the recursive kernel launch along with the stream creation (the three lines marked below) and see that the running times on Volta become good again. I suspect that the mere presence of dynamic parallelism has a negative effect on the code, even when it is never used at runtime. Any ideas on how to approach this problem?
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
__global__ void MetNormal(int *a_d,const int N );
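// This kernel is never launched from the host; its mere presence in the
// compiled binary is what seems to change the timing on the Titan V.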
__global__ void rec(int *a_d, int Nm, int xi, int yi, int zi, const int BlockSize){
    int x = blockIdx.x*blockDim.x + threadIdx.x + xi;
    int y = blockIdx.y*blockDim.y + threadIdx.y + yi;
    int z = blockIdx.z*blockDim.z + threadIdx.z + zi;
    int Nbloques = (Nm + BlockSize - 1)/BlockSize;
    dim3 b(BlockSize, BlockSize, BlockSize);
    dim3 g(Nbloques, Nbloques, Nbloques);
    cudaStream_t s1;                                        // line of code to comment
    cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);  // line of code to comment
    rec<<<g,b,0,s1>>>(a_d, Nm, xi, yi, zi, BlockSize);      // line of code to comment
}
__global__ void MetNormal(int *a_d, const int N){
    int x = blockIdx.x*blockDim.x + threadIdx.x;
    int y = blockIdx.y*blockDim.y + threadIdx.y;
    int z = blockIdx.z*blockDim.z + threadIdx.z;
    int ind = z*N*N + y*N + x; // linear index into the N^3 array
    a_d[ind] = 1;
}
int main(int argc, char **argv){
    if (argc != 4){
        fprintf(stderr, "Error, run program as ./prog N rep device\n");
        exit(EXIT_FAILURE);
    }
    unsigned long N = atoi(argv[1]);
    unsigned long rep = atoi(argv[2]);
    cudaSetDevice(atoi(argv[3]));
    int *a, *a_d;
    int xi = 0, yi = 0, zi = 0; // unused here; they are the offset arguments of the rec kernel
    int BSize = 8;
    a = (int*)malloc(sizeof(int)*N*N*N);
    cudaMalloc((void **)&a_d, N*N*N*sizeof(int));
    dim3 Bloque(BSize, BSize, BSize);
    dim3 GridBruto((N+BSize-1)/BSize, (N+BSize-1)/BSize, (N+BSize-1)/BSize);
    // initialize the host array to zero
    for(int i = 0; i < N; i++){
        for(int j = 0; j < N; j++){
            for(int k = 0; k < N; k++){
                a[N*N*k + i*N + j] = 0;
            }
        }
    }
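    // Time rep launches of MetNormal with CUDA events and report the average per launch.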
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaMemcpy(a_d, a, N*N*N*sizeof(int), cudaMemcpyHostToDevice);
    cudaEventRecord(start);
    for(int i = 0; i < rep; i++){
        MetNormal<<<GridBruto,Bloque>>>(a_d, N);
        cudaDeviceSynchronize();
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("Time %f ms\n", milliseconds/rep);
    fflush(stdout);
    cudaMemcpy(a, a_d, N*N*N*sizeof(int), cudaMemcpyDeviceToHost);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(a_d);
    free(a);
    return 0;
}
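For reference, the "good again" variant simply comments out the three marked lines inside rec:

//cudaStream_t s1;
//cudaStreamCreateWithFlags(&s1,cudaStreamNonBlocking);
//rec<<<g,b,0,s1>>>(a_d,Nm,xi,yi,zi,BlockSize);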
compilation line:
nvcc -O3 -std=c++11 -lm -arch sm_60 -rdc=true -lcudadevrt prog.cu -o prog
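run line (N, rep and device are placeholders; the values below are just an example, not the ones from my timings):
./prog 128 100 0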