I ran the following code in cufft (cuda 9) (Nvidia 1080). The code is same for all execution. However, the execution time (below the code) varies a lot. Can anyone please describe how to get the lowest time always and the reason behind this behavior?
int NX 2048
int BATCH 96
cufftHandle plan;
cufftHandle rev_plan;
cufftDoubleReal *idata;
cufftDoubleComplex *odata;
int BLOCKSIZE = 1024;
int gridSize = (NX * BATCH)/BLOCKSIZE;
cufftPlan1d(&plan, NX, CUFFT_D2Z, BATCH);
cufftPlan1d(&rev_plan, NX, CUFFT_Z2D, BATCH);
cudaMalloc((void **) &idata, sizeof(cufftDoubleReal) * NX * BATCH);
cudaMalloc((void **) &odata, sizeof(cufftDoubleComplex) * (NX / 2 + 1) * BATCH);
inputData << < gridSize, BLOCKSIZE >> > (idata, NX * BATCH);
double sT = omp_get_wtime();
for (int i = 0; i < 500; ++i) {
cufftExecD2Z(plan, idata, odata);
cufftExecZ2D(plan, odata, idata);
}
printf("Time taken: %f\n", omp_get_wtime() - sT);
sT = omp_get_wtime();
for (int i = 0; i < 500; ++i) {
cufftExecD2Z(plan, idata, odata);
cufftExecZ2D(plan, odata, idata);
}
printf("Time taken: %f\n", omp_get_wtime() - sT);
sT = omp_get_wtime();
for (int i = 0; i < 500; ++i) {
cufftExecD2Z(plan, idata, odata);
cufftExecZ2D(plan, odata, idata);
}
printf("Time taken: %f\n", omp_get_wtime() - sT);
sT = omp_get_wtime();
for (int i = 0; i < 500; ++i) {
cufftExecD2Z(plan, idata, odata);
cufftExecZ2D(plan, odata, idata);
}
printf("Time taken: %f\n", omp_get_wtime() - sT);
cudaFree(idata);
cudaFree(odata);
Time taken: 0.004334 Time taken: 0.022906 Time taken: 0.027820 Time taken: 0.027786