I am trying to do some benchmarking to ensure using CUDA's Unified Memory(UM) approach will not hurt us wrt performance.
I am performing an FFT. One way i use UM, one way i use the cudaMalloc
I compare the results afterwards and they all match up (which is good).
however, the timing i'm getting for the UM approach is ~.5ms vs the cudaMalloc way of ~.04 (after performing the run multiple times an averaging)
I am using Event records to do the timing. I have one right before and after the cufftExecC2C call.
Furthermore, I added two more event records to measure the time before any memory transfer to the device, and after using the data once i get it back from the device.
when doing this, i see the UM approach take ~1.6ms and the cudaMalloc approach taking ~.7.
Below is a snippet of code that does the UM approach:
cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);
cudaMallocManaged(&inData, dataSize * sizeof(cufftComplex));
cudaMallocManaged(&outData, dataSize * sizeof(cufftComplex));
cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);
setupWave(dataSize, inData);
cudaEventRecord(start_before_memHtoD);
cudaMemPrefetchAsync(inData, dataSize * sizeof(cufftComplex), 1);
cudaDeviceSynchronize();
cudaEventRecord(start_kernel);
cufftExecC2C(plan, inData, outData, CUFFT_FORWARD);
cudaEventRecord(stop_kernel);
cudaEventSynchronize(stop_kernel);
float sum = 0;
for (int i = 0; i < dataSize; i++) {
sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);
std::cout << "sum for UM is " << sum << std::endl;
float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
stop_after_memDtoH);
resultString_um += std::to_string(dataSize) + " samples took "
+ std::to_string(umTime) + "ms, Overall: "
+ std::to_string(overallUmTime) + "\n";
cudaFree(outData);
cudaFree(inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);
cufftDestroy(plan);
The following is for the cudaMalloc approach
cufftComplex *d_inData;
cufftComplex *d_outData;
inData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
outData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
cudaMalloc((void**) (&d_inData), dataSize * sizeof(cufftComplex));
cudaMalloc((void**) (&d_outData), dataSize * sizeof(cufftComplex));
cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);
cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);
setupWave(dataSize, inData);
cudaEventRecord(start_before_memHtoD);
cudaMemcpy(d_inData, inData, dataSize * sizeof(cufftComplex),
cudaMemcpyHostToDevice);
cudaEventRecord(start_kernel);
cufftExecC2C(plan, d_inData, d_outData, CUFFT_FORWARD);
cudaEventRecord(stop_kernel);
cudaEventSynchronize(stop_kernel);
cudaMemcpy(outData, d_outData, dataSize * sizeof(cufftComplex),
cudaMemcpyDefault);
cudaEventRecord(stop_after_memDtoH);
float sum = 0;
for (int i = 0; i < dataSize; i++) {
sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);
std::cout << "sum for UM is " << sum << std::endl;
float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
stop_after_memDtoH);
resultString_um += std::to_string(dataSize) + " samples took "
+ std::to_string(umTime) + "ms, Overall: "
+ std::to_string(overallUmTime) + "\n";
cudaFree(outData);
cudaFree(inData);
cudaFree(d_outData);
cudaFree(d_inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);
cufftDestroy(plan);
Is there something else I could be doing when using the unified memory approach to speed it up? I expected UM to be slower, but not by this much.
We are using the P100 on redhat 7.3 with Cuda 9