I am currently learning CUDA streams by computing the dot product of two vectors. The ingredients are a kernel function that takes vectors x and y and fills a vector result of size equal to the number of blocks, where each block contributes its own partial sum.
I also have a host function dot_gpu that launches the kernel and reduces the vector result to the final dot product value.
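For context, a rough sketch of the kernel (the exact code matters less than the fact that each block writes one partial sum; this version assumes blockSize is a power of two and uses a standard shared-memory tree reduction):

__global__ void dot_gpu_kernel(const double* x, const double* y, double* result, int n)
{
    extern __shared__ double buf[];  // blockSize doubles, sized at launch
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // each thread accumulates its share of x[i] * y[i] in a grid-stride loop
    double sum = 0.0;
    for (; i < n; i += gridDim.x * blockDim.x)
        sum += x[i] * y[i];
    buf[threadIdx.x] = sum;
    __syncthreads();

    // tree reduction within the block
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s)
            buf[threadIdx.x] += buf[threadIdx.x + s];
        __syncthreads();
    }

    // one partial sum per block
    if (threadIdx.x == 0)
        result[blockIdx.x] = buf[0];
}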
The synchronous version does just this:
// copy to device
copy_to_device<double>(x_h, x_d, n);
copy_to_device<double>(y_h, y_d, n);
// kernel
double result = dot_gpu(x_d, y_d, n, blockNum, blockSize);
while the async one looks like this:
double result[numChunks];
for (int i = 0; i < numChunks; i++) {
    int offset = i * chunkSize;
    // copy this chunk to the device on its own stream
    copy_to_device_async<double>(x_h + offset, x_d + offset, chunkSize, stream[i]);
    copy_to_device_async<double>(y_h + offset, y_d + offset, chunkSize, stream[i]);
    // kernel
    result[i] = dot_gpu(x_d + offset, y_d + offset, chunkSize, blockNum, blockSize, stream[i]);
}
for (int i = 0; i < numChunks; i++) {
    finalResult += result[i];
    cudaStreamDestroy(stream[i]);
}
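(Setup not shown above: the streams are created beforehand, and x_h and y_h are assumed to be pinned, since cudaMemcpyAsync silently degrades to a synchronous copy for pageable host memory. Something like:)

// hypothetical setup code
cudaStream_t stream[numChunks];
for (int i = 0; i < numChunks; i++)
    cudaStreamCreate(&stream[i]);

// pinned host buffers so the async copies can actually overlap
double* x_h = malloc_host_pinned<double>(n);
double* y_h = malloc_host_pinned<double>(n);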
I am getting worse performance with streams and have been trying to investigate why. I tried to pipeline the copies to the device, the kernel calls and the copies back, but with no results.
// accumulate the result of each block into a single value
double dot_gpu(const double* x, const double* y, int n, int blockNum, int blockSize, cudaStream_t stream = NULL)
{
    double* result = malloc_device<double>(blockNum);
    dot_gpu_kernel<<<blockNum, blockSize, blockSize * sizeof(double), stream>>>(x, y, result, n);
#if ASYNC
    // copy the partial sums back on the same stream, then wait for the copy to finish
    double* r = malloc_host_pinned<double>(blockNum);
    copy_to_host_async<double>(result, r, blockNum, stream);
    CudaEvent copyResult;
    copyResult.record(stream);
    copyResult.wait();
#else
    double* r = malloc_host<double>(blockNum);
    copy_to_host<double>(result, r, blockNum);
#endif
    // reduce the per-block partial sums on the host
    double dotProduct = 0.0;
    for (int i = 0; i < blockNum; i++) {
        dotProduct += r[i];
    }
    cudaFree(result);
#if ASYNC
    cudaFreeHost(r);
#else
    free(r);
#endif
    return dotProduct;
}
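For reference, these are the spots in dot_gpu that I believe make the host wait (assuming CudaEvent::wait wraps cudaEventSynchronize):

double* result = malloc_device<double>(blockNum); // cudaMalloc blocks the calling host thread
copyResult.wait();                                // blocks the host until the D2H copy completes
cudaFree(result);                                 // implicitly synchronizes with the device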
My guess is that the problem is inside the dot_gpu() function, which doesn't only call the kernel. Tell me if I understand the following stream execution correctly:
foreach stream {
    cudaMemcpyAsync( device[stream], host[stream], ... stream );
    LaunchKernel<<< ... stream >>>( ... );
    cudaMemcpyAsync( host[stream], device[stream], ... stream );
}
The host executes all three instructions without blocking, since cudaMemcpyAsync and the kernel launch return immediately (on the GPU, however, they execute sequentially because they are issued to the same stream). So the host moves on to the next stream (even if the previous stream is at who-knows-what stage of execution, it's doing its job on the GPU) and issues the same three instructions again without blocking, and so on. However, my code blocks the host before it can move on to the next stream, somewhere inside the dot_gpu() function. Is it because I am allocating and freeing memory there, as well as reducing the array returned by the kernel to a single value?
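For comparison, this is the shape I was expecting to end up with: all allocation hoisted out of the loop and the host-side reduction deferred until every stream has its work queued (a sketch, not my current code; partial and r_all are names I made up here):

// allocate once, before the loop
double* partial = malloc_device<double>(numChunks * blockNum);
double* r_all = malloc_host_pinned<double>(numChunks * blockNum);

for (int i = 0; i < numChunks; i++) {
    int offset = i * chunkSize;
    copy_to_device_async<double>(x_h + offset, x_d + offset, chunkSize, stream[i]);
    copy_to_device_async<double>(y_h + offset, y_d + offset, chunkSize, stream[i]);
    dot_gpu_kernel<<<blockNum, blockSize, blockSize * sizeof(double), stream[i]>>>(
        x_d + offset, y_d + offset, partial + i * blockNum, chunkSize);
    copy_to_host_async<double>(partial + i * blockNum, r_all + i * blockNum, blockNum, stream[i]);
}

// block the host only once, after all streams have been filled
cudaDeviceSynchronize();

double finalResult = 0.0;
for (int i = 0; i < numChunks * blockNum; i++)
    finalResult += r_all[i];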