Why am I unable to establish a pipeline when using multiple concurrent streams in CUDA programming?

Question

I wish to construct a pipeline using multiple streams. Below is the code I have written:

using namespace std;

__global__ void vecAdd(float *c, const float *a, const float *b);

void initBuffer(float *data, int size);

int main() {
    int size = 1 << 22;
    int bufsize = size * sizeof(float);
    int nStream = 4;

    float* ha[4];
    float* hb[4];
    float* hc[4];
    float* da[4];
    float* db[4];
    float* dc[4];

    srand(2019);
    for (int i = 0; i < 4; ++i) {
        cudaMallocHost((void **) &ha[i], bufsize);
        cudaMallocHost((void **) &hb[i], bufsize);
        cudaMallocHost((void **) &hc[i], bufsize);
        initBuffer(ha[i], size);
        initBuffer(hb[i], size);

        cudaMalloc((void **) &da[i], bufsize);
        cudaMalloc((void **) &db[i], bufsize);
        cudaMalloc((void **) &dc[i], bufsize);
    }

    auto *streams = new cudaStream_t[nStream];
    for (int i = 0; i < nStream; i++) {
        cudaStreamCreate(&streams[i]);
    }

    for (int i = 0; i < nStream; i++) {
        cudaMemcpyAsync(da[i], ha[i], bufsize, cudaMemcpyHostToDevice, streams[i]);
        cudaMemcpyAsync(db[i], hb[i], bufsize, cudaMemcpyHostToDevice, streams[i]);

        dim3 dimBlock(256);
        dim3 dimGrid(size / dimBlock.x);
        vecAdd<<< dimGrid, dimBlock, 0, streams[i] >>>(dc[i], da[i], db[i]);
        cudaMemcpyAsync(hc[i], dc[i], bufsize, cudaMemcpyDeviceToHost, streams[i]);
    }

    cudaDeviceSynchronize();

    // terminate operators
    delete[] streams;

    for (int i = 0; i < 4; ++i) {

        // terminate device memories
        cudaFree(da[i]);
        cudaFree(db[i]);
        cudaFree(dc[i]);

        // terminate host memories
        cudaFreeHost(ha[i]);
        cudaFreeHost(hb[i]);
        cudaFreeHost(hc[i]);
    }

    return 0;
}

void initBuffer(float *data, const int size) {
    for (int i = 0; i < size; i++)
        data[i] = rand() / (float) RAND_MAX;
}

__global__ void vecAdd(float *c, const float *a, const float *b) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = 0; i < 200; i++)
        c[idx] = a[idx] + b[idx];
}

However, as shown in the diagram below, when I analyze it using Nsight Systems, it appears to be executed in a serial manner.

However, when I modify the code as shown below and rewrite the section related to D2H in a loop, it forms a pipeline. Why does this occur?

    auto *streams = new cudaStream_t[nStream];
    for (int i = 0; i < nStream; i++) {
        cudaStreamCreate(&streams[i]);
    }

    for (int i = 0; i < nStream; i++) {
        cudaMemcpyAsync(da[i], ha[i], bufsize, cudaMemcpyHostToDevice, streams[i]);
        cudaMemcpyAsync(db[i], hb[i], bufsize, cudaMemcpyHostToDevice, streams[i]);

        dim3 dimBlock(256);
        dim3 dimGrid(size / dimBlock.x);
        vecAdd<<< dimGrid, dimBlock, 0, streams[i] >>>(dc[i], da[i], db[i]);
//        cudaMemcpyAsync(hc[i], dc[i], bufsize, cudaMemcpyDeviceToHost, streams[i]);
    }

    for (int i = 0; i < nStream; ++i)
        cudaMemcpyAsync(hc[i], dc[i], bufsize, cudaMemcpyDeviceToHost, streams[i]);

    cudaDeviceSynchronize();

I would appreciate someone pointing out the reason behind the inability of my initial code segment to form a pipeline.

I appreciate your feedback. After considering your comments, I conducted tests on four different devices. Initially, I ran the code on a 3060 GPU with Windows. Subsequently, I performed tests on a 2080TI GPU with Linux, a 3090 GPU with Linux, and an A100 GPU with Linux. Surprisingly, I discovered that the inability to establish a pipeline only occurred in the 3060-Windows environment. This discrepancy is quite perplexing. Here are the analysis results for each device:

3060-Windows: Unable to establish a pipeline.
2080TI-Linux: Pipeline successfully established.

3090-Linux: Pipeline successfully established.

A100-Linux: Pipeline successfully established.

In both cases, you need to use cudaMallocHost instead of malloc to allocate host buffers for asynchronous copies in both directions for any transfer size. — Abator Abetor, Jun 14 '23 at 06:55
I appreciate the previous comment. However, even after switching to using cudaMallocHost for memory allocation, parallelization remains unattainable. — Aitar, Jun 14 '23 at 07:56
you can try enabling [GPU hardware scheduling](https://www.makeuseof.com/windows-10-gpu-hardware-scheduling-worth-turning-on/) in windows — Robert Crovella, Jun 14 '23 at 14:40

score 1 · Accepted Answer · answered Jun 14 '23 at 09:42

This seems to be a problem with the windows WDDM driver (used for cards that have presentation capabilities) that does batching on the commands sent to the card.
In the NVIDIA CUDA guide (section 2.4) it is recommended to set the driver in TCC mode (for cards that do only compute operations) but this means that you lose the ability to use the graphic card as a presentation device (so the card will not be able to present to the screen).

This question presents the same problem.

Why am I unable to establish a pipeline when using multiple concurrent streams in CUDA programming?

1 Answers1