
I'm trying to understand how concurrent kernel execution works, and I have written a simple program to explore it. The kernel populates a 2D array using 2 streams. I get the correct results with 1 stream (no concurrency); when I try it with 2 streams (my attempt at concurrency), I get the wrong results. I believe the problem is either in the memory transfers, which I'm not sure I have set up correctly, or in the way I have configured the kernel. The programming guide does not explain it well enough for me. For my purposes, I need MATLAB to be calling the kernel.

As I understand it, the main program will:

  • allocate the pinned memory on the host
  • allocate the memory on the GPU required for a single stream (with 2 streams, each stream gets half the total host memory)
  • create the streams
  • loop through the streams
  • copy the memory for a single stream from host to device using cudaMemcpyAsync()
  • execute the kernel for the stream
  • copy the memory for the stream back to the host using cudaMemcpyAsync()
    • I believe I'm doing the right thing by referencing the memory from the location I need for each stream, using an offset based on the size of data for each stream and the stream number.
  • destroy the streams
  • free the memory

Here is the code I am attempting to use.

concurrentKernel.cpp

__global__ void concurrentKernel(int const width,
                                 int const streamIdx,
                                 double *array)
{
    // Global thread index; each thread fills one row of the array.
    int thread = (blockIdx.x * blockDim.x) + threadIdx.x;

    for (int i = 0; i < width; i++)
    {
        array[thread*width+i] = thread+i*width+1;
//      array[thread*width+i+streamIdx] = thread+i*width+streamIdx*width/2;
    }
}

concurrentMexFunction.cu

#include <stdio.h>
#include <math.h>
#include "mex.h"

/* Kernel function */
#include "concurrentKernel.cpp"


void mexFunction(int        nlhs,
                 mxArray    *plhs[],
                 int        nrhs,
                 mxArray    *prhs[])
{

    int const numberOfStreams = 2; // set number of streams to use here.
    cudaError_t cudaError;
    int offset;

    int width, height, fullSize, streamSize;
    width = 512;
    height = 512;
    fullSize = height*width;
    streamSize = (int)(fullSize/numberOfStreams);
    mexPrintf("fullSize: %d, streamSize: %d\n",fullSize, streamSize);

    /* Return the populated array */
    double *returnedArray;
    plhs[0] = mxCreateDoubleMatrix(height, width, mxREAL);
    returnedArray = mxGetPr(plhs[0]);

    cudaStream_t stream[numberOfStreams];
    for (int i = 0; i < numberOfStreams; i++)
    {
        cudaStreamCreate(&stream[i]);    
    }

    /* host memory */
    double *hostArray;
    cudaError = cudaMallocHost(&hostArray,sizeof(double)*fullSize);    // full size of array.
    if (cudaError != cudaSuccess) {mexPrintf("hostArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

    for (int i = 0; i < height; i++)
    {
        for (int j = 0; j < width; j++)
        {
            hostArray[i*width+j] = -1.0;
        }
    }

    /* device memory */
    double *deviceArray;
    cudaError = cudaMalloc( (void **)&deviceArray,sizeof(double)*streamSize);    // size of array for each stream.
    if (cudaError != cudaSuccess) {mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }


    for (int i = 0; i < numberOfStreams; i++)
    {
        offset = i;//*streamSize;
        mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);

        cudaError = cudaMemcpyAsync(deviceArray, hostArray+offset, sizeof(double)*streamSize, cudaMemcpyHostToDevice, stream[i]);
        if (cudaError != cudaSuccess) {mexPrintf("deviceArray cudaMemcpyAsync failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

        concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray);

        cudaError = cudaMemcpyAsync(returnedArray+offset, deviceArray, sizeof(double)*streamSize, cudaMemcpyDeviceToHost, stream[i]);
        if (cudaError != cudaSuccess) {mexPrintf("returnedArray cudaMemcpyAsync failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

        mexPrintf("returnedArray[offset]: %g, [end]: %g\n",returnedArray[offset/sizeof(double)],returnedArray[(i+1)*streamSize-1]);
    }


    for (int i = 0; i < numberOfStreams; i++)
    {
        cudaStreamDestroy(stream[i]);    
    }

    cudaFreeHost(hostArray);    // pinned memory from cudaMallocHost must be freed with cudaFreeHost
    cudaFree(deviceArray);

}

When there are 2 streams, the result is an array of zeros, which makes me think I'm doing something wrong with the memory. Can anyone explain what I am doing wrong? If anyone needs help compiling and running these from MATLAB, I can provide the commands to do so.

Update:

for (int i = 0; i < numberOfStreams; i++)
{
    offset = i*streamSize;
    mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);

    cudaError = cudaMemcpyAsync(deviceArray, hostArray+offset, sizeof(double)*streamSize, cudaMemcpyHostToDevice, stream[i]);
    if (cudaError != cudaSuccess) {mexPrintf("deviceArray cudaMemcpyAsync failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

    concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray);


}
cudaDeviceSynchronize();


for (int i = 0; i < numberOfStreams; i++)
{
    offset = i*streamSize;
    mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);

    cudaError = cudaMemcpyAsync(returnedArray+offset, deviceArray, sizeof(double)*streamSize, cudaMemcpyDeviceToHost, stream[i]);
    if (cudaError != cudaSuccess) {mexPrintf("returnedArray cudaMemcpyAsync failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

    mexPrintf("returnedArray[offset]: %g, [end]: %g\n",returnedArray[offset/sizeof(double)],returnedArray[(i+1)*streamSize-1]);

    cudaStreamDestroy(stream[i]);    
}
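
One debugging aid worth noting here (an editorial sketch, not code from the thread): a kernel launch returns no status of its own, so a bad launch stays invisible unless you poll for it with cudaGetLastError. Inside the launch loop above, something like this would surface it:

    concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray);
    cudaError = cudaGetLastError();    // launch errors only show up via cudaGetLastError
    if (cudaError != cudaSuccess) {mexPrintf("kernel launch failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }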
  • In the attached code you are not waiting for the work to complete before you read the array. Add a cudaDeviceSynchronize after the for loop and move the mexPrintf("returned...) to the cudaStreamDestroy loop. – Greg Smith Sep 10 '12 at 01:19
  • @GregSmith, can you post this as an answer? – harrism Sep 10 '12 at 03:22
  • @GregSmith, I assume you meant to move the cudaMemcpyAsync(returnedArray,..) as well as the mexPrintf as in the Update. This didn't seem to make any difference? – Beau Bellamy Sep 10 '12 at 06:48
  • @BeauBellamy: no, that isn't what he meant. I'll give you an answer to get you out of trouble if you can wait a minute. – talonmies Sep 10 '12 at 06:51

2 Answers


You need to keep in mind that the APIs you are using with streams are completely asynchronous, so control is returned to the calling host thread immediately. If you don't insert some sort of synchronization point between the asynchronous operations running on the GPU and the host, there is no guarantee that the operations you have enqueued in the streams have actually finished. In your example that means something like this is required:

for (int i = 0; i < numberOfStreams; i++) 
{ 
    offset = i*streamSize; 
    mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset); 

    cudaMemcpyAsync(deviceArray, hostArray+offset, sizeof(double)*streamSize, 
                    cudaMemcpyHostToDevice, stream[i]); 

    concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray); 

    cudaMemcpyAsync(returnedArray+offset, deviceArray, sizeof(double)*streamSize,
                    cudaMemcpyDeviceToHost, stream[i]); 
} 

// Host thread waits here until both kernels and copies are finished
cudaDeviceSynchronize();

for (int i = 0; i < numberOfStreams; i++) 
{ 
    offset = i*streamSize;
    mexPrintf("returnedArray[offset]: %g, [end]: %g\n",returnedArray[offset],returnedArray[(i+1)*streamSize-1]); 
    cudaStreamDestroy(stream[i]);     
} 

The key here is that you need to ensure that both memory transfers have finished before you try inspecting the results in host memory. Neither your original code nor your update does this.
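
If a device-wide barrier is heavier than you want, a per-stream wait achieves the same thing for this code. A minimal sketch using cudaStreamSynchronize, assuming the same stream array as above:

// Block the host until everything previously enqueued in each
// stream (copy in, kernel, copy out) has completed.
for (int i = 0; i < numberOfStreams; i++)
{
    cudaStreamSynchronize(stream[i]);
}

Because the device-to-host copy is the last operation enqueued in each stream, returning from cudaStreamSynchronize guarantees that stream's results are in host memory.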

  • I have made this a community wiki post so that @GregSmith or anyone else can directly edit it as they see fit. – talonmies Sep 10 '12 at 08:03
  • Note that the depth-first launch order used in the question can result in false dependencies on devices without Hyper-Q (i.e. all devices up to compute capability 3.5). It's beyond the scope of this question to go into detail here, but a breadth-first launch would avoid the false dependencies. – Tom Sep 10 '12 at 16:17
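
For illustration, a breadth-first version of the loop would look something like the sketch below. This is an editorial sketch reusing the names from this thread, and it assumes one device buffer per stream (deviceArray[i]), as the next answer recommends:

// Enqueue all host-to-device copies, then all kernels, then all
// device-to-host copies, so one stream's copy never queues behind
// another stream's kernel on pre-Hyper-Q hardware.
for (int i = 0; i < numberOfStreams; i++)
{
    offset = i*streamSize;
    cudaMemcpyAsync(deviceArray[i], hostArray+offset, sizeof(double)*streamSize,
                    cudaMemcpyHostToDevice, stream[i]);
}
for (int i = 0; i < numberOfStreams; i++)
{
    concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray[i]);
}
for (int i = 0; i < numberOfStreams; i++)
{
    offset = i*streamSize;
    cudaMemcpyAsync(returnedArray+offset, deviceArray[i], sizeof(double)*streamSize,
                    cudaMemcpyDeviceToHost, stream[i]);
}
cudaDeviceSynchronize();    // wait for all three phases to complete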

Also, it looks like you're reusing the deviceArray pointer for the different concurrent streams. Most likely, if the current code works as is, it is because of the false dependencies @Tom mentions, which cause the hardware to run the streams sequentially. You should really have a separate deviceArray per stream:

/* device memory */
double *deviceArray[numberOfStreams];
for (int i = 0; i < numberOfStreams; i++)
{
    cudaError = cudaMalloc( (void **)&deviceArray[i],sizeof(double)*streamSize);    // size of array for each stream.
    if (cudaError != cudaSuccess) {mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }
}

for (int i = 0; i < numberOfStreams; i++)
{
    offset = i*streamSize;
    mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);

    cudaError = cudaMemcpyAsync(deviceArray[i], hostArray+offset, sizeof(double)*streamSize, cudaMemcpyHostToDevice, stream[i]);
    if (cudaError != cudaSuccess) {mexPrintf("deviceArray cudaMemcpyAsync failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

    concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray[i]); 

    cudaMemcpyAsync(returnedArray+offset, deviceArray[i], sizeof(double)*streamSize,
                    cudaMemcpyDeviceToHost, stream[i]);     
}
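
One detail the snippet leaves out is the matching cleanup: with one buffer per stream, each needs its own cudaFree, and memory allocated with cudaMallocHost must be released with cudaFreeHost rather than cudaFree. A minimal teardown sketch, reusing the names above:

cudaDeviceSynchronize();    // make sure every stream has finished first

for (int i = 0; i < numberOfStreams; i++)
{
    cudaStreamDestroy(stream[i]);
    cudaFree(deviceArray[i]);    // one cudaFree per per-stream device buffer
}
cudaFreeHost(hostArray);    // pinned memory from cudaMallocHost needs cudaFreeHost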