
I am trying to develop a Sobel filter example using CUDA streams. Here is the program:

void SobelStream(void)
{

    cv::Mat imageGrayL2 = cv::imread("/home/xavier/Bureau/Image1.png",0);


    u_int8_t *u8_PtImageHost;
    u_int8_t *u8_PtImageDevice;

    u_int8_t *u8_ptDataOutHost;
    u_int8_t *u8_ptDataOutDevice;

    u_int8_t u8_Used[NB_STREAM];

    u8_ptDataOutHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
    checkCudaErrors(cudaMalloc((void**)&u8_ptDataOutDevice,WIDTH*HEIGHT*sizeof(u_int8_t)));

    u8_PtImageHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
    checkCudaErrors(cudaMalloc((void**)&u8_PtImageDevice,WIDTH*HEIGHT*sizeof(u_int8_t)));


    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned char>();
    checkCudaErrors(cudaMallocArray(&Array_PatchsMaxDevice, &channelDesc,WIDTH,HEIGHT ));
    checkCudaErrors(cudaBindTextureToArray(Image,Array_PatchsMaxDevice));


    dim3 threads(BLOC_X,BLOC_Y);
    dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)HEIGHT/BLOC_Y));

    ClearKernel<<<blocks,threads>>>(u8_ptDataOutDevice,WIDTH,HEIGHT);


    int blockh = HEIGHT/NB_STREAM;


    Stream = (cudaStream_t *) malloc(NB_STREAM * sizeof(cudaStream_t));

    for (int i = 0; i < NB_STREAM; i++)
    {
        checkCudaErrors(cudaStreamCreate(&(Stream[i])));
    }

//    for(int i=0;i<NB_STREAM;i++)
//    {
//        cudaSetDevice(0);
//        cudaStreamCreate(&Stream[i]);
//    }


    cudaEvent_t Start;
    cudaEvent_t Stop;
    cudaEventCreate(&Start);
    cudaEventCreate(&Stop);

    cudaEventRecord(Start, 0);


    ////////// Phase 1: asynchronous H->D copies, one slice per stream (plus a kernel launch for the first slice) //////////
    for(int i=0;i<NB_STREAM;i++)
    {
        if(i == 0)
        {
            int localHeight  = blockh;
            checkCudaErrors(cudaMemcpy2DToArrayAsync( Array_PatchsMaxDevice,
                                                      0,
                                                      0,
                                                      imageGrayL2.data,//u8_PtImageDevice,
                                                      WIDTH,
                                                      WIDTH,
                                                      blockh,
                                                      cudaMemcpyHostToDevice,
                                                      Stream[i]));

            dim3 threads(BLOC_X,BLOC_Y);
            dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)localHeight/BLOC_Y));
            SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,0,WIDTH,localHeight-1);
            checkCudaErrors(cudaGetLastError());

            u8_Used[i] = 1;

        }else{


            int ioffsetImage =  WIDTH*(HEIGHT/NB_STREAM  );
            int hoffset = HEIGHT/NB_STREAM *i;
            int hoffsetkernel = HEIGHT/NB_STREAM -1 + HEIGHT/NB_STREAM* (i-1);
            int localHeight  = min(HEIGHT - (blockh*i),blockh);

            //printf("hoffset: %d hoffsetkernel %d localHeight %d rest %d ioffsetImage %d \n",hoffset,hoffsetkernel,localHeight,HEIGHT - (blockh +1 +blockh*(i-1)),ioffsetImage*i/WIDTH);

            checkCudaErrors(cudaMemcpy2DToArrayAsync( Array_PatchsMaxDevice,
                                                      0,
                                                      hoffset,
                                                      &imageGrayL2.data[ioffsetImage*i],//&u8_PtImageDevice[ioffset*i],
                                                      WIDTH,
                                                      WIDTH,
                                                      localHeight,
                                                      cudaMemcpyHostToDevice,
                                                      Stream[i]));


            u8_Used[i] = 1;
            if(HEIGHT - (blockh +1 +blockh*(i-1))<=0)
            {
                break;
            }
        }
    }



    ////////// Phase 2: Sobel kernel launches, one slice per stream //////////
    for(int i=0;i<NB_STREAM;i++)
    {
        if(i == 0)
        {
            int localHeight  = blockh;


            dim3 threads(BLOC_X,BLOC_Y);
            dim3 blocks(1,1);
            SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,0,WIDTH,localHeight-1);
            checkCudaErrors(cudaGetLastError());

            u8_Used[i] = 1;

        }else{


            int ioffsetImage =  WIDTH*(HEIGHT/NB_STREAM  );
            int hoffset = HEIGHT/NB_STREAM *i;
            int hoffsetkernel = HEIGHT/NB_STREAM -1 + HEIGHT/NB_STREAM* (i-1);
            int localHeight  = min(HEIGHT - (blockh*i),blockh);


            dim3 threads(BLOC_X,BLOC_Y);
            dim3 blocks(1,1);

            SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,hoffsetkernel,WIDTH,localHeight);
            checkCudaErrors(cudaGetLastError());

            u8_Used[i] = 1;
            if(HEIGHT - (blockh +1 +blockh*(i-1))<=0)
            {
                break;
            }
        }
    }


    ////////// Phase 3: asynchronous D->H copies of the results //////////
    for(int i=0;i<NB_STREAM;i++)
    {
        if(i == 0)
        {
            int localHeight  = blockh;
            checkCudaErrors(cudaMemcpyAsync(u8_ptDataOutHost,u8_ptDataOutDevice,WIDTH*(localHeight-1)*sizeof(u_int8_t),cudaMemcpyDeviceToHost,Stream[i]));
            u8_Used[i] = 1;

        }else{

            int ioffsetImage =  WIDTH*(HEIGHT/NB_STREAM  );
            int hoffset = HEIGHT/NB_STREAM *i;
            int hoffsetkernel = HEIGHT/NB_STREAM -1 + HEIGHT/NB_STREAM* (i-1);
            int localHeight  = min(HEIGHT - (blockh*i),blockh);

            checkCudaErrors(cudaMemcpyAsync(&u8_ptDataOutHost[hoffsetkernel*WIDTH],&u8_ptDataOutDevice[hoffsetkernel*WIDTH],WIDTH*localHeight*sizeof(u_int8_t),cudaMemcpyDeviceToHost,Stream[i]));

            u8_Used[i] = 1;
            if(HEIGHT - (blockh +1 +blockh*(i-1))<=0)
            {
                break;
            }
        }
    }


    for(int i=0;i<NB_STREAM;i++)
    {
        cudaStreamSynchronize(Stream[i]);
    }

    cudaEventRecord(Stop, 0);

    cudaEventSynchronize(Start);
    cudaEventSynchronize(Stop);


    float dt_ms;
    cudaEventElapsedTime(&dt_ms, Start, Stop);

    printf("dt_ms %f \n",dt_ms);

}

I get really strange performance when executing my program. I decided to profile my example, and this is what I got:

[profiler timeline screenshot]

I don't understand it: it seems that the streams are waiting for each other. Can someone help me with that?

bird12358

1 Answer


First of all, in the future, please provide complete code. I'm also working off of your cross-posting here to fill in some details, such as kernel sizes.

You have two issues to address:

First, any time you wish to use cudaMemcpyAsync, you will most likely want to be working with pinned host allocations. If you use allocations created e.g. with malloc, you will not get the expected behavior from cudaMemcpyAsync as far as asynchronous concurrent execution is concerned. This necessity is covered in the programming guide:

If host memory is involved in the copy, it must be page-locked.

So the first change to make to your code is to convert this:

u8_PtImageHost   = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
u8_ptDataOutHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));

to this:

checkCudaErrors(cudaHostAlloc(&u8_PtImageHost, WIDTH*HEIGHT*sizeof(u_int8_t), cudaHostAllocDefault));
checkCudaErrors(cudaHostAlloc(&u8_ptDataOutHost, WIDTH*HEIGHT*sizeof(u_int8_t), cudaHostAllocDefault));
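
(Note that allocations created with cudaHostAlloc should eventually be released with cudaFreeHost rather than free.)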

With that change alone, your execution duration drops from about 21 ms to 7 ms according to my testing. The reason for this is that without the change, we get no overlap whatsoever:

[profiler timeline: no overlap]

With the change, the copy operations can overlap with each other (H->D and D->H) and with kernel execution:

[profiler timeline: copies and kernels overlapping]

The second issue standing in the way of concurrent kernel execution is that your kernels are simply too large (too many blocks/threads):

#define WIDTH   6400
#define HEIGHT  4800
#define NB_STREAM 10

#define BLOC_X 32
#define BLOC_Y 32

    dim3 threads(BLOC_X,BLOC_Y);
    dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)HEIGHT/BLOC_Y));

I would suggest that if these are the sizes of kernels you need to run, there's probably not much benefit in striving for kernel overlap: each kernel already launches enough blocks to "fill" the GPU, so you have already exposed enough parallelism to keep the GPU busy. But if you are determined to witness kernel concurrency, you could make your kernels use a smaller number of blocks while causing each kernel to spend more time executing. We could do this by launching just one block per kernel and having the threads in that block loop over the image region to perform the filtering, as sketched below.
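
Here is a minimal sketch of that idea. SobelKernelOneBlock is a hypothetical name, not the asker's SobelKernel, and it assumes the input is a plain global-memory buffer rather than the texture bound in the question:

// Hypothetical sketch, not the question's SobelKernel: a single block of
// threads strides over the whole image slice, so each launch occupies only
// one SM and leaves room for kernels in other streams to run concurrently.
__global__ void SobelKernelOneBlock(const unsigned char *in, unsigned char *out,
                                    int width, int height,
                                    int yStart, int localHeight)
{
    int tid      = threadIdx.y * blockDim.x + threadIdx.x;
    int nThreads = blockDim.x * blockDim.y;
    int nPixels  = width * localHeight;

    // Block-stride loop: the single block visits every pixel of the slice.
    for (int idx = tid; idx < nPixels; idx += nThreads) {
        int x = idx % width;
        int y = yStart + idx / width;

        // Skip border pixels whose 3x3 neighborhood is incomplete.
        if (x == 0 || x == width - 1 || y == 0 || y == height - 1)
            continue;

        // Standard 3x3 Sobel operators.
        int gx = -   in[(y-1)*width + (x-1)] +   in[(y-1)*width + (x+1)]
                 - 2*in[ y   *width + (x-1)] + 2*in[ y   *width + (x+1)]
                 -   in[(y+1)*width + (x-1)] +   in[(y+1)*width + (x+1)];
        int gy = -   in[(y-1)*width + (x-1)] - 2*in[(y-1)*width + x] - in[(y-1)*width + (x+1)]
                 +   in[(y+1)*width + (x-1)] + 2*in[(y+1)*width + x] + in[(y+1)*width + (x+1)];

        int mag = abs(gx) + abs(gy);  // L1 gradient magnitude
        out[y*width + x] = (unsigned char)(mag > 255 ? 255 : mag);
    }
}

Assuming the image has first been copied into a device buffer such as u8_PtImageDevice, each launch would then be SobelKernelOneBlock<<<1, threads, 0, Stream[i]>>>(u8_PtImageDevice, u8_ptDataOutDevice, WIDTH, HEIGHT, hoffsetkernel, localHeight); with one block per launch, the profiler should then show kernels from different streams executing concurrently.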

Robert Crovella