
I'm trying to explore the concurrent kernel execution capability of my NVIDIA Quadro 4000, which is a compute capability 2.0 device.

I use two different streams, which both run the same sequence:

  1. Copy H2D two different chunks of pinned memory
  2. Run the kernel
  3. Copy D2H two different chunks back to pinned memory

The kernels in both streams are exactly the same, and each one has an execution time of 190 ms.

In the Visual Profiler (version 5.0) I expected both kernels to start executing simultaneously; however, they overlap by only 20 ms. Here is the code sample:


        //initiate the streams
        cudaStream_t stream0,stream1;
        CHK_ERR(cudaStreamCreate(&stream0));
        CHK_ERR(cudaStreamCreate(&stream1));
        //allocate the memory on the GPU for stream0
        CHK_ERR(cudaMalloc((void **)&def_img0, width*height*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&ref_img0, width*height*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&outY_img0,width_size_for_out*height_size_for_out*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&outX_img0,width_size_for_out*height_size_for_out*sizeof(char)));
        //allocate the memory on the GPU for stream1
        CHK_ERR(cudaMalloc((void **)&def_img1, width*height*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&ref_img1, width*height*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&outY_img1,width_size_for_out*height_size_for_out*sizeof(char)));
        CHK_ERR(cudaMalloc((void **)&outX_img1,width_size_for_out*height_size_for_out*sizeof(char)));

        //allocate page-locked memory for stream0
        CHK_ERR(cudaHostAlloc((void**)&host01, width*height*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host02, width*height*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host03, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host04, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));

        //allocate page-locked memory for stream1
        CHK_ERR(cudaHostAlloc((void**)&host11, width*height*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host12, width*height*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host13, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
        CHK_ERR(cudaHostAlloc((void**)&host14, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));


        memcpy(host01,in1,width*height*sizeof(char));
        memcpy(host02,in2,width*height*sizeof(char));

        memcpy(host11,in1,width*height*sizeof(char));
        memcpy(host12,in2,width*height*sizeof(char));



        cudaEvent_t start, stop;
        float time;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        dim3 dimBlock(CUDA_BLOCK_DIM, CUDA_BLOCK_DIM);
        dim3 Grid((width-SEARCH_RADIUS*2-1)/(dimBlock.x*4)+1, (height-SEARCH_RADIUS*2-1)/(dimBlock.y*4)+1);

        cudaEventRecord(start,0);
        // --------------------
        // Copy images to device
        // --------------------
        //enqueue copies of def stream0 and stream1
        CHK_ERR(cudaMemcpyAsync(def_img0, host01,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
        CHK_ERR(cudaMemcpyAsync(def_img1, host11,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));
        //enqueue copies of ref stream0 and stream1
        CHK_ERR(cudaMemcpyAsync(ref_img0, host02,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
        CHK_ERR(cudaMemcpyAsync(ref_img1, host12,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));

        CHK_ERR(cudaStreamSynchronize(stream0));
        CHK_ERR(cudaStreamSynchronize(stream1));

        //CALLING KERNEL
        //enqueue kernel in stream0 and stream1
        TIME_KERNEL((exhaustiveSearchKernel<CUDA_BLOCK_DIM*4,CUDA_BLOCK_DIM*4,SEARCH_RADIUS><<< Grid, dimBlock,0,stream0>>>(def_img0+SEARCH_RADIUS*width+SEARCH_RADIUS,ref_img0,outX_img0,outY_img0,width,width_size_for_out)),"exhaustiveSearchKernel stream0");
        TIME_KERNEL((exhaustiveSearchKernel<CUDA_BLOCK_DIM*4,CUDA_BLOCK_DIM*4,SEARCH_RADIUS><<< Grid, dimBlock,0,stream1>>>(def_img1+SEARCH_RADIUS*width+SEARCH_RADIUS,ref_img1,outX_img1,outY_img1,width,width_size_for_out)),"exhaustiveSearchKernel stream1");


        //Copy result back
        CHK_ERR(cudaMemcpyAsync(host03, outX_img0, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream0));
        CHK_ERR(cudaMemcpyAsync(host13, outX_img1, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream1));

        CHK_ERR(cudaMemcpyAsync(host04, outY_img0, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream0));
        CHK_ERR(cudaMemcpyAsync(host14, outY_img1, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream1));


        CHK_ERR(cudaStreamSynchronize(stream0));
        CHK_ERR(cudaStreamSynchronize(stream1));

        cudaEventRecord( stop, 0 );
        cudaEventSynchronize( stop );
        cudaEventElapsedTime( &time, start, stop );
        printf("Elapsed time=%f ms\n",time);

        memcpy(outX,host03,width_size_for_out*height_size_for_out*sizeof(char));
        memcpy(outY,host04,width_size_for_out*height_size_for_out*sizeof(char));


        cudaEventDestroy( start ); 
        cudaEventDestroy( stop );
        CHK_ERR(cudaStreamDestroy(stream0));
        CHK_ERR(cudaStreamDestroy(stream1));

        CHK_ERR(cudaDeviceReset());


    } 
  • You haven't actually asked a question here. – talonmies Jan 02 '13 at 16:33
  • Please provide the API call sequence used as well as the launch configuration of the kernels. The device CC 2.x compute work distributor will distribute all work for the first kernel before distributing work for the second kernel. – Greg Smith Jan 02 '13 at 17:36
  • You haven't mentioned the launch configuration of the kernel (block and grid size), and this still lacks an actual question. – talonmies Jan 03 '13 at 16:03

1 Answer

Compute Capability 2.x-3.0

Compute capability 2.x-3.0 devices have a single hardware work queue. The CUDA driver pushes commands into this queue, and the GPU front end reads them and dispatches the work to the copy engines or to the CUDA Work Distributor (CWD). The CUDA driver inserts synchronization commands into the hardware work queue to guarantee that work on the same stream cannot run concurrently. When the front end hits a synchronization command, it stalls until the dependent work has completed.
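
As a quick illustration (this snippet is not from the original post; the kernel name and sizes are made up), here is the practical consequence for issue order on a single-queue device. In the depth-first version, stream0's D2H copy, which must wait for stream0's kernel, sits in the queue ahead of stream1's independent H2D copy, so stream1's work stalls behind it; the breadth-first version avoids that false dependency:

    #include <cuda_runtime.h>

    __global__ void scaleKernel(char *img, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) img[i] = img[i] / 2;
    }

    // Depth-first issue: all of stream0's commands, then all of stream1's.
    // With a single hardware work queue, stream0's dependent D2H copy is queued
    // in front of stream1's H2D copy, creating a false dependency between streams.
    void depthFirst(char *d0, char *d1, char *h0, char *h1, int n,
                    cudaStream_t s0, cudaStream_t s1)
    {
        dim3 block(256), grid((n + 255) / 256);
        cudaMemcpyAsync(d0, h0, n, cudaMemcpyHostToDevice, s0);
        scaleKernel<<<grid, block, 0, s0>>>(d0, n);
        cudaMemcpyAsync(h0, d0, n, cudaMemcpyDeviceToHost, s0);
        cudaMemcpyAsync(d1, h1, n, cudaMemcpyHostToDevice, s1);
        scaleKernel<<<grid, block, 0, s1>>>(d1, n);
        cudaMemcpyAsync(h1, d1, n, cudaMemcpyDeviceToHost, s1);
    }

    // Breadth-first issue: interleave the streams so that no synchronization
    // command of one stream sits in the queue in front of the other stream's
    // independent work (the pattern the question's code already uses for copies).
    void breadthFirst(char *d0, char *d1, char *h0, char *h1, int n,
                      cudaStream_t s0, cudaStream_t s1)
    {
        dim3 block(256), grid((n + 255) / 256);
        cudaMemcpyAsync(d0, h0, n, cudaMemcpyHostToDevice, s0);
        cudaMemcpyAsync(d1, h1, n, cudaMemcpyHostToDevice, s1);
        scaleKernel<<<grid, block, 0, s0>>>(d0, n);
        scaleKernel<<<grid, block, 0, s1>>>(d1, n);
        cudaMemcpyAsync(h0, d0, n, cudaMemcpyDeviceToHost, s0);
        cudaMemcpyAsync(h1, d1, n, cudaMemcpyDeviceToHost, s1);
    }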

Concurrent kernel execution improves GPU utilization when a grid is too small to fill the entire GPU, or when a grid has a tail effect (a subset of its thread blocks runs much longer than the rest).
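
A rough way to check whether a single grid already fills a CC 2.x device (this check is mine, not part of the original answer; 8 is the resident-block limit per SM on compute capability 2.x, and register or shared-memory pressure can lower the real number):

    #include <cstdio>
    #include <cuda_runtime.h>

    int main()
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, 0);
        // A grid with at least this many blocks can keep every SM busy on its
        // own, leaving the CWD nothing to dispatch from a second grid until
        // the first one starts to drain.
        int blocksToFill = prop.multiProcessorCount * 8;
        printf("%s: %d SMs, roughly %d resident blocks fill the GPU\n",
               prop.name, prop.multiProcessorCount, blocksToFill);
        return 0;
    }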

Case 1: Back-to-back kernel launches on one stream

If an application launches two kernels back to back on the same stream, the synchronization command inserted by the CUDA driver prevents the second kernel from being dispatched to the CWD until the first kernel has completed.
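
A self-contained sketch of Case 1 (busyKernel and the sizes are illustrative, not from the original post):

    #include <cuda_runtime.h>

    __global__ void busyKernel(float *d, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            for (int k = 0; k < 10000; ++k)
                d[i] = d[i] * 1.000001f + 0.5f;
    }

    int main()
    {
        const int n = 1 << 20;
        float *d;
        cudaMalloc((void **)&d, n * sizeof(float));

        cudaStream_t s;
        cudaStreamCreate(&s);

        dim3 block(256), grid((n + block.x - 1) / block.x);
        busyKernel<<<grid, block, 0, s>>>(d, n);  // first grid
        busyKernel<<<grid, block, 0, s>>>(d, n);  // held back until the first grid completes

        cudaStreamSynchronize(s);
        cudaStreamDestroy(s);
        cudaFree(d);
        return 0;
    }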

Case 2: Back-to-back kernel launches on two streams

If an application launches two kernels on different streams, the front end reads the commands and dispatches them to the CWD. The CWD rasterizes the first grid (the order is architecture dependent) and dispatches its thread blocks to the SMs. Only when all of the thread blocks from the first grid have been dispatched will the CWD dispatch thread blocks from the second grid.
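
A matching sketch of Case 2 (again with made-up names and sizes): the same kernel launched on two streams. With a grid small enough to leave SMs idle, the CWD can dispatch the second grid's blocks while the first grid is still running; with a grid large enough to occupy every SM, as in the question, the second grid only starts as the first one drains, which is why the profiler shows just a small tail overlap:

    #include <cuda_runtime.h>

    __global__ void busyKernel(float *d, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            for (int k = 0; k < 10000; ++k)
                d[i] = d[i] * 1.000001f + 0.5f;
    }

    int main()
    {
        const int n = 1 << 20;
        float *d0, *d1;
        cudaMalloc((void **)&d0, n * sizeof(float));
        cudaMalloc((void **)&d1, n * sizeof(float));

        cudaStream_t s0, s1;
        cudaStreamCreate(&s0);
        cudaStreamCreate(&s1);

        dim3 block(256);
        dim3 smallGrid(4);  // far fewer blocks than the GPU can hold, so overlap is possible
        busyKernel<<<smallGrid, block, 0, s0>>>(d0, n);
        busyKernel<<<smallGrid, block, 0, s1>>>(d1, n);  // can run concurrently with the first

        cudaDeviceSynchronize();
        cudaStreamDestroy(s0);
        cudaStreamDestroy(s1);
        cudaFree(d0);
        cudaFree(d1);
        return 0;
    }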

Compute Capability 3.5

Compute capability 3.5 introduced several new features to improve GPU utilization. These include:

  • Hyper-Q supports multiple independent hardware work queues.
  • Dynamic Parallelism allows device code to launch new work.
  • CWD capacity was increased to 32 grids.
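
A sketch (not part of the original answer) of how Hyper-Q lets many small grids run concurrently on a CC 3.5+ device; the CUDA_DEVICE_MAX_CONNECTIONS environment variable controls how many hardware connections the driver uses:

    #include <cuda_runtime.h>

    __global__ void smallKernel(float *d, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            for (int k = 0; k < 10000; ++k)
                d[i] = d[i] * 1.000001f + 0.5f;
    }

    int main()
    {
        const int nStreams = 8;
        const int n = 256;              // one block per grid, tiny on purpose
        cudaStream_t streams[nStreams];
        float *buf[nStreams];

        for (int i = 0; i < nStreams; ++i) {
            cudaStreamCreate(&streams[i]);
            cudaMalloc((void **)&buf[i], n * sizeof(float));
        }

        // On Hyper-Q hardware each stream can map to its own work queue, so
        // these eight one-block grids are free to execute at the same time.
        for (int i = 0; i < nStreams; ++i)
            smallKernel<<<1, 256, 0, streams[i]>>>(buf[i], n);

        cudaDeviceSynchronize();

        for (int i = 0; i < nStreams; ++i) {
            cudaStreamDestroy(streams[i]);
            cudaFree(buf[i]);
        }
        return 0;
    }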

