My program is a pipeline that contains multiple kernels and memcpys. Each task goes through the same pipeline with different input data. When the host code processes a task, it first chooses a Channel, which encapsulates scratchpad memory and CUDA objects (a stream, an event, and device buffers). After the last stage of a task, I record an event on the channel's stream and move on to the next task (the driving loop is sketched after the function below).
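For context, a Channel roughly bundles a per-channel CUDA stream, a completion event, the device scratch buffers, and a pinned host result buffer. A simplified sketch, limited to the members the code below actually touches (field types are approximate; the real structs have more fields):

struct GPUParam {
    uint *vid_list_d;              // device copy of the task's vertex id list
    uint *slot_id_list_d;          // intermediate: slot ids
    uint *edge_size_list_d;        // intermediate: edge counts
    uint *offset_list_d;           // intermediate: edge offsets
    uint *prefix_sum_list_d;       // prefix sum over edge counts
    uint *updated_result_table_d;  // output table for this task
    int   col_num;
    int   row_num;
};

struct Channel {
    bool         taken;         // channel is currently bound to a task
    cudaStream_t stream;        // one stream per channel
    cudaEvent_t  fin_event;     // recorded after the last stage
    GPUParam     para;          // scratchpad pointers and sizes
    void        *h_buf;         // pinned host buffer for the result table
    uint         num_new_rows;  // filled by the first D2H copy
};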
The main pipeline logic is shown below. The problem is that operations in different streams do not overlap. I attached the timeline of processing 10 tasks; as you can see, no operations in different streams overlap at all. Each kernel is launched with 256 threads per block and 5 blocks per grid.
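WK_CUDA_NUM_THREADS and WK_GET_BLOCKS in the code below are the usual launch-configuration helpers, roughly the following (with my typical row_num they give the 5 blocks x 256 threads mentioned above):

// Assumed launch-config helpers (rounded-up block count for row_num elements).
#define WK_CUDA_NUM_THREADS 256
#define WK_GET_BLOCKS(n) (((n) + WK_CUDA_NUM_THREADS - 1) / WK_CUDA_NUM_THREADS)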
All host buffers used for the memcpys are pinned, so I believe I have met the requirements for overlapping kernel execution and data transfers. Can someone help me figure out the reason? Thanks.
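The pinned buffers and per-channel CUDA objects are set up once at startup, along these lines (a sketch; MAX_RESULT_BYTES stands in for my actual sizing):

// One-time setup per channel (sketch). cudaMallocHost returns page-locked
// memory, which is what allows the later cudaMemcpyAsync calls to be async.
CUDA_ASSERT( cudaMallocHost(&channel->h_buf, MAX_RESULT_BYTES) );
CUDA_ASSERT( cudaStreamCreate(&channel->stream) );
CUDA_ASSERT( cudaEventCreate(&channel->fin_event) );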
Environment information
GPU: Tesla K40m (GK110)
Max Warps/SM: 64
Max Thread Blocks/SM: 16
Max Threads/SM: 2048
CUDA version: 8.0
void execute_task_pipeline(int stage, MyTask *task, Channel *channel) {
    assert(channel->taken);
    assert(!task->finish());

    GPUParam *para = &channel->para;
    assert(para->col_num > 0);
    assert(para->row_num > 0);

    // copy vid_list to device
    CUDA_ASSERT( cudaMemcpyAsync(para->vid_list_d, task->vid_list.data(),
                                 sizeof(uint) * para->row_num,
                                 cudaMemcpyHostToDevice, channel->stream) );

    k_get_slot_id_list<<<WK_GET_BLOCKS(para->row_num),
                         WK_CUDA_NUM_THREADS, 0, channel->stream>>>(
        vertices_d,
        para->vid_list_d,
        para->slot_id_list_d,
        config.num_buckets,
        para->row_num);

    k_get_edge_list<<<WK_GET_BLOCKS(para->row_num),
                      WK_CUDA_NUM_THREADS, 0, channel->stream>>>(
        vertices_d,
        para->slot_id_list_d,
        para->edge_size_list_d,
        para->offset_list_d,
        para->row_num);

    k_calc_prefix_sum(para, channel->stream);

    k_update_result_table_k2u<<<WK_GET_BLOCKS(para->row_num),
                                WK_CUDA_NUM_THREADS, 0, channel->stream>>>(
        edges_d,
        para->vid_list_d,
        para->updated_result_table_d,
        para->prefix_sum_list_d,
        para->offset_list_d,
        para->col_num,
        para->row_num);

    para->col_num += 1;

    // copy the number of new rows back to the host
    CUDA_ASSERT( cudaMemcpyAsync(&(channel->num_new_rows),
                                 para->prefix_sum_list_d + para->row_num - 1,
                                 sizeof(uint), cudaMemcpyDeviceToHost, channel->stream) );

    // copy the result table to host memory
    CUDA_ASSERT( cudaMemcpyAsync(channel->h_buf, para->updated_result_table_d,
                                 channel->num_new_rows * (para->col_num + 1),
                                 cudaMemcpyDeviceToHost, channel->stream) );

    // record a finish event at the end of the pipeline
    CUDA_ASSERT( cudaEventRecord(channel->fin_event, channel->stream) );
}
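For completeness, the host-side driver that feeds the pipeline has roughly this shape (simplified; has_pending_tasks, next_task and choose_free_channel are placeholders for my actual bookkeeping, not my exact code):

// Rough shape of the driver loop (placeholders for my real bookkeeping).
while (has_pending_tasks()) {
    MyTask  *task    = next_task();
    Channel *channel = choose_free_channel();   // marks channel->taken = true
    execute_task_pipeline(stage, task, channel);
}

// A channel is reused only after its fin_event has completed, e.g. via a
// non-blocking poll somewhere in the bookkeeping:
if (cudaEventQuery(channel->fin_event) == cudaSuccess) {
    channel->taken = false;   // results in channel->h_buf are ready to consume
}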