I'm using the following the code to learn about how to use "CUDA graphs". The parameter NSTEP
is set as 1000, and the parameter NKERNEL
is set as 20. The kernel function shortKernel
has three parameters, it will perform a simple calculation.
#include <cuda_runtime.h>
#include <iostream>
#define N 131072 // tuned such that kernel takes a few microseconds
#define NSTEP 1000
#define NKERNEL 20
#define BLOCKS 256
#define THREADS 512
#define CHECK(call) \
do { \
const cudaError_t error_code = call; \
if (error_code != cudaSuccess) { \
printf("CUDA Error\n"); \
printf(" File: %s\n", __FILE__); \
printf(" Line: %d\n", __LINE__); \
printf(" Error code: %d\n", error_code); \
printf(" Error text: %s\n", cudaGetErrorString(error_code)); \
exit(1); \
} \
} while (0)
__global__ void shortKernel(float * out_d, float * in_d, int i){
int idx=blockIdx.x*blockDim.x+threadIdx.x;
if(idx<N) out_d[idx]=1.23*in_d[idx] + i;
}
void test2() {
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaSetDevice(0);
float x_host[N], y_host[N];
// initialize x and y arrays on the host
for (int i = 0; i < N; i++) {
x_host[i] = 2.0f;
y_host[i] = 2.0f;
}
float *x, *y, *z;
CHECK(cudaMalloc((void**)&x, N*sizeof(float)));
CHECK(cudaMalloc((void**)&y, N*sizeof(float)));
CHECK(cudaMalloc((void**)&z, N*sizeof(float)));
cudaMemcpy(x, x_host, sizeof(float) * N, cudaMemcpyHostToDevice);
cudaEvent_t begin, end;
CHECK(cudaEventCreate(&begin));
CHECK(cudaEventCreate(&end));
// start recording
cudaEventRecord(begin, stream);
bool graphCreated=false;
cudaGraph_t graph;
cudaGraphExec_t instance;
// Run graphs
for(int istep=0; istep<NSTEP; istep++){
if(!graphCreated){
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
for(int ikrnl=0; ikrnl<NKERNEL; ikrnl++){
shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, ikrnl);
}
cudaStreamEndCapture(stream, &graph);
cudaGraphNode_t* nodes = NULL;
size_t num_nodes = 0;
CHECK(cudaGraphGetNodes(graph, nodes, &num_nodes));
std::cout << "Num of nodes in the graph: " << num_nodes
<< std::endl;
CHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
graphCreated=true;
}
CHECK(cudaGraphLaunch(instance, stream));
cudaStreamSynchronize(stream);
} // End run graphs
cudaEventRecord(end, stream);
cudaEventSynchronize(end);
float time_ms = 0;
cudaEventElapsedTime(&time_ms, begin, end);
std::cout << "CUDA Graph - CUDA Kernel overall time: " << time_ms << " ms" << std::endl;
cudaMemcpy(y_host, y, sizeof(float) * N, cudaMemcpyDeviceToHost);
for(int i = 0; i < N; i++) {
std::cout << "res " << y_host[i] << std::endl;
}
// Free memory
cudaFree(x);
cudaFree(y);
}
int main() {
test2();
std::cout << "end" << std::endl;
return 0;
}
My expected results are shown as the following:
res 2.46
res 3.46
res 4.46
res 5.46
res 6.46
...
However, the actual results are shown like this:
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
...
It seems that the all kernels' parameter i
is set as NKERNEL-1
. I am very confused about it, could someone give any explanations? Thanks!
I had changed the for loop
as follows:
// Run graphs
for(int istep=0; istep<NSTEP; istep++){
if(!graphCreated){
cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
for(int ikrnl=0; ikrnl<NKERNEL; ikrnl++){
if(ikrnl == 0)
shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, 0);
else if(ikrnl == 1)
shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, 1);
else if(ikrnl == 2)
shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, 2);
else
shortKernel<<<BLOCKS, THREADS, 0, stream>>>(y, x, ikrnl);
}
cudaStreamEndCapture(stream, &graph);
cudaGraphNode_t* nodes = NULL;
size_t num_nodes = 0;
CHECK(cudaGraphGetNodes(graph, nodes, &num_nodes));
std::cout << "Num of nodes in the graph: " << num_nodes
<< std::endl;
CHECK(cudaGraphInstantiate(&instance, graph, NULL, NULL, 0));
graphCreated=true;
}
CHECK(cudaGraphLaunch(instance, stream));
cudaStreamSynchronize(stream);
} // End run graphs
However, the results are still the same:
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
res 21.46
...