I have two kernels that process some data sequentially (launched with only one thread). I want to combine the two so that I can have one kernel to launch with two threads. After doing so, I was expecting to get an exec time of max(kernel1, kernel2) but what I got was the sum of the two exec times. I narrowed down the problem to something like the code below.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<iostream>
#include<string>
#include<vector>
#include<random>
#include<functional>
#include<algorithm>
#include<iterator>
__global__ void dummyKernel(const float *d_data_Re, const float *d_data_Im,
float *d_out_Re, float *d_out_Im, const int dataLen) {
int i{ threadIdx.x };
if (i == 0) {
printf("Thread zero started \n");
for (int j{}; j < 1000000; j++)
d_out_Re[j%dataLen] = sqrtf(2) + d_data_Re[j%dataLen] * (j % 4 == 1);
printf("Thread zero finished \n");
}
else if (i == 1) {
printf("Thread one started \n");
for (int j{}; j < 1000000; j++)
d_out_Im[j%dataLen] = sqrtf(2) + d_data_Im[j%dataLen] * (j % 4 == 1);
printf("Thread one finished \n");
}
}
__global__ void dummyKernel2(const float *d_data_Re, const float *d_data_Im,
float *d_out_Re, float *d_out_Im, const int dataLen) {
int i{ threadIdx.x };
//if (i == 0) {
printf("Thread zero started \n");
for (int j{}; j < 1000000; j++)
d_out_Re[j%dataLen] = sqrtf(2) + d_data_Re[j%dataLen] * (j % 4 == 1);
printf("Thread zero finished \n");
//}
//else if (i == 1) {
// printf("Thread one started \n");
// for (int j{}; j < 1000000; j++)
// d_out_Im[j%dataLen] = sqrtf(2) + d_data_Im[j%dataLen] * (j % 4 == 1);
// printf("Thread one finished \n");
//}
}
int main()
{
cudaError_t cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
return 1;
}
const int sizeOfFrame = 2 * 1024 * 1024;
std::vector<float> data_re(sizeOfFrame), data_im;
//random number generator
std::uniform_real_distribution<float> distribution(0.0f, 2.0f); //Values between 0 and 2
std::mt19937 engine; // Mersenne twister MT19937
auto generator = std::bind(distribution, engine);
std::generate_n(data_re.begin(), sizeOfFrame, generator);
std::copy(data_re.begin(), data_re.end(), std::back_inserter(data_im));
//
float *d_data_re, *d_data_im;
cudaMalloc(&d_data_re, sizeOfFrame * sizeof(float));
cudaMalloc(&d_data_im, sizeOfFrame * sizeof(float));
cudaMemcpy(d_data_re, data_re.data(), sizeOfFrame * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_data_im, data_im.data(), sizeOfFrame * sizeof(float), cudaMemcpyHostToDevice);
float *d_pll_out_re, *d_pll_out_im;
cudaMalloc(&d_pll_out_re, sizeOfFrame * sizeof(float));
cudaMalloc(&d_pll_out_im, sizeOfFrame * sizeof(float));
dummyKernel << <1, 2 >> >(d_data_re, d_data_im,
d_pll_out_re, d_pll_out_im, sizeOfFrame);
cudaDeviceSynchronize();
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
btw I got the code for random number generator from an answer to this question. So, the dummyKernel doesn't do anything useful, I just wanted to have a kernel that took relatively long to finish. If you launch dummyKernel, the order of the output will be "Thread zero started", "Thread zero finished", "Thread one started", "Thread one finished". Sequential. But if you launch dummyKernel2, the order of the output will be "Thread zero started", "Thread zero started", "Thread zero finished", "Thread zero finished" and the exec time is almost half as dummyKernel. I don't understand this behavior and the effect of the if-else I used. OS: Windows 10, GTX 1050 Ti, CUDA Driver/Runtime version: 11.1/10.1.