I have written the program below and want to display the progress of the kernel in real time. I saw "How can I check the progress of matrix multiplication?", but I would prefer not to use such CUDA-specific features as cudaDeviceMapHost. It does not work even if I allocate an array bigger than the GPU cache. How is this possible?
#include <chrono>
#include <cstdio>
#include <vector>
#include "cuda_runtime.h"
#include "device_atomic_functions.h"
#include "device_launch_parameters.h"
// Wraps a CUDA runtime call so that a failure is reported together with the
// file and line of the call site.
#define CUDA_CHECK(err) __cudaSafeCall(err, __FILE__, __LINE__)

// Reports a failed CUDA call on stderr and throws the string literal
// "CUDA error".
//   err  - status value returned by the call being checked
//   file - source file of the call site (supplied by CUDA_CHECK)
//   line - source line of the call site (supplied by CUDA_CHECK)
// Does nothing when err equals cudaSuccess.
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
    if (err == cudaSuccess)
        return;

    const int code = int(err);
    const char *message = cudaGetErrorString(err);
    fprintf(stderr, "%s(%i): CUDA error %d (%s)\n", file, line, code, message);
    throw "CUDA error";
}
// Number of int elements in the device buffer and in its host mirror
// (4 << 20 = 4,194,304 elements).
static const int c_dataSize = 4 << 20;

// Endlessly rewrites devP so that devP[i] == i for every index visited.
// Each thread starts at index 9 + threadIdx.x and advances by blockDim.x,
// so indices 0..8 are never written. blockIdx is not consulted, so every
// block of a launch visits the same indices.
//   devP - buffer holding at least c_dataSize ints; presumably device memory
//          (the caller visible in this file passes a cudaMalloc allocation).
// The outer loop has no exit condition: the kernel never returns.
__global__ void progressKernel(int *devP)
{
    const int stride = blockDim.x;
    const int first = 9 + threadIdx.x;

    for (;;)
    {
        int idx = first;
        while (idx < c_dataSize)
        {
            devP[idx] = idx;
            idx += stride;
        }
    }
}
// Launches progressKernel on one stream and then, on a second stream,
// repeatedly copies the whole device buffer back to the host and prints the
// elapsed time together with three sampled elements.
// The polling loop has no exit condition, so the function does not return
// normally; any CUDA failure is reported and thrown by CUDA_CHECK.
int main()
{
    std::vector<int> data(c_dataSize, 1);
    // Size of the buffer in bytes, shared by the allocation and every copy.
    const auto byteCount = sizeof(int) * c_dataSize;
    int *devP;
    auto startTime = std::chrono::system_clock::now();
    cudaStream_t stream2, stream3;

    CUDA_CHECK(cudaMalloc((void**)&devP, byteCount));
    CUDA_CHECK(cudaStreamCreate(&stream2));
    CUDA_CHECK(cudaGetLastError());

    printf("Starting...\n");
    progressKernel<<<1, 128, 0, stream2>>>(devP);
    // unsigned long long is the type %llX expects and, unlike __int64,
    // is not specific to one compiler.
    printf("Started... %llX\n", (unsigned long long)devP);
    // A kernel launch returns no status itself; launch errors are picked up here.
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaStreamCreate(&stream3));
    // NOTE(review): data is ordinary std::vector storage, not pinned memory.
    // Copies into pageable host memory may not overlap with the running
    // kernel the way a pinned (cudaMallocHost) buffer would - confirm against
    // the CUDA "API synchronization behavior" notes if overlap matters.
    CUDA_CHECK(cudaMemcpyAsync(data.data(), devP, byteCount,
                               cudaMemcpyDeviceToHost, stream3));
    CUDA_CHECK(cudaStreamSynchronize(stream3));

    while (1)
    {
        auto currentTime = std::chrono::system_clock::now();
        auto transformed = std::chrono::duration_cast<std::chrono::nanoseconds>(currentTime - startTime).count();
        printf("%.7f ms: %d, %d, %d\n", (double)transformed / 1000000,
               data[10], data[100], data[c_dataSize - 1]);

        // Check both calls: an unchecked failure here would leave `data`
        // stale and the loop would keep printing old values without any
        // indication that something went wrong.
        CUDA_CHECK(cudaMemcpyAsync(data.data(), devP, byteCount,
                                   cudaMemcpyDeviceToHost, stream3));
        CUDA_CHECK(cudaStreamSynchronize(stream3));
    }

    // Not reached: the polling loop above has no exit condition.
    printf("Done\n");
}
I am getting
Starting...
Started... 505200000
1520.5665000 ms: 0, 0, 0
1526.6487000 ms: 0, 0, 0
1530.3077000 ms: 0, 0, 0
1534.4480000 ms: 0, 0, 0
1538.1516000 ms: 0, 0, 0
1541.7932000 ms: 0, 0, 0
1545.4041000 ms: 0, 0, 0
1549.6127000 ms: 0, 0, 0
1553.5760000 ms: 0, 0, 0
1557.2292000 ms: 0, 0, 0
1560.8776000 ms: 0, 0, 0
1564.6736000 ms: 0, 0, 0
1568.8331000 ms: 0, 0, 0
1572.5332000 ms: 0, 0, 0 ...
Windows 10 x64 Pro 19044.2251, Visual Studio 2019 16.0.2, CUDA Toolkit 10.2