-1

I have written a program like this and wanted to display in real-time progress of the kernel. I saw How can I check the progress of matrix multiplication?, but wanted not to use such specific CUDA things as cudaDeviceMapHost. The things do not work even if I allocate array bigger than GPU cache. How is this possible?

#include <chrono>
#include "cuda_runtime.h"
#include "device_atomic_functions.h"
#include "device_launch_parameters.h"

#define CUDA_CHECK(err)     __cudaSafeCall(err, __FILE__, __LINE__)

inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "%s(%i): CUDA error %d (%s)\n",
                file, line, int(err), cudaGetErrorString(err));
        throw "CUDA error";
    }
}

static const int c_dataSize = 4 << 20;

__global__ void progressKernel(int *devP)
{
    while (1)
    {
        for (int i = 9 + threadIdx.x; i < c_dataSize; i += blockDim.x)
            devP[i] = i;
    }
}

int main()
{
    std::vector<int> data(c_dataSize, 1);
    int *devP;
    auto startTime = std::chrono::system_clock::now();
    cudaStream_t stream2, stream3;

    CUDA_CHECK(cudaMalloc((void**)&devP, sizeof(int) * c_dataSize));
    CUDA_CHECK(cudaStreamCreate(&stream2));
    CUDA_CHECK(cudaGetLastError());

    printf("Starting...\n");
    progressKernel<<<1, 128, 0, stream2>>>(devP);
    printf("Started... %llX\n", (__int64)devP);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaStreamCreate(&stream3));
    CUDA_CHECK(cudaMemcpyAsync(&(data[0]), devP, sizeof(int) * c_dataSize,
            cudaMemcpyDeviceToHost, stream3));
    CUDA_CHECK(cudaStreamSynchronize(stream3));
    while (1) 
    {
        auto currentTime = std::chrono::system_clock::now();
        auto transformed = std::chrono::duration_cast<std::chrono::nanoseconds>(currentTime - startTime).count();

        printf("%.7f ms: %d, %d, %d\n", (double)transformed / 1000000,
                data[10], data[100], data[c_dataSize - 1]);
        cudaMemcpyAsync(&(data[0]), devP, sizeof(int) * c_dataSize,
                cudaMemcpyDeviceToHost, stream3);
        cudaStreamSynchronize(stream3);
    }
    printf("Done\n");
}

I am getting

Starting...
Started... 505200000
1520.5665000 ms: 0, 0, 0
1526.6487000 ms: 0, 0, 0
1530.3077000 ms: 0, 0, 0
1534.4480000 ms: 0, 0, 0
1538.1516000 ms: 0, 0, 0
1541.7932000 ms: 0, 0, 0
1545.4041000 ms: 0, 0, 0
1549.6127000 ms: 0, 0, 0
1553.5760000 ms: 0, 0, 0
1557.2292000 ms: 0, 0, 0
1560.8776000 ms: 0, 0, 0
1564.6736000 ms: 0, 0, 0
1568.8331000 ms: 0, 0, 0
1572.5332000 ms: 0, 0, 0 ...

Windows 10 x64 Pro 19044.2251, Visual Studio 2019 16.0.2, CUDA Toolkit 10.2

Mikhail M
  • 929
  • 2
  • 10
  • 23
  • I suggest you check for errors in the loop too. Your error checking does not syncronize, so if something has gone bad after the start of each call, you may not be catching it. – Ander Biguri Nov 18 '22 at 10:34
  • After fixing compilation errors, your code works fine on linux with cuda 11.6. `2670.9881320 ms: 10, 100, 4194303` – Abator Abetor Nov 18 '22 at 19:37
  • @AbatorAbetor, thanks! My experiments with code from mentioned How I can check... question shows that it is also fragile - some changes like replacement of "for" with "while" can break it – Mikhail M Nov 19 '22 at 10:22

0 Answers0