
Given the following program, which uses the OpenCL 1.2 API:

#include <CL/cl.h>

#include <chrono>
#include <exception>
#include <thread>

int main()
{
    // Give the process a few seconds to settle before watching memory usage.
    std::this_thread::sleep_for(std::chrono::seconds{4});

    // Enumerate the available OpenCL platforms.
    cl_platform_id plats[10];
    cl_uint count = 0;
    if (clGetPlatformIDs(10, plats, &count) != CL_SUCCESS)
        std::terminate();

    // Pick the first available GPU device on any platform.
    cl_device_id dev = nullptr;
    for (cl_uint i = 0; i < count && !dev; ++i) {
        cl_device_id devs[10];
        cl_uint dev_count = 0;
        if (clGetDeviceIDs(plats[i], CL_DEVICE_TYPE_GPU, 10, devs, &dev_count) != CL_SUCCESS)
            continue;

        for (cl_uint j = 0; j < dev_count && !dev; ++j) {
            cl_bool avail = CL_FALSE;
            if (clGetDeviceInfo(devs[j], CL_DEVICE_AVAILABLE, sizeof(avail), &avail, nullptr) != CL_SUCCESS)
                continue;

            if (avail)
                dev = devs[j];
        }
    }

    if (!dev)
        std::terminate();

    // Create a context and a default in-order command queue (OpenCL 1.2 API).
    cl_int res = CL_SUCCESS;
    cl_context c = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &res);
    if (res != CL_SUCCESS)
        std::terminate();

    cl_command_queue cq = clCreateCommandQueue(c, dev, 0, &res);
    if (res != CL_SUCCESS)
        std::terminate();

    // A 16 MiB buffer, viewed here as 4096 rows of 4096 bytes each.
    cl_mem buf = clCreateBuffer(c, CL_MEM_READ_WRITE, 4096 * 4096, nullptr, &res);
    if (res != CL_SUCCESS)
        std::terminate();

    while (true) {
        // Map a sliding window of num rows for reading, then unmap it again.
        static cl_uint i = 0;
        constexpr cl_uint num = 410;
        i = (i + num) % (4096 - num);

        void* p = clEnqueueMapBuffer(cq, buf, CL_TRUE, CL_MAP_READ, i * 4096, num * 4096, 0, nullptr, nullptr, &res);
        if (res != CL_SUCCESS)
            std::terminate();

        if (clEnqueueUnmapMemObject(cq, buf, p, 0, nullptr, nullptr) != CL_SUCCESS)
            std::terminate();

        std::this_thread::sleep_for(std::chrono::milliseconds{50});
    }
}

I find that during the `while` loop the process's memory usage gradually increases with each iteration, from about 1% until it levels off at approximately 24% of my 16 GB of RAM. Where it levels off depends on the value of `num`. The behavior disappears completely if I pass `CL_MEM_USE_HOST_PTR` and give `clCreateBuffer` a host pointer. I have tested on a range of different devices, and it also disappears when using an Intel GPU instead of an NVidia or AMD GPU. It is not a real memory leak, since the memory is released when calling `clRelease*` on the created OpenCL objects.
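For reference, the variant without the growth looks roughly like this (a minimal sketch of my change; the `std::vector` backing storage is an assumed choice, and the rest of the program above is unchanged):

// Sketch: replace the clCreateBuffer call above with a CL_MEM_USE_HOST_PTR
// variant. The vector is an assumed choice of backing storage; it must
// outlive the buffer, since the implementation uses it as the storage bits.
#include <vector>

std::vector<unsigned char> host_mem(4096 * 4096);
cl_mem buf = clCreateBuffer(c, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                            host_mem.size(), host_mem.data(), &res);
if (res != CL_SUCCESS)
    std::terminate();

// Releasing the objects at the end returns the memory in both variants,
// which is why this is not a leak in the strict sense:
clReleaseMemObject(buf);
clReleaseCommandQueue(cq);
clReleaseContext(c);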

Is this the result of my misunderstanding the API, or is it a bug?

  • `CL_MEM_USE_HOST_PTR`, per the [docs](https://www.khronos.org/registry/OpenCL/sdk/2.2/docs/man/html/clCreateBuffer.html): _If specified, it indicates that the application wants the OpenCL implementation to use memory referenced by host_ptr as the storage bits for the memory object._ I suspect the Intel device was an integrated GPU while the NVidia and AMD devices were discrete GPUs. – doqtor Jun 26 '20 at 14:52
  • That is correct. I suspect it does not happen with `CL_MEM_USE_HOST_PTR` because `clEnqueueMapBuffer` then just synchronizes the data and returns a pointer to the host memory. Without `CL_MEM_USE_HOST_PTR`, `clEnqueueMapBuffer` has to allocate host memory if the device memory is not the system memory, which is what I observe. – Mestkon Jun 26 '20 at 15:07
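One simple way to probe that hypothesis (a hypothetical diagnostic, not part of the original question) is to log the pointer returned by `clEnqueueMapBuffer` on every iteration: with `CL_MEM_USE_HOST_PTR` it should land inside the application's own allocation each time, whereas a stream of distinct addresses without that flag would suggest a fresh allocation per map:

// Hypothetical diagnostic: print the mapped address each iteration.
// A different address on most iterations suggests the runtime allocates
// a new host staging area per map rather than reusing one.
#include <cstdio>

void* p = clEnqueueMapBuffer(cq, buf, CL_TRUE, CL_MAP_READ,
                             i * 4096, num * 4096, 0, nullptr, nullptr, &res);
if (res != CL_SUCCESS)
    std::terminate();

std::printf("mapped at %p\n", p);

if (clEnqueueUnmapMemObject(cq, buf, p, 0, nullptr, nullptr) != CL_SUCCESS)
    std::terminate();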
