0

This program:

#include <string>
#include <stdexcept>

struct buffers_t {
    void* host_buffer;
    void* device_buffer;
};

void ensure_no_error(std::string message) {
    auto status = cudaGetLastError();
    if (status != cudaSuccess) {
        throw std::runtime_error(message + ": " + cudaGetErrorString(status));
    }
}

void my_callback(cudaStream_t stream, cudaError_t status, void* args) {
    auto buffers = (buffers_t *) args;
    cudaMemcpyAsync(
        buffers->host_buffer, buffers->device_buffer,
        1, cudaMemcpyDefault, stream);
    ensure_no_error("after cudaMemcpyAsync");
}

int main() {
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    buffers_t buffers;
    cudaMallocHost(&buffers.host_buffer, 1);
    cudaMalloc(&buffers.device_buffer, 1);
    cudaStreamAddCallback(stream, my_callback, &buffers, 0);
    ensure_no_error("after enqueue callback");
    cudaStreamSynchronize(stream);
    ensure_no_error("after sync");
}

yields:

terminate called after throwing an instance of 'std::runtime_error'
  what():  after cudaMemcpyAsync: operation not permitted
Aborted

That's kind of weird, because the API reference for cudaMemcpyAsync does not list cudaErrorNotPermitted as one of the potential errors. Is there really a problem with scheduling an async copy from a callback?

Note: My machine has a GTX 650 Ti (CC 3.0), CUDA 9.0, Linux kernel 4.8.0, driver 384.59.

einpoklum
  • 118,144
  • 57
  • 340
  • 684

1 Answers1

2

Is there really a problem with scheduling an async copy from a callback?

From the documentation on stream callbacks:

A callback must not make CUDA API calls (directly or indirectly), as it might end up waiting on itself if it makes such a call leading to a deadlock.

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • Thanks, sorry, I missed that. Actually that doesn't make a lot of sense if the call is guaranteed to by asynchronous, but ok. – einpoklum Nov 01 '17 at 09:31