8

I would like to copy memory from the host to the device using thrust, as in:

thrust::host_vector<float> h_vec(1 << 28);
thrust::device_vector<float> d_vec(1 << 28);
thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin());

using CUDA streams, analogously to how you would copy memory from device to device using streams:

cudaStream_t s;
cudaStreamCreate(&s);

thrust::device_vector<float> d_vec1(1 << 28), d_vec2(1 << 28);
thrust::copy(thrust::cuda::par.on(s), d_vec1.begin(), d_vec1.end(), d_vec2.begin());

cudaStreamSynchronize(s);
cudaStreamDestroy(s);

The problem is that I can't set the execution policy to CUDA to specify the stream when copying from the host to the device, because, in that case, thrust would assume that both vectors are stored on the device. Is there a way to get around this problem? I'm using the latest thrust version from github (it says 1.8 in the version.h file).
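In other words, what I would like to be able to write is something like the following sketch, which does not work for this purpose because the cuda execution policy makes thrust assume that both ranges are stored on the device:

// what I would like: a host-to-device copy issued on stream s (sketch only, not valid here)
thrust::copy(thrust::cuda::par.on(s), h_vec.begin(), h_vec.end(), d_vec.begin());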

Tom
  • The [announcement I read](https://groups.google.com/forum/#!topic/thrust-users/i6pxQ9jPZFU) made it sound like streams were implemented for underlying *kernel calls*, not necessarily across-the-board in thrust. If you did use streams for copying from host vector to device vector, it's likely that you would want to use a [pinned allocator](http://thrust.github.io/doc/classthrust_1_1system_1_1cuda_1_1experimental_1_1pinned__allocator.html) on the host. I believe, therefore, that what you're asking for could be accomplished with thrust vectors and `cudaMemcpyAsync`. – Robert Crovella Jul 31 '14 at 17:02
  • Yes, you should use `cudaMemcpyAsync` for this directly as Robert suggests. – Jared Hoberock Jul 31 '14 at 17:26
  • As of today (May 2016), I find the first entry in the documentation here: http://thrust.github.io/doc/group__copying.html#ga3e43fb8472db501412452fa27b931ee2 really disturbing. It says that we can write thrust::copy( thrust::cuda::par.on(cudaStream), HostPtr, HostPtr+size, DevicePtr ); as valid syntax, without the copy actually being issued asynchronously on the particular stream you pass as a parameter... – Tobbey May 20 '16 at 16:32

2 Answers

13

As indicated in the comments, I don't think this will be possible directly with `thrust::copy`. However, we can use `cudaMemcpyAsync` in a thrust application to achieve the goal of asynchronous copies and overlap of copy with compute.

Here is a worked example:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>

// DSIZE determines duration of H2D and D2H transfers
#define DSIZE (1048576*8)
// SSIZE,LSIZE determine duration of kernel launched by thrust
#define SSIZE (1024*512)
#define LSIZE 1
// KSIZE determines size of thrust kernels (number of threads per block)
#define KSIZE 64
#define TV1 1
#define TV2 2

typedef int mytype;
typedef thrust::host_vector<mytype, thrust::cuda::experimental::pinned_allocator<mytype> > pinnedVector;

struct sum_functor
{
  mytype *dptr;
  sum_functor(mytype* _dptr) : dptr(_dptr) {};
  __host__ __device__ void operator()(mytype &data) const
    {
      mytype result = data;
      for (int j = 0; j < LSIZE; j++)
        for (int i = 0; i < SSIZE; i++)
          result += dptr[i];
      data = result;
    }
};

int main(){

  pinnedVector hi1(DSIZE);
  pinnedVector hi2(DSIZE);
  pinnedVector ho1(DSIZE);
  pinnedVector ho2(DSIZE);
  thrust::device_vector<mytype> di1(DSIZE);
  thrust::device_vector<mytype> di2(DSIZE);
  thrust::device_vector<mytype> do1(DSIZE);
  thrust::device_vector<mytype> do2(DSIZE);
  thrust::device_vector<mytype> dc1(KSIZE);
  thrust::device_vector<mytype> dc2(KSIZE);

  thrust::fill(hi1.begin(), hi1.end(),  TV1);
  thrust::fill(hi2.begin(), hi2.end(),  TV2);
  thrust::sequence(do1.begin(), do1.end());
  thrust::sequence(do2.begin(), do2.end());

  cudaStream_t s1, s2;
  cudaStreamCreate(&s1); cudaStreamCreate(&s2);

  // stage the input data on the device asynchronously; because the host buffers use the
  // pinned allocator, these copies can be truly asynchronous and can overlap with other work
  cudaMemcpyAsync(thrust::raw_pointer_cast(di1.data()), thrust::raw_pointer_cast(hi1.data()), di1.size()*sizeof(mytype), cudaMemcpyHostToDevice, s1);
  cudaMemcpyAsync(thrust::raw_pointer_cast(di2.data()), thrust::raw_pointer_cast(hi2.data()), di2.size()*sizeof(mytype), cudaMemcpyHostToDevice, s2);

  // launch the thrust kernels, one per stream, so they can run concurrently with each other
  // and overlap with the copies issued on the other stream
  thrust::for_each(thrust::cuda::par.on(s1), do1.begin(), do1.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di1.data())));
  thrust::for_each(thrust::cuda::par.on(s2), do2.begin(), do2.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di2.data())));

  // copy the results back to pinned host memory asynchronously, on the same streams
  cudaMemcpyAsync(thrust::raw_pointer_cast(ho1.data()), thrust::raw_pointer_cast(do1.data()), do1.size()*sizeof(mytype), cudaMemcpyDeviceToHost, s1);
  cudaMemcpyAsync(thrust::raw_pointer_cast(ho2.data()), thrust::raw_pointer_cast(do2.data()), do2.size()*sizeof(mytype), cudaMemcpyDeviceToHost, s2);

  cudaDeviceSynchronize();
  for (int i=0; i < KSIZE; i++){
    if (ho1[i] != ((LSIZE*SSIZE*TV1) + i)) { std::cout << "mismatch on stream 1 at " << i << " was: " << ho1[i] << " should be: " << ((LSIZE*SSIZE*TV1)+i) << std::endl; return 1;}
    if (ho2[i] != ((LSIZE*SSIZE*TV2) + i)) { std::cout << "mismatch on stream 2 at " << i << " was: " << ho2[i] << " should be: " << ((LSIZE*SSIZE*TV2)+i) << std::endl; return 1;}
    }
  std::cout << "Success!" << std::endl;
  return 0;
}

For my test case, I used RHEL 5.5, a Quadro 5000, and CUDA 6.5 RC. This example is designed to have thrust create very small kernels (only a single threadblock, as long as KSIZE is small, say 32 or 64), so that the kernels thrust creates from thrust::for_each are able to run concurrently.

When I profile this code, I see:

[profiler timeline: nvvp output for thrust streams application]

This indicates that we are achieving proper overlap between the two thrust kernels, between copy operations and thrust kernels, and asynchronous copying of the data back at the completion of the kernels. Note that the cudaDeviceSynchronize() operation "fills" the timeline, indicating that all the async operations (data copying, thrust functions) were issued asynchronously and control returned to the host thread before any of the operations were underway. All of this is expected, proper behavior for full concurrency between host, GPU, and data copying operations.
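If you want to confirm from the host side, rather than from the profiler, that the work really was issued asynchronously, one minimal sketch is to query the streams just before the cudaDeviceSynchronize() call in the code above; cudaStreamQuery() returns cudaErrorNotReady, without blocking, while a stream still has pending work:

  // sketch: place just before cudaDeviceSynchronize() in the example above
  if (cudaStreamQuery(s1) == cudaErrorNotReady)
    std::cout << "stream 1 still has pending work when control returns to the host" << std::endl;
  if (cudaStreamQuery(s2) == cudaErrorNotReady)
    std::cout << "stream 2 still has pending work when control returns to the host" << std::endl;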

Robert Crovella
  • is the use of `pinned_allocator` necessary to make `cudaMemcpyAsync` work correctly on a `thrust::host_vector`? What would happen if I used a standard `thrust::host_vector`? – m.s. Jun 13 '15 at 11:46
  • Standard `host_vector` uses an unpinned (i.e. not page-locked) allocator. That means that when you try to do a `cudaMemcpyAsync`, the operation will not be asynchronous. And if you are attempting to overlap that operation with something else, it will not overlap. Take a look [here](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#page-locked-host-memory) and [here](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#asynchronous-concurrent-execution) – Robert Crovella Jun 13 '15 at 13:55
  • Note that the version of thrust that shipped with CUDA 7 has an [issue](https://github.com/thrust/thrust/issues/664) that prevents proper issuing of thrust kernels to streams, in some cases. The workaround would be to 1. update the thrust on CUDA 7 to the [current development version](https://github.com/thrust/thrust) (which includes the fix for the issue), or 2. revert to CUDA 6.5 (or advance to some future CUDA toolkit version, when it becomes available.) – Robert Crovella Jun 13 '15 at 14:48
  • Was this fixed in 7.5 or does it still remain? – Bar Dec 17 '15 at 13:07
  • Yes, the thrust version that ships with CUDA 7.5 picked up the fix for this issue. – Robert Crovella Dec 17 '15 at 13:14
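For reference, and relating to the version discussion in the comments above (and the version.h check mentioned in the question), a minimal sketch for printing the thrust version you are actually compiling against uses the macros from thrust/version.h:

#include <thrust/version.h>
#include <iostream>

int main(){
  // THRUST_VERSION encodes the version as major*100000 + minor*100 + subminor
  std::cout << "thrust version: " << THRUST_MAJOR_VERSION << "."
            << THRUST_MINOR_VERSION << "." << THRUST_SUBMINOR_VERSION << std::endl;
  return 0;
}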
2

Here's a worked example using `thrust::cuda::experimental::pinned_allocator<T>`:

// Compile with:
// nvcc --std=c++11 mem_async.cu -o mem_async

#include <cuda.h>
#include <cuda_runtime.h>

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/fill.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>

#include <iostream>

#define LEN 1024

int main(int argc, char *argv[]) {
    thrust::host_vector<float, thrust::cuda::experimental::pinned_allocator<float>> h_vec(LEN);
    thrust::device_vector<float> d_vec(LEN);

    thrust::fill(d_vec.begin(), d_vec.end(), -1.0);

    // asynchronous device-to-host copy; with no stream argument it is issued on the default stream
    cudaMemcpyAsync(thrust::raw_pointer_cast(h_vec.data()),
                    thrust::raw_pointer_cast(d_vec.data()),
                    d_vec.size()*sizeof(float),
                    cudaMemcpyDeviceToHost);

    // Comment out this line to see what happens.
    cudaDeviceSynchronize();

    std::cout << h_vec[0] << std::endl;
}

Comment out the synchronize step and you will likely see 0 printed to the console, because the asynchronous transfer has not necessarily completed by the time the host reads h_vec[0].
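To tie this back to the question about streams, here is a minimal variation of the sketch above that issues the same copy on a non-default stream and then synchronizes only on that stream rather than on the whole device:

    cudaStream_t s;
    cudaStreamCreate(&s);

    // same copy as above, but explicitly issued on stream s instead of the default stream
    cudaMemcpyAsync(thrust::raw_pointer_cast(h_vec.data()),
                    thrust::raw_pointer_cast(d_vec.data()),
                    d_vec.size()*sizeof(float),
                    cudaMemcpyDeviceToHost,
                    s);

    // wait only for the work previously issued to stream s
    cudaStreamSynchronize(s);

    std::cout << h_vec[0] << std::endl;

    cudaStreamDestroy(s);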