1

Consider this example where I wait for completion with gpuQueue.wait():

  // Example 1: read `out` on the host after gpuQueue.wait(), while the buffers are still alive.
  // Number of elements in each host vector and in each buffer.
  constexpr unsigned dataSize = 1024;
  
  // Host data: in1 = 0, 1, 2, ...; in2 = 25, 26, 27, ...; out starts zero-filled.
  std::vector<float> in1 (dataSize, 0); std::iota(in1.begin(), in1.end(), 0 );
  std::vector<float> in2 (dataSize, 0); std::iota(in2.begin(), in2.end(), 25);
  std::vector<float> out (dataSize, 0);

  // Constant added to every product in the kernel.
  float coeff1 = 4;

  // Buffers constructed from the host vectors' data pointers. They are declared in the
  // same scope as the final print loop, so they are still alive when `out` is read.
  cl::sycl::buffer<float, 1> bufIn1(in1.data(), dataSize);
  cl::sycl::buffer<float, 1> bufIn2(in2.data(), dataSize);
  cl::sycl::buffer<float, 1> bufOut(out.data(), dataSize);

  // Queue created with the GPU selector; one command group is submitted to it.
  cl::sycl::queue gpuQueue{cl::sycl::gpu_selector()};
  gpuQueue.submit([&](cl::sycl::handler &cgh) {
    // Accessors: read mode for both inputs, write mode for the output.
    // Note: the names in1/in2 shadow the host vectors inside this lambda.
    auto in1  = bufIn1.get_access<cl::sycl::access::mode::read> (cgh);
    auto in2  = bufIn2.get_access<cl::sycl::access::mode::read> (cgh);
    auto temp = bufOut.get_access<cl::sycl::access::mode::write>(cgh);

    // Kernel body: captures the accessors and coeff1 by value; one work-item per element.
    auto kernel = [=](cl::sycl::id<1> id) {
      temp[id] = in1[id] * in2[id] + coeff1;
    };
    // floatIteration is the kernel name type; its declaration is not shown in this snippet.
    cgh.parallel_for<floatIteration>(cl::sycl::range<1>(dataSize), kernel);

  });

  // Waits on the queue. NOTE: the question reports that `out` still reads 0 after this
  // call, i.e. the wait alone did not make the results visible in the host vector.
  gpuQueue.wait();
  
  // Print the value found in `out` next to the value recomputed on the host.
  for( unsigned i = 0; i < dataSize; ++i )
    std::cout << out[i] <<  "  " << (in1[i] * in2[i] + coeff1) << "\n";

And this example where the buffers are in a nested scope and there is no wait:

  // Example 2: buffers and queue live in an inner scope; `out` is read only after it closes.
  // Number of elements in each host vector and in each buffer.
  constexpr unsigned dataSize = 1024;
  
  // Host data: in1 = 0, 1, 2, ...; in2 = 25, 26, 27, ...; out starts zero-filled.
  std::vector<float> in1 (dataSize, 0); std::iota(in1.begin(), in1.end(), 0 );
  std::vector<float> in2 (dataSize, 0); std::iota(in2.begin(), in2.end(), 25);
  std::vector<float> out (dataSize, 0);

  // Constant added to every product in the kernel.
  float coeff1 = 4;
  {
    // Buffers constructed from the host vectors' data pointers; they go out of scope
    // at the closing brace below, before `out` is read.
    cl::sycl::buffer<float, 1> bufIn1(in1.data(), dataSize);
    cl::sycl::buffer<float, 1> bufIn2(in2.data(), dataSize);
    cl::sycl::buffer<float, 1> bufOut(out.data(), dataSize);

    // Queue created with the GPU selector; no explicit wait() is called on it.
    cl::sycl::queue gpuQueue{cl::sycl::gpu_selector()};
    gpuQueue.submit([&](cl::sycl::handler &cgh) {
        // Accessors: read mode for both inputs, write mode for the output.
        // Note: the names in1/in2 shadow the host vectors inside this lambda.
        auto in1 = bufIn1.get_access<cl::sycl::access::mode::read>(cgh);
        auto in2 = bufIn2.get_access<cl::sycl::access::mode::read>(cgh);
        auto temp = bufOut.get_access<cl::sycl::access::mode::write>(cgh);

        // Kernel body: captures the accessors and coeff1 by value; one work-item per element.
        auto kernel = [=](cl::sycl::id<1> id) {
            temp[id] = in1[id] * in2[id] + coeff1;
        };
        // floatIteration is the kernel name type; its declaration is not shown in this snippet.
        cgh.parallel_for<floatIteration>(cl::sycl::range<1>(dataSize), kernel);

    });

  // End of the buffer scope. The question reports that `out` holds the computed
  // values when read after this point.
  }
  // Print the value found in `out` next to the value recomputed on the host.
  for( unsigned i = 0; i < dataSize; ++i )
    std::cout << out[i] <<  "  " << (in1[i] * in2[i] + coeff1) << "\n";

The output of both is:

queue.wait()    nested scope
0  4            4  4
0  30           30  30
0  58           58  58

Why doesn't queue.wait() wait on the copying of the data in the buffers back to the host?

iwans
  • 445
  • 3
  • 13

1 Answer

3

The queue.wait() call waits for all queue operations to complete; in this case, that is the execution of the command group containing the floatIteration kernel. However, what you really need is to see the side effects of the operation on the host side. In SYCL, when you write via a buffer object, you are not necessarily using the same host memory you used as input (e.g. in1.data()). The data is only guaranteed to be updated on the host when the buffer scope ends, or when there is a command group that requires the side effects to be visible on the host (e.g., an explicit copy back to the host).

The second code is correct: you only read the data on the host after the copies are completed. The first one would need an explicit copy operation before the wait to be correct.

Ruyk
  • 775
  • 5
  • 11