While learning SYCL/DPC++, I wrote a GPU-enabled dot product program in SYCL (full code on GitHub).
#include <iostream>
#include <sstream>
#include <cmath>
#include <CL/sycl.hpp>

int main(int argc, char* argv[]) {
    unsigned long N;
    /* snip */

    // Select GPU device and create queue
    sycl::device mygpu { sycl::gpu_selector_v };
    sycl::queue q(mygpu);
    /* snip */
    // Allocate vectors in device memory (USM) on the GPU
    double* vecA = sycl::malloc_device<double>(N, q);
    double* vecB = sycl::malloc_device<double>(N, q);

    // Initialize vectors on GPU using SYCL lambda kernels
    q.submit([&](sycl::handler& h) {
        h.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) {
            vecA[i] = (double)i;
        });
    });
    q.submit([&](sycl::handler& h) {
        h.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) {
            vecB[i] = 2.0 * (double)i;
        });
    });

    // Allocate result as shared memory and attach buffer
    double* result = sycl::malloc_shared<double>(1, q);
    sycl::buffer bufres = sycl::buffer<double>(result, 1);

    // Initialize result to 0
    q.submit([&](sycl::handler& h) {
        sycl::accessor res(bufres, h, sycl::write_only);
        h.single_task([=]() {
            res[0] = 0.0;
        });
    });

    // Manual synchronization
    q.wait();
    // Perform dot product using a SYCL lambda kernel and the oneAPI built-in reduction
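    // The sycl::reduction object is bound to bufres: partial sums from all
    // work-items are combined with sycl::plus and folded into the existing
    // value in the buffer once the kernel finishes (hence the zeroing above)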
    q.submit([&](sycl::handler& h) {
        auto red = sycl::reduction(bufres, h, sycl::plus<>());
        h.parallel_for(sycl::range<1>(N), red, [=](sycl::id<1> i, auto& tmp) {
            double prod = vecA[i] * vecB[i];
            tmp += prod;
        });
    });

    // Transfer result to host and synchronize
    // Note: host_accessor is blocking (thus can be used to synchronize)
    sycl::host_accessor res(bufres, sycl::read_only);

    // Check value (using relative error) and print to stdout
    double tol = 1.0e-10;
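    // Analytic value: sum_{i=0}^{N-1} i * (2i) = 2 * (N-1) N (2N-1) / 6
    //                                          = N (N-1) (2N-1) / 3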
    double check = (double)N * ((double)N - 1.0) * (2.0*(double)N - 1.0) / 3.0;
    if (std::fabs(res[0]/check - 1.0) > tol) {
        std::cout << "Error! Result = " << res[0]
                  << " when it should be " << check << std::endl;
    } else {
        std::cout << "Success! Result = " << res[0] << std::endl;
    }

    // Clean up
    sycl::free(vecA, q);
    sycl::free(vecB, q);
    sycl::free(result, q);
    return 0;
}
Now, for some reason, when I pass in a vector length of N = 1 billion, the result is wrong:
$ ./ddot.icpx.x 1000000000
Using device Intel(R) Iris(R) Pro Graphics P580 [0x193a]
Error! Result = 5.63507e+26 when it should be 6.66667e+26
The GPU in question is the Iris Pro Graphics P580 integrated into an Intel Xeon E3-1585L v5 (Skylake) processor.
If it helps, here is the compiler version string:
$ icpx --version
Intel(R) oneAPI DPC++/C++ Compiler 2023.0.0 (2023.0.0.20221201)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /opt/intel/oneapi/compiler/2023.0.0/linux/bin-llvm
Configuration file: /opt/intel/oneapi/compiler/2023.0.0/linux/bin-llvm/../bin/icpx.cfg
Am I hitting an "invisible" hardware limit on the maximum size of sycl::range<1>(N)
for this device? If so, which sycl::info::device descriptor should I query
so that I can add a check in my code and cap N at that value?
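For reference, the kind of guard I have in mind is sketched below. It is only a guess at which descriptors matter here (max_work_group_size, max_work_item_sizes, max_mem_alloc_size, global_mem_size); figuring out the right one is exactly what I am asking:

#include <iostream>
#include <CL/sycl.hpp>

int main() {
    sycl::queue q{ sycl::gpu_selector_v };
    sycl::device dev = q.get_device();

    // Candidate limits reported by the device
    // (not sure which of these, if any, actually bounds sycl::range<1>(N))
    auto max_wg    = dev.get_info<sycl::info::device::max_work_group_size>();
    auto max_wi    = dev.get_info<sycl::info::device::max_work_item_sizes<3>>();
    auto max_alloc = dev.get_info<sycl::info::device::max_mem_alloc_size>();
    auto glob_mem  = dev.get_info<sycl::info::device::global_mem_size>();

    std::cout << "max_work_group_size : " << max_wg    << "\n"
              << "max_work_item_sizes : " << max_wi[0] << "\n"
              << "max_mem_alloc_size  : " << max_alloc << " bytes\n"
              << "global_mem_size     : " << glob_mem  << " bytes\n";

    // e.g. cap N so that one vector of doubles fits in a single allocation
    unsigned long N_cap = max_alloc / sizeof(double);
    std::cout << "cap on N (per vector): " << N_cap << "\n";
    return 0;
}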