I am writing a function to add numbers of arbitrary length on GPU using OpenCL and Boost::compute as frontend.
No error occurs when the numbers are not so big (4096 bits):
Bits length: 4096
Limbs Size: 65
Result: [03, ffffffffffffffff, ffffffffffffffff, ... , ffffffffffffffff, ffffffffffffffff, fffffffffffffffe]
Expected: [03, ffffffffffffffff, ffffffffffffffff, ... , ffffffffffffffff, ffffffffffffffff, fffffffffffffffe]
But at bigger size of numbers (40960 bits and more) the error occurs in arbitrary digits of the result:
Bits length: 40960
Limbs Size: 641
ERROR limb index: 638
ERROR: [... , fffffffffffffffe, ffffffffffffffff, fffffffffffffffe, ...]
EXPECTED: [... , ffffffffffffffff, ffffffffffffffff, ffffffffffffffff, ...]
...
Bits length: 40960
Limbs Size: 641
ERROR limb index: 626
ERROR: [... , fffffffffffffffe, ffffffffffffffff, ffffffffffffffff, ...]
EXPECTED: [... , ffffffffffffffff, ffffffffffffffff, ffffffffffffffff, ...]
It seems I didn't foresee some kind of thread synchronization issue.
But which one?
Kernel function code:
#include <boost/compute/utility/source.hpp>
const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE (
__kernel void
vaddc64(__global ulong * const a, const __global ulong *b, const uint size, __global ulong * const error)
{
__private size_t gid =get_global_id(0);
__private bool prev_overflow;
a[gid] +=b[gid];
barrier(CLK_GLOBAL_MEM_FENCE);
prev_overflow = gid ? (a[gid - 1] < b[gid - 1]) : false;
barrier(CLK_GLOBAL_MEM_FENCE);
if(prev_overflow)
++a[gid];
//barrier(CLK_GLOBAL_MEM_FENCE);
}
);
Host program code:
#define BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION
#include <boost/compute.hpp>
#include <boost/preprocessor.hpp>
#include "kernel.cl"
namespace bc = boost::compute;
bc::device *gpu;
bc::context *ctx;
bc::command_queue *queue;
bc::program *program;
bc::kernel *vaddc64;
typedef uint64_t LIMB_TYPE;
//----------------------------------------------------------------------
int
main(int argc, char *argv[])
{
ulong BITS_LENGTH =40960;
size_t LIMBS_SIZE =(BITS_LENGTH / (sizeof(LIMB_TYPE) * 8)) + 1;
ulong error_code;
std::printf("Bits length: %lu\n", BITS_LENGTH);
std::printf("Limbs size: %lu\n", LIMBS_SIZE);
//......................................... Host init
LIMB_TYPE a[LIMBS_SIZE], b[LIMBS_SIZE], r[LIMBS_SIZE];
/*** init a[] ***/
for(auto c =0; c < (LIMBS_SIZE - 1); ++c)
a[c] =(LIMB_TYPE)-1; // a[] =0xffff...
a[LIMBS_SIZE - 1] =1; // last (most significant) limb =1
std::copy_n(a, LIMBS_SIZE, b); // b[] =a[]
/*** calculation of expected values ***/
for(auto c =0, carry =0; c < LIMBS_SIZE; ++c)
{
r[c] =a[c] + b[c] + carry;
if((r[c] - carry) < b[c] || (carry & !r[c]))
carry =1;
else
carry =0;
}
//......................................... Boost Compute init
gpu = new bc::device(bc::system::default_device());
ctx = new bc::context(*gpu);
queue = new bc::command_queue(*ctx, *gpu);
program = new bc::program(bc::program::create_with_source(source, *ctx));
program->build();
vaddc64 = new bc::kernel(*program, "vaddc64");
//......................................... GPU init
bc::vector<LIMB_TYPE> gpu_a(LIMBS_SIZE, (LIMB_TYPE)0, *queue);
bc::vector<LIMB_TYPE> gpu_b(LIMBS_SIZE, (LIMB_TYPE)0, *queue);
bc::detail::scalar<ulong> gpu_error(*ctx);
bc::copy(a, a + LIMBS_SIZE, gpu_a.begin(), *queue);
bc::copy(b, b + LIMBS_SIZE, gpu_b.begin(), *queue);
/*** GPU kernel call ***/
vaddc64->set_arg(0, gpu_a);
vaddc64->set_arg(1, gpu_b);
vaddc64->set_arg(2, LIMBS_SIZE);
vaddc64->set_arg(3, gpu_error.get_buffer());
queue->enqueue_1d_range_kernel(*vaddc64, 0, LIMBS_SIZE, 0);
//......................................... get GPU results
error_code =gpu_error.read(*queue);
bc::copy(gpu_a.begin(), gpu_a.end(), a, *queue);
//......................................... Check results
auto err_idx =LIMBS_SIZE;
while(--err_idx != (-1UL))
if(a[err_idx] ^ r[err_idx])
break;
if(err_idx == (-1UL))
{
std::printf("Result: [%02lx, %02lx, %02lx, ... , %02lx, %02lx, %02lx]\n", a[LIMBS_SIZE - 1], a[LIMBS_SIZE - 2], a[LIMBS_SIZE - 3], a[2], a[1], a[0]);
std::printf("Expected: [%02lx, %02lx, %02lx, ... , %02lx, %02lx, %02lx]\n", r[LIMBS_SIZE - 1], r[LIMBS_SIZE - 2], r[LIMBS_SIZE - 3], r[2], r[1], r[0]);
}
else
{
std::printf("ERROR limb index: %lu\n", err_idx);
std::printf("ERROR: [... , %02lx, %02lx, %02lx, ...]\n", a[err_idx], a[err_idx - 1], a[err_idx - 2]);
std::printf("EXPECTED: [... , %02lx, %02lx, %02lx, ...]\n", r[err_idx], r[err_idx - 1], r[err_idx - 2]);
}
//......................................... cleanup
delete vaddc64;
delete program;
delete queue;
delete ctx;
delete gpu;
}
I assumed that using the barrier( ... ) function to synchronize threads is enough. Or is the problem in something else?