I need to allocate as many struct Thing objects as the GPGPU memory allows and invoke the kernel once for every struct Thing.
OpenCL does not allow allocating all of CL_DEVICE_GLOBAL_MEM_SIZE at once - a single allocation can be at most CL_DEVICE_MAX_MEM_ALLOC_SIZE. The latter is typically a quarter of the total memory, so I decided to create 4 buffers.
Also, you cannot use pointers to pointers, either inside OpenCL kernels or when passing arguments to kernels from the host, so you can't pass an array of buffers to the kernel (since every buffer is a pointer to the first struct Thing in an array).
So far my kernel code is like this:
/*
 * Processes one struct Thing per work-item, where the logical array of Things
 * is split across four separate global buffers (t1..t4).
 *
 * The global id along dimension 0 is treated as an index into the
 * concatenation t1 ++ t2 ++ t3 ++ t4. The kernel walks the buffers in order,
 * subtracting each buffer's element count until the index falls inside one.
 *
 * Parameters:
 *   tNCount - number of struct Thing elements stored in buffer tN.
 *   tN      - pointer to the first struct Thing of buffer N (global memory).
 *
 * Work-items whose global id is at or beyond t1Count + t2Count + t3Count +
 * t4Count return without touching any buffer.
 */
kernel void workWithThings(
    const ulong t1Count,
    global struct Thing *t1,
    const ulong t2Count,
    global struct Thing *t2,
    const ulong t3Count,
    global struct Thing *t3,
    const ulong t4Count,
    global struct Thing *t4
)
{
    ulong gid = get_global_id(0);

    /* The pointer variable itself is private; what it points to is global. */
    global struct Thing *t;

    /* Valid indices of buffer N are [0, tNCount), hence '<' rather than '<='. */
    if (gid < t1Count) {
        t = &t1[gid];
    } else {
        gid -= t1Count;
        if (gid < t2Count) {
            t = &t2[gid];
        } else {
            gid -= t2Count;
            if (gid < t3Count) {
                t = &t3[gid];
            } else {
                gid -= t3Count;
                if (gid >= t4Count) {
                    /* Index lies past the last Thing: nothing to do. */
                    return;
                }
                t = &t4[gid];
            }
        }
    }

    //do the actual work:
    //t->...
}
Is this really the only way to do it? I feel very stupid writing code like this. Please help.