I'm asking this here because I thought I've understood how OpenCL works but... I think there are several things I don't get.
What I want to do is to get the difference between all the values of two arrays, then calculate the hypot and finally get the maximum hypot value, so If I have:
double[] arrA = new double[]{1,2,3}
double[] arrB = new double[]{6,7,8}
Calculate
dx1 = 1 - 1; dx2 = 2 - 1; dx3 = 3 - 1, dx4= 1 - 2;... dxLast = 3 - 3
dy1 = 6 - 6; dy2 = 7 - 6; dy3 = 8 - 6, dy4= 6 - 7;... dyLast = 8 - 8
(Extreme dx and dy will get 0, but i don't care about ignoring those cases by now)
Then calculate each hypot based on hypot(dx(i), dy(i)) And once all these values where obtained, get the maximum hypot value
So, I have the next defined Kernel:
String programSource =
"#ifdef cl_khr_fp64 \n"
+ " #pragma OPENCL EXTENSION cl_khr_fp64 : enable \n"
+ "#elif defined(cl_amd_fp64) \n"
+ " #pragma OPENCL EXTENSION cl_amd_fp64 : enable \n"
+ "#else "
+ " #error Double precision floating point not supported by OpenCL implementation.\n"
+ "#endif \n"
+ "__kernel void "
+ "sampleKernel(__global const double *bufferX,"
+ " __global const double *bufferY,"
+ " __local double* scratch,"
+ " __global double* result,"
+ " __const int lengthX,"
+ " __const int lengthY){"
+ " const int index_a = get_global_id(0);"//Get the global indexes for 2D reference
+ " const int index_b = get_global_id(1);"
+ " const int local_index = get_local_id(0);"//Current thread id -> Should be the same as index_a * index_b + index_b;
+ " if (local_index < (lengthX * lengthY)) {"// Load data into local memory
+ " if(index_a < lengthX && index_b < lengthY)"
+ " {"
+ " double dx = (bufferX[index_b] - bufferX[index_a]);"
+ " double dy = (bufferY[index_b] - bufferY[index_a]);"
+ " scratch[local_index] = hypot(dx, dy);"
+ " }"
+ " } "
+ " else {"
+ " scratch[local_index] = 0;"// Infinity is the identity element for the min operation
+ " }"
//Make a Barrier to make sure all values were set into the local array
+ " barrier(CLK_LOCAL_MEM_FENCE);"
//If someone can explain to me the offset thing I'll really apreciate that...
//I just know there is alway a division by 2
+ " for(int offset = get_local_size(0) / 2; offset > 0; offset >>= 1) {"
+ " if (local_index < offset) {"
+ " float other = scratch[local_index + offset];"
+ " float mine = scratch[local_index];"
+ " scratch[local_index] = (mine > other) ? mine : other;"
+ " }"
+ " barrier(CLK_LOCAL_MEM_FENCE);"
//A barrier to make sure that all values where checked
+ " }"
+ " if (local_index == 0) {"
+ " result[get_group_id(0)] = scratch[0];"
+ " }"
+ "}";
For this case, the defined GWG size is (100, 100, 0) and a LWI size of (10, 10, 0).
So, for this example, both arrays have size 10 and the GWG and LWI are obtained as follows:
//clGetKernelWorkGroupInfo(kernel, device, CL.CL_KERNEL_WORK_GROUP_SIZE, Sizeof.size_t, Pointer.to(buffer), null);
long kernel_work_group_size = OpenClUtil.getKernelWorkGroupSize(kernel, device.getCl_device_id(), 3);
//clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, Sizeof.size_t * numValues, Pointer.to(buffer), null);
long[] maxSize = device.getMaximumSizes();
maxSize[0] = ( kernel_work_group_size > maxSize[0] ? maxSize[0] : kernel_work_group_size);
maxSize[1] = ( kernel_work_group_size > maxSize[1] ? maxSize[1] : kernel_work_group_size);
maxSize[2] = ( kernel_work_group_size > maxSize[2] ? maxSize[2] : kernel_work_group_size);
// maxSize[2] =
long xMaxSize = (x > maxSize[0] ? maxSize[0] : x);
long yMaxSize = (y > maxSize[1] ? maxSize[1] : y);
long zMaxSize = (z > maxSize[2] ? maxSize[2] : z);
long local_work_size[] = new long[] { xMaxSize, yMaxSize, zMaxSize };
int numWorkGroupsX = 0;
int numWorkGroupsY = 0;
int numWorkGroupsZ = 0;
if(local_work_size[0] != 0)
numWorkGroupsX = (int) ((total + local_work_size[0] - 1) / local_work_size[0]);
if(local_work_size[1] != 0)
numWorkGroupsY = (int) ((total + local_work_size[1] - 1) / local_work_size[1]);
if(local_work_size[2] != 0)
numWorkGroupsZ = (int) ((total + local_work_size[2] - 1) / local_work_size[2]);
long global_work_size[] = new long[] { numWorkGroupsX * local_work_size[0],
numWorkGroupsY * local_work_size[1], numWorkGroupsZ * local_work_size[2]};
The thing is I'm not getting the espected values so I decided to make some tests based on a smaller kernel and changing the [VARIABLE TO TEST VALUES] object returned in a result array:
/**
* The source code of the OpenCL program to execute
*/
private static String programSourceA =
"#ifdef cl_khr_fp64 \n"
+ " #pragma OPENCL EXTENSION cl_khr_fp64 : enable \n"
+ "#elif defined(cl_amd_fp64) \n"
+ " #pragma OPENCL EXTENSION cl_amd_fp64 : enable \n"
+ "#else "
+ " #error Double precision floating point not supported by OpenCL implementation.\n"
+ "#endif \n"
+ "__kernel void "
+ "sampleKernel(__global const double *bufferX,"
+ " __global const double *bufferY,"
+ " __local double* scratch,"
+ " __global double* result,"
+ " __const int lengthX,"
+ " __const int lengthY){"
//Get the global indexes for 2D reference
+ " const int index_a = get_global_id(0);"
+ " const int index_b = get_global_id(1);"
//Current thread id -> Should be the same as index_a * index_b + index_b;
+ " const int local_index = get_local_id(0);"
// Load data into local memory
//Only print values if index_a < ArrayA length
//Only print values if index_b < ArrayB length
//Only print values if local_index < (lengthX * lengthY)
//Only print values if this is the first work group.
+ " if (local_index < (lengthX * lengthY)) {"
+ " if(index_a < lengthX && index_b < lengthY)"
+ " {"
+ " double dx = (bufferX[index_b] - bufferX[index_a]);"
+ " double dy = (bufferY[index_b] - bufferY[index_a]);"
+ " result[local_index] = hypot(dx, dy);"
+ " }"
+ " } "
+ " else {"
// Infinity is the identity element for the min operation
+ " result[local_index] = 0;"
+ " }"
The returned values are far of being the espected but, if the [VARIABLE TO TEST VALUES] is (index_a * index_b) + index_a, almost each value of the returned array has the correct (index_a * index_b) + index_a value, i mean:
result[0] -> 0
result[1] -> 1
result[2] -> 2
....
result[97] -> 97
result[98] -> 98
result[99] -> 99
but some values are: -3.350700319577517E-308....
What I'm not doing correctly???
I hope this is well explained and not that big to make you angry with me....
Thank you so much!!!!!
TomRacer