I'm delving into OpenCL by making a Matrix dot product implementation. I'm having a problem with getting my kernels to return the same values as my host.
I have made an encapsulation function that allocates device memory, sets parameters to a kernel, runs the kernel and returns the result back to the host.
/* This function runs the matrix dot product on whatever OpenCL device
* you specify
*/
cl_int OpenCL_MatrixMul(cl_device_id * device, cl_context * context,
cl_command_queue * commandQueue, cl_kernel * matrixMulKernel, float * A_h,
float * B_h, float * C_h, const cl_uint HeightA, const cl_uint WidthB,
const cl_uint WidthAHeightB)
{
printf("Inside matrix mul, WidthA: %zu, WidthB: %zu, WidthAHeightB: %zu\n",
HeightA, WidthB, WidthAHeightB);
//this error variable will record any errors found and will be returned
//by this function
cl_int error = CL_SUCCESS;
cl_int clEnqueueReadBuffer_error;
//declare a place for the memory on the device, A is the A matrix,
//B is the B matrix, C is the C result matrix
cl_mem A_d, B_d, C_d;
//this is a temporary value for holding the maximum work group size
size_t maximum_local_ws;
//variable for holding the number of work items per group
size_t local_ws[2];
//variable for holding the number of work items
size_t global_ws[2];
//calcuate work group and local size
//get the maximum work group size for the kernel, i.e. set local_ws
clGetKernelWorkGroupInfo((* matrixMulKernel), (* device),
CL_KERNEL_WORK_GROUP_SIZE, sizeof(maximum_local_ws),
&maximum_local_ws, NULL);
//find the largest integer, power of 2, square root, for maximum_local_ws
//that is less than or equal to 16
for(size_t i = 1; (i * i) <= maximum_local_ws && i <= maxBlockSize; i *= 2)
{
local_ws[0] = i;
local_ws[1] = i;
}
//calculate global work size
global_ws[0] = WidthB;
global_ws[1] = HeightA;
printf("Work group size calculated.\n");
//Allocate global memory on the device
//put A on the device
A_d = clCreateBuffer ((* context), CL_MEM_COPY_HOST_PTR,
(WidthAHeightB * HeightA * sizeof(float)), A_h, &error);
//put B on the device
B_d = clCreateBuffer ((* context), CL_MEM_COPY_HOST_PTR,
(WidthB * WidthAHeightB * sizeof(float)), B_h, &error);
//create a space for C on the device
C_d = clCreateBuffer ((* context), CL_MEM_READ_WRITE,
(HeightA * WidthB * sizeof(float)), NULL, &error);
printf("Global memory allocated.\n");
if(error == CL_SUCCESS)
{
//set the prarameters of the kernels
//Put in A
error = clSetKernelArg((* matrixMulKernel), 0, sizeof(cl_mem), &A_d);
//Put in B
error |= clSetKernelArg((* matrixMulKernel), 1, sizeof(cl_mem), &B_d);
//Put in C
error |= clSetKernelArg((* matrixMulKernel), 2, sizeof(cl_mem), &C_d);
//Put in HeightA
error |= clSetKernelArg((* matrixMulKernel), 3, sizeof(cl_uint),
&HeightA);
//Put in WidthB
error |= clSetKernelArg((* matrixMulKernel), 4, sizeof(cl_uint),
&WidthB);
//Put in WidthAHeightB
error |= clSetKernelArg((* matrixMulKernel), 5, sizeof(cl_uint),
&WidthAHeightB);
printf("Parameters added to the kernel.\n");
if(error == CL_SUCCESS)
{
//execute the kernel
printf("Running Kernel, Local work size: %zu x %zu global worksize:
%zu x %zu, HeightA: %zu, WidthB: %zu, WidthAHeightB: %zu\n",
local_ws[0], local_ws[1], global_ws[0], global_ws[1],
HeightA, WidthB, WidthAHeightB);
error = clEnqueueNDRangeKernel((* commandQueue),
(* matrixMulKernel), 1, NULL, global_ws, local_ws, 0, NULL,
NULL);
printf("Kernel Ran.\n");
if(error == CL_SUCCESS)
{
printf("Kernel Launched Successfully\n");
}
else
{
printf("Kernel Not Launched\n");
}
}
}
else
{
printf("Parameters not added to the kernel.\n");
}
printf("Reading results back from device\n");
//read the result back to the host system, (copy C_h to C_d)
clEnqueueReadBuffer_error = clEnqueueReadBuffer((* commandQueue), C_d,
CL_TRUE, 0, HeightA * WidthB * sizeof(float), C_h, 0, NULL, NULL);
//make sure we don't write over previous errors, if
//clEnqueueReadBuffer_error has an error
if(error == CL_SUCCESS)
{
error = clEnqueueReadBuffer_error;
}
printf("Freeing device memory\n");
//Free global memory on the device
clReleaseMemObject(A_d);
clReleaseMemObject(B_d);
clReleaseMemObject(C_d);
return error;
}
This code, when run, it outputs something strange:
Inside matrix mul, WidthA: 16, WidthB: 16, WidthAHeightB: 16
Work group size calculated.
Global memory allocated.
Parameters added to the kernel.
Running Kernel, Local work size: 1 x 1 global worksize: 16 x 16, HeightA: 16, WidthB: 140733193388048, WidthAHeightB: 16
Kernel Ran.
Kernel Launched Successfully
Reading results back from device
Freeing device memory
For some reason, widthB changed its value from 16 to 140733193388048. The strange thing is, widthB is different, yet WidthA and WidthAHeightB, despite being used the same way, remain the same. Furthermore, the value 140733193388048 remains unusually deterministic throughout all the calls I give to it.
Consequently, the first row, of the matrix, that my device returns, is the same as the host, but the subsequent values are not.
I'm programming on Mac OS X using Apple's OpenCL implementation in Snow Leopard.
What is going on here, and how do you keep something like this from happening?