I have written a program for computing the matrix product on the GPU.
My problem is that the Catalyst drivers crash for large matrices. I am aware of the potential problem caused by Windows timeout detection (TDR) for long-running computations, but my computations are fairly quick, so I don't think that is the issue here.
I have, in essence, three different iterations of the code. The first is just a naive implementation of matrix multiplication that uses the built-in functionality in OpenCL to determine the work group size (WGS). This version works fine, but is inefficient, I guess due to bandwidth limitations.
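For reference, the naive kernel is just the standard one-work-item-per-element-of-C formulation. The sketch below is a reconstruction rather than the exact code I run (the name matMultNaive and the const qualifiers are just for illustration); the point is that I pass NULL as the local work size so the runtime determines the WGS itself.

Naive kernel (sketch):

__kernel void matMultNaive(__global const float* A,
                           __global const float* B,
                           __global float* C,
                           int m, int p)
{
    // One work-item per element of C; A is n x m, B is m x p, C is n x p, all row-major.
    int col = get_global_id(0);   // column of C, 0 .. p-1
    int row = get_global_id(1);   // row of C, 0 .. n-1

    float value = 0.0f;
    for (int k = 0; k < m; k++)
        value += A[row * m + k] * B[k * p + col];

    C[row * p + col] = value;
}

Enqueued roughly like this, with the local work size left to the runtime:

const size_t global_work_size[2] = {p, n};   // one work-item per element of C
clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_work_size,
                       NULL, 0, NULL, NULL);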
The second version specifies the WGS manually. The work groups in this case are thin slivers, {256,1,1} in size (so that I can multiply matrices whose dimensions are prime numbers). The WGS is a multiple of the CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE returned by clGetKernelWorkGroupInfo(). This version crashes when using matrices larger than roughly 4000 x 4000. It is also significantly slower than the first version!
The third version is like the second, except that it uses local memory. This version crashes for matrices larger than about 2000 x 2000. It is by far the fastest of the three for the matrix sizes at which it actually works.
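To make the launch configuration for the second and third versions concrete, here is a small stand-alone walk-through of the WGS selection loops from the host code further down, for a prime matrix dimension right around the crash threshold. The device values it assumes (preferred work group size multiple 64, maximum WGS 256) are what I expect the R9 290 to report, so treat them as assumptions rather than measured output:

#include <stdio.h>

/* Stand-alone walk-through of the WGS selection loops from main() below,
   using assumed device limits (preferred multiple 64, max WGS 256). */
int main(void)
{
    const unsigned int pref_workg_size_mult = 64;   /* assumed, not queried */
    const unsigned int max_workg_size = 256;        /* assumed, not queried */
    const int n = 4001, p = 4001;                   /* prime dimension near the crash threshold */

    int k = 1, s = 1;
    while (k * pref_workg_size_mult < n && k * pref_workg_size_mult < max_workg_size)
        k++;                                        /* stops at k = 4 -> 256 work-items per group */
    while (k * s * pref_workg_size_mult < n)
        s++;                                        /* stops at s = 16 -> 4096 >= 4001 */

    /* Prints local = {256,1,1} and global = {4096,1,4001} */
    printf("local  = {%u,1,1}\n", k * pref_workg_size_mult);
    printf("global = {%u,1,%d}\n", k * s * pref_workg_size_mult, p);
    return 0;
}

So at the sizes where the second version starts to crash, the kernel is being launched over roughly 4096 * 4001, i.e. about 16.4 million work-items, in groups of {256,1,1}.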
I am on Windows 8.1 using MinGW64 with gcc (can check version if needed, don't remember). I use an AMD R9 290 with CCC 14.9 drivers.
Useful links
Khronos OpenCL 1.2 Reference pages
My code is roughly based on the code and write-ups on this blog.
Kernel (local memory version):
__kernel void matMult(__global float* A,
                      __global float* B,
                      __global float* C,
                      int m, int p)
{
    int a, b, k, group_idx, group_idz, tx, ty;
    const int wgsize = get_local_size(0);
    float value;

    group_idx = get_group_id(0);
    group_idz = get_group_id(2);
    tx = get_local_id(0);
    ty = get_local_id(2);

    if (tx >= p) {
        //printf("Thread %d exiting ...\n", tx);
        return;
    }

    // Index of the first sub-matrix of A processed by the block
    int aBegin = m * wgsize * group_idz;
    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + m - 1;
    // Step size used to iterate through the sub-matrices of A
    int aStep = wgsize;
    // Index of the first sub-matrix of B processed by the block
    int bBegin = wgsize * group_idx;
    // Step size used to iterate through the sub-matrices of B
    int bStep = wgsize * p;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (a = aBegin, b = bBegin;
         a <= aEnd;
         a += aStep, b += bStep)
    {
        // Declaration of the local memory array As
        // used to store the sub-matrix of A
        __local float As[256][1];
        // Declaration of the local memory array Bs
        // used to store the sub-matrix of B
        __local float Bs[256][1];

        // Load the matrices from global memory to local memory;
        // each thread loads one element of each matrix
        As[ty][tx] = A[a + m * ty + tx];
        Bs[ty][tx] = B[b + p * ty + tx];

        // Synchronize to make sure the matrices are loaded
        barrier(CLK_LOCAL_MEM_FENCE);

        // Multiply the two matrices together; each thread
        // computes one element of the block sub-matrix
        for (k = 0; k < wgsize; k++)
            value += As[ty][k] * Bs[k][tx];

        // Synchronize to make sure that the preceding computation is done
        // before loading two new sub-matrices of A and B in the next iteration
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    //printf("value: %f\n", value);
    int c = p * wgsize * group_idz + wgsize * group_idx;
    C[c + p * ty + tx] = value;
}
Host code:
int main(int argc, const char * argv[])
{
    ...

    //Allocate memory for and generate test data on host for matrix multiplication
    float* hostA = allocMatrix(n, m);
    float* hostB = allocMatrix(m, p);

    //Allocate results array on host
    float* hostC = (float *)malloc(sizeof(float) * p * n);

    //Setup the objects OpenCL needs in order to function
    if(SetupCL(&context, properties, &kernel, &command_queue, &program,
               &platform_id, &device_id, suppressoutp, usecpu)) {
        printf("Failed to setup OpenCL\n");
        return -1;
    }

    //10. Allocate memory on device
    cl_mem devA = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                 sizeof(cl_float) * m * n, NULL, &err);
    cl_mem devB = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                 sizeof(cl_float) * p * m, NULL, &err);
    cl_mem devC = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                 sizeof(cl_float) * p * n, NULL, &err);

    //Load data into the input buffers
    clEnqueueWriteBuffer(command_queue, devA, CL_TRUE, 0,
                         sizeof(float) * m * n, hostA, 0, NULL, NULL);
    clEnqueueWriteBuffer(command_queue, devB, CL_TRUE, 0,
                         sizeof(float) * m * p, hostB, 0, NULL, NULL);

    //11. Set the argument list for the kernel command
    int wa = m;
    int wb = p;
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &devA);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &devB);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &devC);
    clSetKernelArg(kernel, 3, sizeof(int), &wa);
    clSetKernelArg(kernel, 4, sizeof(int), &wb);

    //Fetch information about compute device
    unsigned int pref_workg_size_mult;
    const unsigned int max_workg_size;
    const unsigned int max_workit_sizes[3];
    clGetKernelWorkGroupInfo(kernel, device_id,
                             CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
                             sizeof(size_t), (void*) &pref_workg_size_mult, NULL);
    clGetDeviceInfo(device_id,
                    CL_DEVICE_MAX_WORK_GROUP_SIZE,
                    sizeof(size_t), (void*) &max_workg_size, NULL);
    clGetDeviceInfo(device_id,
                    CL_DEVICE_MAX_WORK_ITEM_SIZES,
                    sizeof(size_t) * 3, (void*) max_workit_sizes, NULL);

    //Determine work group size
    int k = 1, s = 1;
    if (pref_workg_size_mult == 0)
        pref_workg_size_mult = 1;
    while(k * pref_workg_size_mult < n && k * pref_workg_size_mult < max_workg_size)
        k++;
    while(k * s * pref_workg_size_mult < n)
        s++;
    const size_t work_group_size[3] = {k * pref_workg_size_mult, 1, 1};
    const size_t global_work_size[3] = {k * s * pref_workg_size_mult, 1, p};

    //12. Enqueue the kernel command for execution
    cl_event event0;
    cl_int enqueue_error = clEnqueueNDRangeKernel(command_queue, kernel, 3, NULL,
                                                  global_work_size, work_group_size,
                                                  0, NULL, &event0);
    if (enqueue_error != CL_SUCCESS)
    {
        printf("Kernel launch failed, error %d\n", enqueue_error);
        return enqueue_error;
    }
    clWaitForEvents(1, &event0);

    //Call profiling function to obtain starting and ending times of kernel execution
    clGetEventProfilingInfo(event0, CL_PROFILING_COMMAND_START,
                            sizeof(cl_ulong), &start, NULL);
    clGetEventProfilingInfo(event0, CL_PROFILING_COMMAND_END,
                            sizeof(cl_ulong), &end, NULL);
    duration = end - start;

    //13. Copy the results out of the output buffer
    clEnqueueReadBuffer(command_queue, devC, CL_TRUE, 0,
                        sizeof(float) * p * n, hostC, 0, NULL, NULL);

    //14. Cleanup - release OpenCL resources
    clReleaseMemObject(devA);
    clReleaseMemObject(devB);
    clReleaseMemObject(devC);
    clReleaseEvent(event0);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);

    //Release host memory
    free(hostA);
    free(hostB);
    free(hostC);
    return 0;
}