1

I have written a program for computing the matrix product on the GPU.

My problem is that for large matrices the Catalyst-drivers crash. I am aware of the potential problem caused by timeout detection for long computations, but my computations are fairly quick so I don't think this is a problem.

I have, in essence, three different iterations of the code. The first is just a naive implementation of matrix multiplikation that uses the built in functionality in OpenCL to determine work group size (WGS). This version works fine, but is inefficient due to bandwidth imitations (i guess)!

The second version specifies WGS manually. The workgroups are in this case thin slivers {256,1,1} in size (in order to be able to multiply matrices which have dimensions specified by prime numbers). The WGS is a multipel of the CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE returned by clGetKernelWorkGroupInfo(). This version crashes when using matrices larger than roughly 4000 x 4000. It is also significantly slower than the first version!

The third version is like the second, except it uses local memory. This versions crashes for matrices larger than about 2000 x 2000. It is by far the fastest for the matrix sizes it actually works.

I am on Windows 8.1 using MinGW64 with gcc (can check version if needed, don't remember). I use an AMD R9 290 with CCC 14.9 drivers.

Useful links

Khronos OpenCL 1.2 Reference pages

My code is roughly based on the code and writings on this blog

Kernel (local memory version):

    __kernel void matMult(__global float* A,
                    __global float* B,
                    __global float* C,
                    int m, int p)
{
    int a, b, k, group_idx, group_idz, tx, ty;
    const int wgsize = get_local_size(0);
    float value;

    group_idx = get_group_id(0);
    group_idz = get_group_id(2);
    tx = get_local_id(0);
    ty = get_local_id(2);

    if(tx >= p)  {
        //printf("Thread %d exiting ...\n", tx);
        return;
    }

    // Index of the first sub-matrix of A processed 
    // group_idz the block
    int aBegin = m * wgsize * group_idz;

    // Index of the last sub-matrix of A processed 
    // group_idz the block
    int aEnd   = aBegin + m - 1;

    // Step size used to iterate through the 
    // sub-matrices of A
    int aStep  = wgsize;

    // Index of the first sub-matrix of B processed 
    // group_idz the block
    int bBegin = wgsize * group_idx;

    // Step size used to iterate through the 
    // sub-matrices of B
    int bStep  = wgsize * p;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (a = aBegin, b = bBegin;
             a <= aEnd;
             a += aStep, b += bStep) 
    {

        // Declaration of the local memory array As 
        // used to store the sub-matrix of A
        __local float As[256][1];

        // Declaration of the local memory array Bs 
        // used to store the sub-matrix of B
        __local float Bs[256][1];

        // Load the matrices from global memory
        // to local memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + m * ty + tx];
        Bs[ty][tx] = B[b + p * ty + tx];

        // Synchronize to make sure the matrices 
        // are loaded
        barrier(CLK_LOCAL_MEM_FENCE);

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
        for (k = 0; k < wgsize; k++)
            value += As[ty][k] * Bs[k][tx];

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    //printf("value: %f\n", value);
    int c = p * wgsize * group_idz + wgsize * group_idx;
    C[c + p * ty + tx] = value;
}

Host code:

int main(int argc, const char * argv[])
{
    ...

    //Allocate memory for and generate test data on host for matrix multiplication
    float* hostA = allocMatrix(n, m);
    float* hostB = allocMatrix(m, p);

    //Allocate results array on host
    float* hostC = (float *)malloc(sizeof(float) * p * n);

    //Setup the objects OpenCL needs in order to function
    if(SetupCL(&context, properties, &kernel, &command_queue, &program, &platform_id, &device_id, suppressoutp, usecpu))  {
        printf("Failed to setup OpenCL\n");
        return -1;
    }


    //10. Allocate memory on device
    cl_mem devA  = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                 sizeof(cl_float) * m * n, NULL, &err);

    cl_mem devB  = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                 sizeof(cl_float) * p * m, NULL, &err);

    cl_mem devC  = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                 sizeof(cl_float) * p * n, NULL, &err);

    //Load data into the input buffer
    clEnqueueWriteBuffer(command_queue, devA, CL_TRUE, 0,
                         sizeof(float) * m * n, hostA, 0, NULL, NULL);
    clEnqueueWriteBuffer(command_queue, devB, CL_TRUE, 0,
                         sizeof(float) * m * p, hostB, 0, NULL, NULL);


    //11. Set the argument list for the kernel command
    int wa = m;
    int wb = p;
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &devA);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &devB);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &devC);
    clSetKernelArg(kernel, 3, sizeof(int), &wa);
    clSetKernelArg(kernel, 4, sizeof(int), &wb);


    //Fetch information about compute device
    unsigned int pref_workg_size_mult;
    const unsigned int max_workg_size;
    const unsigned int max_workit_sizes[3];

    clGetKernelWorkGroupInfo(kernel, device_id,
        CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
        sizeof(size_t), (void*) &pref_workg_size_mult, NULL);

    clGetDeviceInfo(device_id, 
        CL_DEVICE_MAX_WORK_GROUP_SIZE,
        sizeof(size_t), (void*) &max_workg_size, NULL);

    clGetDeviceInfo(device_id, 
        CL_DEVICE_MAX_WORK_ITEM_SIZES,
        sizeof(size_t) * 3, (void*) max_workit_sizes, NULL);


    //Determine work group size
    int k = 1, s = 1;
    if (pref_workg_size_mult == 0)
        pref_workg_size_mult = 1;

    while(k * pref_workg_size_mult < n && k * pref_workg_size_mult < max_workg_size)
        k++;
    while(k *s * pref_workg_size_mult < n)
        s++;

    const size_t work_group_size[3] = {k * pref_workg_size_mult, 1, 1};
    const size_t global_work_size[3] = {k * s * pref_workg_size_mult, 1, p};


    //12. Enqueue the kernel command for execution
    cl_event event0;

    cl_int enqueue_error = clEnqueueNDRangeKernel(command_queue, kernel, 3, NULL, global_work_size,
                           work_group_size, 0, NULL, &event0);
    if (enqueue_error != CL_SUCCESS)
    {
        printf("Kernel launch failed, error %d\n", enqueue_error);
        return enqueue_error;
    }
    clWaitForEvents(1, &event0);


    //Call profiling function to obtain starting and ending times of kernel execution
    clGetEventProfilingInfo(event0, CL_PROFILING_COMMAND_START,
                                sizeof(cl_ulong), &start, NULL);
    clGetEventProfilingInfo(event0, CL_PROFILING_COMMAND_END,
                                sizeof(cl_ulong), &end, NULL);
    duration = end - start;



    //13. Copy the results from out of the output buffer
    clEnqueueReadBuffer(command_queue, devC, CL_TRUE, 0,
                                sizeof(float) * p * n, hostC, 0, NULL, NULL);


    //14. Cleanup - release OpenCL resources
    clReleaseMemObject(devA);
    clReleaseMemObject(devB);
    clReleaseMemObject(devC);
    clReleaseEvent(event0);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);


    //Release host memory
    free(hostA);
    free(hostB);
    free(hostC);

    return 0;
}
Community
  • 1
  • 1
Bazooka
  • 51
  • 1
  • 5
  • If your calculations take to long you will definitely trip the Windows watchdog timer. Can you post your kernel and abbreviated host code so we can make sure you aren't calling 4000 by 4000 kernels that each operate on the 4000 by 4000 matrices? – Austin Nov 13 '14 at 17:24
  • Thank you for your reply. I have posted kernel and host code (slightly abbreviated). I have an additional funktion setupCL that sets up all the objekts needed that I can post upon request. I must say that what you suggest seems highly unlikely to me. – Bazooka Nov 13 '14 at 18:48
  • When the code completes, do you get the correct answer in any of your scenarios? – Austin Nov 13 '14 at 20:23
  • Yes, as far as I have been able to determine, the code produces correct results. – Bazooka Nov 13 '14 at 21:10
  • - Why do you use get_local_id(2), if your working sizes are all defined as 1D? - Why do you declare you local memory like this: `[256][1]`? Why do you access it like this: `As[ty][tx]`, especially if tx is not always 0? – Baiz Nov 13 '14 at 23:13
  • Thank you for your reply, Baiz. My sizes are defined as 3D actually (due to problems i encountered that resolved themselves when i defined them in this way, haven't tried reverting them however), i just used thin strips due to reasons stated in OP. Since the matrices are 2D I would say this is the minimum number of dimensions for intuitive indexing conventions. The declarations and accesses in the kernel are due to me simply adapting the code in the **[blog i linked](http://gpgpu-computing4.blogspot.se/2009/10/matrix-multiplication-3-opencl.html)** – Bazooka Nov 14 '14 at 04:44
  • You should rewrite your kernel code so it treats the problem as 1-dimensional. And also use a function that handles 2D memory access linearly: `int indexFrom2D(int x, int y, int width) { return y*width + x; }` I suspect you're accessing local memory out of bounds and, by formulating it as a 3D problem, you obfuscated this fact even more. – Baiz Nov 14 '14 at 12:50
  • You're using `As[ty][tx]`, but `As` was declared as `__local float As[256][1];` This of course will fail, as soon as tx is larger than 0, because you will exeed your local memory (e.g. tx = 1, ty = 4, so you'll access the element at index 260 which does not exist because your size is only 256). – Baiz Nov 14 '14 at 17:55
  • Since the max. WGS is {256,1,1} due to it being bounded by `CL_DEVICE_MAX_WORK_GROUP_SIZE` it seems it's a logical impossibility for either `tx`or `ty` to go out of bounds in the way you describe. The work group sizes are determined by the following` //Determine work group size int k = 1, s = 1; if (pref_workg_size_mult == 0) pref_workg_size_mult = 1; while(k * pref_workg_size_mult < n && k * pref_workg_size_mult < max_workg_size) k++; while(k *s * pref_workg_size_mult < n) s++; const size_t work_group_size[3] = {k * pref_workg_size_mult, 1, 1};` – Bazooka Nov 16 '14 at 05:47

0 Answers0