
I recently started working with OpenCL, and I am trying to convert my CUDA code for tiled matrix multiplication to OpenCL. I have made some changes, but I am not sure they are correct. I don't know what the equivalents of blockIdx, threadIdx, __syncthreads, Ashare and Bshare would be in OpenCL. I would be more than happy if anyone could help me.

My CUDA kernel code for tiled matrix multiplication:

#define TILE_WIDTH 16
__global__ void matrixMulKernel(float* A, float* B, float* C, int width) {
    __shared__ float Ashare[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Bshare[TILE_WIDTH][TILE_WIDTH];
    int bx = blockIdx.x, by = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;
    // calculate the row and column for this element of the matrix
    int row = by * TILE_WIDTH + ty;
    int col = bx * TILE_WIDTH + tx;
    float result = 0;
    // loop over the A and B tiles required to compute the C element
    for (int m = 0; m < width / TILE_WIDTH; m++) {
        // collectively load the A and B tiles into shared memory
        Ashare[ty][tx] = A[(row * width) + (m * TILE_WIDTH) + tx];
        Bshare[ty][tx] = B[(((m * TILE_WIDTH) + ty) * width) + col];
        __syncthreads(); // wait for all the shared memory to be loaded

        for (int k = 0; k < TILE_WIDTH; k++) {
            result += A[ty][k] * B[k][tx];
        }
        __syncthreads();
    }
    C[(row * width) + col] = result;
}
Bilgin

1 Answer

  1. Although you have __local__, it should be __local.
  2. The translations for threadIdx.x, blockIdx.x, etc. are given here; see also the mapping fragment below.
  3. As already indicated, the translation for __syncthreads() is given here.
  4. There are some other errors in your kernel code. For example, you use width but have defined only A_width and B_width; also, in your multiply loop you are using A[][] and B[][], but it should be Ashare[][] and Bshare[][].
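
For reference, a minimal fragment showing how each CUDA built-in used in your kernel maps to its OpenCL C equivalent (the variable names mirror your kernel; this is an illustrative fragment, not a complete kernel):

int bx = get_group_id(0);                     /* was: int bx = blockIdx.x;  */
int by = get_group_id(1);                     /* was: int by = blockIdx.y;  */
int tx = get_local_id(0);                     /* was: int tx = threadIdx.x; */
int ty = get_local_id(1);                     /* was: int ty = threadIdx.y; */
__local float Ashare[TILE_WIDTH][TILE_WIDTH]; /* was: __shared__ float ...; */
barrier(CLK_LOCAL_MEM_FENCE);                 /* was: __syncthreads();      */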

Here's a fully worked example showing the changes and fixes:

$ cat t5.cpp
#include <CL/opencl.h>
#include <stdio.h>
#include <stdlib.h>

#define TILE_WIDTH 16
#define DS 1024

const char source[] =
"__kernel void matrix_multiply(__global float *A, __global float *B,"
" __global float *C, int width)"
"{"
"   __local float Ashare[TILE_WIDTH][TILE_WIDTH];"
"   __local float Bshare[TILE_WIDTH][TILE_WIDTH];"
"   int bx = get_group_id(0);"
"   int by = get_group_id(1);"
"   int tx = get_local_id(0);"
"   int ty = get_local_id(1);"
"   int row = by * TILE_WIDTH + ty;"
"   int col = bx * TILE_WIDTH + tx;"
"   float result = 0;"
"   for (int m = 0; m < width / TILE_WIDTH; m++) {"
"     Ashare[ty][tx] = A[(row * width) + (m * TILE_WIDTH) + tx];"
"     Bshare[ty][tx] = B[(((m * TILE_WIDTH) + ty) * width) + col];"
"     barrier(CLK_LOCAL_MEM_FENCE); "
"       for (int k = 0; k < TILE_WIDTH; k++) {"
"         result += Ashare[ty][k] * Bshare[k][tx];"
"       }"
"     barrier(CLK_LOCAL_MEM_FENCE); "
"   }"
"   C[(row * width) + col] = result;"
"};"

;

int main(int argc, char *argv[])
{
  cl_platform_id platform;
  cl_device_id device;
  cl_context context;
  cl_command_queue queue1;
  cl_program program;
  cl_mem mem1, mem2, mem3;
  cl_kernel kernel;
  cl_int err;

  err = clGetPlatformIDs(1, &platform, NULL);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
  queue1 = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, NULL);

  const char *sources[1] = {source};
  program = clCreateProgramWithSource(context, 1, sources, NULL, &err);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  err = clBuildProgram(program, 1, &device, "-D TILE_WIDTH=16", NULL, NULL);
  if (err == CL_BUILD_PROGRAM_FAILURE) {
    // Determine the size of the log
    size_t log_size;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

    // Allocate memory for the log
    char *log = (char *) malloc(log_size);

    // Get the log
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

    // Print the log
    printf("%s\n", log);
  }

  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  mem1 = clCreateBuffer(context, CL_MEM_READ_WRITE, DS*DS*sizeof(float), NULL, NULL);
  mem2 = clCreateBuffer(context, CL_MEM_READ_WRITE, DS*DS*sizeof(float), NULL, NULL);
  mem3 = clCreateBuffer(context, CL_MEM_READ_WRITE, DS*DS*sizeof(float), NULL, NULL);
  float *hdata = new float[DS*DS];
  for (int i = 0; i < DS*DS; i++) hdata[i] = 1;
  kernel = clCreateKernel(program, "matrix_multiply", &err);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  const size_t gwork_size[2] = {DS,DS};
  const size_t lwork_size[2] = {TILE_WIDTH,TILE_WIDTH};
  int msize = DS;
  err = clSetKernelArg(kernel, 0, sizeof(mem1), &mem1);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  err = clSetKernelArg(kernel, 1, sizeof(mem2), &mem2);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  err = clSetKernelArg(kernel, 2, sizeof(mem3), &mem3);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  err = clSetKernelArg(kernel, 3, sizeof(msize), &msize);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}

  err = clEnqueueWriteBuffer(queue1, mem1, CL_TRUE, 0, DS*DS*sizeof(float), hdata, 0, NULL, NULL);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  err = clEnqueueWriteBuffer(queue1, mem2, CL_TRUE, 0, DS*DS*sizeof(float), hdata, 0, NULL, NULL);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  err = clEnqueueNDRangeKernel(queue1, kernel, 2, NULL, gwork_size, lwork_size, 0, NULL, NULL);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  err = clEnqueueReadBuffer(queue1, mem3, CL_TRUE, 0, DS*DS*sizeof(float), hdata, 0, NULL, NULL);
  if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
  for (int i = 0; i < DS*DS; i++)
    if (hdata[i] != DS) {printf("error at %d, was %f, should be %f\n", i, hdata[i], (float)DS); return 1;}
  printf("success!\n");
  return 0;
}
$ g++ -I/usr/local/cuda/include t5.cpp -o t5 -lOpenCL
$ ./t5
success!
$
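
One point that often trips up CUDA programmers: in OpenCL, the global work size counts individual work-items, while CUDA grid dimensions count blocks. The launch above is therefore equivalent to the following CUDA launch (my own illustration of the same DS x DS configuration):

// OpenCL: {DS, DS} work-items total, in {TILE_WIDTH, TILE_WIDTH} work-groups
// CUDA equivalent: the grid counts blocks, so divide by the block shape
dim3 block(TILE_WIDTH, TILE_WIDTH);
dim3 grid(DS / TILE_WIDTH, DS / TILE_WIDTH);
matrixMulKernel<<<grid, block>>>(A, B, C, DS);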

The convenient build-log printer was picked up from here.
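
As an aside, the repeated error checks in the host code above could be collapsed into a small helper. Here is one possible sketch (CL_CHECK is a name I made up, not part of the OpenCL API):

#define CL_CHECK(call)                          \
  do {                                          \
    cl_int _e = (call);                         \
    if (_e != CL_SUCCESS) {                     \
      printf("%d: %d\n", __LINE__, _e);         \
      exit(-1);                                 \
    }                                           \
  } while (0)

/* usage: */
CL_CHECK(clEnqueueWriteBuffer(queue1, mem1, CL_TRUE, 0, DS*DS*sizeof(float), hdata, 0, NULL, NULL));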

Robert Crovella
  • Thanks for the help. You are right; when I was posting the code here, I forgot to add A_width and B_width inside the kernel. – Bilgin Dec 10 '17 at 02:01
  • Ashare[ty][tx] = A[(row * A_width) + (m * TILE_WIDTH) + tx]; – Bilgin Dec 10 '17 at 02:09
  • Bshare[ty][tx] = B[(((m * TILE_WIDTH) + ty) * B_width) + col]; – Bilgin Dec 10 '17 at 02:09
  • C[(row * A_width) + col] = result; – Bilgin Dec 10 '17 at 02:09
  • The reason I am using A_width and B_width is that I create matrices A and B from command-line arguments in my main OpenCL code, and I use their widths in the kernel. Please check my code in the comments above and let me know if it makes sense to replace them in the kernel function. Thanks. – Bilgin Dec 10 '17 at 02:12
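
Assembling the fragments from these comments, a sketch of the kernel with separate width parameters might look like this (my own assembly, not code from the thread; it assumes A is M x A_width and B is A_width x B_width, which makes B_width the row stride of B and C, whereas the fragments above use A_width for C and so implicitly assume square matrices):

__kernel void matrix_multiply(__global float *A, __global float *B,
                              __global float *C, int A_width, int B_width)
{
  __local float Ashare[TILE_WIDTH][TILE_WIDTH];
  __local float Bshare[TILE_WIDTH][TILE_WIDTH];
  int bx = get_group_id(0);
  int by = get_group_id(1);
  int tx = get_local_id(0);
  int ty = get_local_id(1);
  int row = by * TILE_WIDTH + ty;
  int col = bx * TILE_WIDTH + tx;
  float result = 0;
  /* the shared dimension (columns of A == rows of B) drives the tile loop */
  for (int m = 0; m < A_width / TILE_WIDTH; m++) {
    Ashare[ty][tx] = A[(row * A_width) + (m * TILE_WIDTH) + tx];
    Bshare[ty][tx] = B[(((m * TILE_WIDTH) + ty) * B_width) + col];
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int k = 0; k < TILE_WIDTH; k++) {
      result += Ashare[ty][k] * Bshare[k][tx];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
  }
  /* C has B_width columns, so its row stride is B_width */
  C[(row * B_width) + col] = result;
}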