- although you have
__local__
it should be __local
- the translations for
threadIdx.x
, blockIdx.x
, etc. are given here
- as already indicated, the translation for
__syncthreads()
is given here
- there are some other errors in your kernel code, for example you use
width
but have defined A_width
and B_width
only, also, in your multiply loop you are using A[][]
and B[][]
but it should be Ashare[][]
and Bshare[][]
here's a fully-worked example showing the changes and fixes:
$ cat t5.cpp
#include <CL/opencl.h>
#include <stdio.h>
#include <stdlib.h>
#define TILE_WIDTH 16
#define DS 1024
const char source[] =
"__kernel void matrix_multiply(__global float *A, __global float *B,"
" __global float *C, int width)"
"{"
" __local float Ashare[TILE_WIDTH][TILE_WIDTH];"
" __local float Bshare[TILE_WIDTH][TILE_WIDTH];"
" int bx = get_group_id(0);"
" int by = get_group_id(1);"
" int tx = get_local_id(0);"
" int ty = get_local_id(1);"
" int row = by * TILE_WIDTH + ty;"
" int col = bx * TILE_WIDTH + tx;"
" float result = 0;"
" for (int m = 0; m < width / TILE_WIDTH; m++) {"
" Ashare[ty][tx] = A[(row * width) + (m * TILE_WIDTH) + tx];"
" Bshare[ty][tx] = B[(((m * TILE_WIDTH) + ty) * width) + col];"
" barrier(CLK_LOCAL_MEM_FENCE); "
" for (int k = 0; k < TILE_WIDTH; k++) {"
" result += Ashare[ty][k] * Bshare[k][tx];"
" }"
" barrier(CLK_LOCAL_MEM_FENCE); "
" }"
" C[(row * width) + col] = result;"
"};"
;
int main(int argc, char *argv[])
{
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue1, queue2;
cl_program program;
cl_mem mem1, mem2, mem3;
cl_kernel kernel;
cl_int err;
err = clGetPlatformIDs(1, &platform, NULL);
if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
queue1 = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, NULL);
const char *sources[1] = {source};
program = clCreateProgramWithSource(context, 1, sources, NULL, &err);
if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
err = clBuildProgram(program, 1, &device, "-D TILE_WIDTH=16", NULL, NULL);
if (err == CL_BUILD_PROGRAM_FAILURE) {
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char *log = (char *) malloc(log_size);
// Get the log
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
printf("%s\n", log);
}
if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
mem1 = clCreateBuffer(context, CL_MEM_READ_WRITE, DS*DS*sizeof(float), NULL, NULL);
mem2 = clCreateBuffer(context, CL_MEM_READ_WRITE, DS*DS*sizeof(float), NULL, NULL);
mem3 = clCreateBuffer(context, CL_MEM_READ_WRITE, DS*DS*sizeof(float), NULL, NULL);
float *hdata = new float[DS*DS];
for (int i = 0; i < DS*DS; i++) hdata[i] = 1;
kernel = clCreateKernel(program, "matrix_multiply", &err);
if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
const size_t gwork_size[2] = {DS,DS};
const size_t lwork_size[2] = {TILE_WIDTH,TILE_WIDTH};
int msize = DS;
err = clSetKernelArg(kernel, 0, sizeof(mem1), &mem1);
if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
err = clSetKernelArg(kernel, 1, sizeof(mem2), &mem2);
if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
err = clSetKernelArg(kernel, 2, sizeof(mem3), &mem3);
if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
err = clSetKernelArg(kernel, 3, sizeof(msize), &msize);
if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
err = clEnqueueWriteBuffer(queue1, mem1, CL_TRUE, 0, DS*DS*sizeof(float), hdata, 0, NULL, NULL);
if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
err = clEnqueueWriteBuffer(queue1, mem2, CL_TRUE, 0, DS*DS*sizeof(float), hdata, 0, NULL, NULL);
if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
err = clEnqueueNDRangeKernel(queue1, kernel, 2, NULL, gwork_size, lwork_size, 0, NULL, NULL);
if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
err = clEnqueueReadBuffer(queue1, mem3, CL_TRUE, 0, DS*DS*sizeof(float), hdata, 0, NULL, NULL);
if (err != CL_SUCCESS) {printf("%d: %d\n", __LINE__, err); return -1;}
for (int i = 0; i < DS*DS; i++)
if (hdata[i] != DS) {printf("error at %d, was %f, should be %f\n", i, hdata[i], (float)DS); return 1;}
printf("success!\n");
return 0;
}
$ g++ -I/usr/local/cuda/include t5.cpp -o t5 -lOpenCL
$ ./t5
success!
$
convenient build log printer picked up from here.