Using clEnqueueNDRangeKernel in OpenCL

Question

I need help with one function in OpenCL. When I use clEnqueueNDRangeKernel instead of clEnqueueTask, the program takes much longer to finish. Why is that? As I understand it, the program should then use the data-parallel model and run faster — am I wrong? And if I am, how can I change the code to see the data-parallel model actually at work?

__kernel void black_white_img(__global unsigned char *pDataIn, __global unsigned char *pDataOut, unsigned int InSize, unsigned int OutSize)
{
    /*
     * Converts the input to one byte per 4-byte group: the first three bytes
     * of every group are averaged, the fourth byte is skipped.
     *
     * pDataIn  - input bytes, consumed four at a time
     * pDataOut - receives one averaged byte per input group
     * InSize   - number of input bytes to walk over
     * OutSize  - not read by this kernel
     *
     * No work-item id is queried here, so every work-item that runs this
     * kernel walks the complete input and writes the complete output.
     */
    int src = 0;   /* byte offset of the current group in pDataIn */
    int dst = 0;   /* index of the matching byte in pDataOut */

    while (src < InSize)
    {
        int sum = pDataIn[src] + pDataIn[src + 1] + pDataIn[src + 2];
        pDataOut[dst] = (unsigned char)(sum / 3);

        src += 4;
        dst++;
    }
}

// Input image description and pixel bytes, filled in by LoadBmpFile.
// Initialized so they hold defined values even if loading fails.
int iWidth = 0, iHeight = 0, iBpp = 0;
vector<unsigned char> pDataIn;
vector<unsigned char> pDataOut;


int err = LoadBmpFile(L"3840x2160.bmp", iWidth, iHeight, iBpp, pDataIn);

// The processing below consumes 4 input bytes per output byte, so only
// non-empty 32 bpp input is accepted. Stop here on failure: carrying on
// would size the output and the device buffers from empty/invalid data.
if (err != 0 || pDataIn.size() == 0 || iBpp != 32)
{
    std::cout << "error load input file!\n";
    system("PAUSE");
    exit(1);
}

// One output byte for every 4 input bytes.
pDataOut.resize(pDataIn.size()/4);


// OpenCL handles used by the rest of the program; all start out as NULL
// and are assigned by the corresponding clGet*/clCreate* calls.
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem memobj = NULL;   // device buffer for the input bytes
cl_mem memobj1 = NULL;  // device buffer for the output bytes
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;             // status of the most recent OpenCL call

// Buffer sizes in bytes, passed to the kernel as unsigned int arguments.
unsigned int SizeIn, SizeOut;

SizeIn = pDataIn.size();
SizeOut = pDataOut.size();

FILE *fp;
char fileName[] = "./kernel.cl";
char *source_str;
size_t source_size;

//Loading kernel
fp = fopen(fileName, "r");
if (!fp) {
    fprintf(stderr, "Failed to load kernel.\n");
    system("PAUSE");
    exit(1);
}

// At most MAX_SOURCE_SIZE bytes are read; a longer file is cut off there.
source_str = (char*)malloc(MAX_SOURCE_SIZE);
if (!source_str) {
    fprintf(stderr, "Failed to allocate memory for kernel source.\n");
    fclose(fp);
    system("PAUSE");
    exit(1);
}
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);

// Nothing read means there is no program text to build.
if (source_size == 0) {
    fprintf(stderr, "Failed to read kernel source.\n");
    free(source_str);
    system("PAUSE");
    exit(1);
}

//Getting Platform and Device
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);

ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);


//Create context
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);


//create kernel program
program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
(const size_t *)&source_size, &ret);

//build it
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

//create queue
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

//create bufer
memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, pDataIn.size(), NULL, &ret);

memobj1 = clCreateBuffer(context, CL_MEM_READ_WRITE,pDataOut.size(), NULL, &ret);
//copy buffer to kernel

ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, pDataIn.size(), pDataIn.data(), 0, NULL, NULL);


//create opencl kernel
kernel = clCreateKernel(program, "red_to_green", &ret);


//set kernel args
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj1);
ret = clSetKernelArg(kernel, 2, sizeof(unsigned int), (void *)&SizeIn);
ret = clSetKernelArg(kernel, 3, sizeof(unsigned int), (void *)&SizeOut);

const size_t cycles_max = 10;

// NDRange used for every launch: 8 work-items in total, in work-groups of 4.
const size_t global_item_size = 8;
const size_t local_item_size = 4;

clock_t t0 = clock();
for (size_t i = 0; i < cycles_max; i++){

    clock_t start_time = clock();

    //execute opencl kernel
    //single work-item alternative, kept for comparison:
    //ret = clEnqueueTask(command_queue, kernel, 0, NULL, NULL);

    ret = clEnqueueNDRangeKernel(command_queue,kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS) {
        fprintf(stderr, "clEnqueueNDRangeKernel failed with OpenCL error %d.\n", (int)ret);
        system("PAUSE");
        exit(1);
    }

    //copy from buffer (blocking read)
    ret = clEnqueueReadBuffer(command_queue, memobj1, CL_TRUE, 0, pDataOut.size(), pDataOut.data(), 0, NULL, NULL);
    if (ret != CL_SUCCESS) {
        fprintf(stderr, "clEnqueueReadBuffer failed with OpenCL error %d.\n", (int)ret);
        system("PAUSE");
        exit(1);
    }

    ret = clFinish(command_queue);
    if (ret != CL_SUCCESS) {
        fprintf(stderr, "clFinish failed with OpenCL error %d.\n", (int)ret);
        system("PAUSE");
        exit(1);
    }

    clock_t end_time = clock();

    // Duration of this cycle in milliseconds. clock() counts ticks, so the
    // difference has to be divided by CLOCKS_PER_SEC to become a time.
    cout << 1000.0 * (end_time - start_time) / CLOCKS_PER_SEC << endl;

}

clock_t t1 = clock();

// Average duration of one cycle: ticks -> seconds, then split over the cycles.
double time_seconds = (double)(t1 - t0) / CLOCKS_PER_SEC / cycles_max;
cout << time_seconds * 1000 << endl;   // printed in milliseconds

WriteBmpFile(L"3840x2160_wb.bmp", iWidth, iHeight, 8, pDataOut.size(), pDataOut.data(), false);

// Release the OpenCL objects and the kernel source buffer created earlier.
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseMemObject(memobj);
clReleaseMemObject(memobj1);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
free(source_str);

system("PAUSE");

score 5 · Accepted Answer · answered Feb 18 '16 at 17:11

5

from the docs page:

The kernel is executed using a single work-item.

clEnqueueTask is equivalent to calling clEnqueueNDRangeKernel with work_dim = 1, global_work_offset = NULL, global_work_size[0] set to 1, and local_work_size[0] set to 1.

When you use clEnqueueNDRangeKernel, you are using 2 work groups of 4 work items, but they are all doing the same work. They all read from the same global memory, but more importantly, they all try to write to the same locations in global memory.

You need to take into account the worker's global id when doing your computations.

__kernel void black_white_img(__global unsigned char *pDataIn, __global unsigned char *pDataOut, unsigned int InSize, unsigned int OutSize)
{
    /*
     * Each work-item handles a subset of the 4-byte input groups: it starts at
     * the group whose index equals its global id and then advances by the
     * global size until it runs past the last group.
     *
     * pDataIn  - input bytes, four per group
     * pDataOut - receives one byte per group: the average of the group's
     *            first three bytes
     * InSize   - input size in bytes; InSize / 4 groups are processed
     * OutSize  - not read by this kernel
     */
    const int first = get_global_id(0);
    const int step = get_global_size(0);
    const unsigned int groupCount = InSize >> 2;

    int px = first;
    while (px < groupCount)
    {
        __global unsigned char *quad = pDataIn + px * 4;
        pDataOut[px] = (unsigned char)((quad[0] + quad[1] + quad[2]) / 3);

        px += step;
    }
}

answered Feb 18 '16 at 17:11

mfa

5,017
2
23
28

And with clEnqueueTask I'm using 1 work group with 1 work item, am I right? To make it clear, yep. Thanks for answer! But still it works slower (50 ms vs 20-30 ms) than using clEnqueueTask. Is it okay or am I doing something wrong? Can you also explain how global_item_size and local_item_size works and why do we need them in this function and don't need in clEnqueueTask? – Generwp Feb 18 '16 at 17:27
Try a single work group ie: global_item_size = local_item_size = 64. Also, make sure InSize is large -- I'd go with at least 10k, but 1 million plus would be better still. The parallelism won't be evident until there is a lot of work to be done. – mfa Feb 18 '16 at 17:52
Tried that, but still works slower. Maybe you can give me some links where I can study this theme deeper? Anyway thanks a lot! – Generwp Feb 18 '16 at 18:18

score 3 · Answer 2 · answered Feb 20 '16 at 02:56

It looks like you are iterating over all pixels of an input image in your kernel. This will cause all threads to calculate the image intensity for all pixels. Try to launch a single thread for each pixel instead. To do so, change your kernel source code to only calculate the output value for one pixel:

__kernel void black_white_img(__global unsigned char *pDataIn, __global unsigned char *pDataOut) {
    /*
     * One work-item per pixel: averages the first three bytes of the pixel's
     * 4-byte input group into a single output byte.
     *
     * pDataIn  - input bytes, four per pixel
     * pDataOut - output bytes, one per pixel
     *
     * There is no bounds check, so the launch must not use more work-items
     * than there are pixels.
     */
    int i = get_global_id(0);   /* pixel index == index of the output byte */
    int j = i*4;                /* byte offset of this pixel in the input */

    /* Read from the 4-byte-stepped input offset and write to the pixel index.
       The reverse (input at i, output at i*4) averages the wrong bytes and
       writes beyond the one-byte-per-pixel output buffer. */
    pDataOut[i] = (pDataIn[j] + pDataIn[j + 1] + pDataIn[j + 2]) / 3;
}

This code will now perform the averaging over the RGB values of your RGBA input image for a single pixel, selected by the work-item's global id. Now all you need to do is launch as many threads as your image has pixels. Relevant changes:

//create opencl kernel; looked up by the name of the __kernel function
kernel = clCreateKernel(program, "black_white_img", &ret);


//set kernel args: only the two buffers (memobj at index 0, memobj1 at index 1),
//no size arguments are set for this kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobj1);

// Run the launch cycles_max times; t0 is the clock() value before the first cycle.
const size_t cycles_max = 10;
clock_t t0 = clock();
for (int i = 0; i<cycles_max; i++){

// clock() value at the start of this cycle, stored as float
float start_time =  clock();
float search_time = 0;
//float last_time = 0;

//execute opencl kernel
//ret = clEnqueueTask(command_queue, kernel, 0, NULL, NULL);

// One work-item per pixel of the image.
size_t global_item_size = iWidth * iHeight;

// 1-dimensional launch; local_work_size is NULL, so no explicit
// work-group size is requested.
ret = clEnqueueNDRangeKernel(command_queue,kernel, 1, NULL, &global_item_size, NULL, 0, NULL, NULL);

This should give a considerable speedup compared to your code.

Using clEnqueueNDRangeKernel in OpenCL

2 Answers