Hello i'am new to Open Cl and i was trying to Measure GFLOPS on GPU Vivante7200 Lite on board IMX8
The output using this code is 0.127989 GFLOPS while the advertised FLOPS is 32 So what may be the wrong in this code.
Here is the kernel Code
__kernel void flops(__global float* input , __global float* output) {
int gid = get_global_id(0);
float scale = 2.35;
float x = *input;
int i=0;
for(;i<62500000;++i)
{
scale = scale * (x);
}
output[gid] = scale;
}
and here is the Host Program
#include "gpu_test.h"
int main ()
{
gpu_intialize();
cl_int errNum;
cl_platform_id platform; /* OpenCL platform. */
cl_device_id device; /* OpenCL device. */
cl_context context; /* OpenCL context. */
cl_command_queue commandQueue; /* OpenCL command queue. */
cl_program program; /* OpenCL program. */
cl_kernel kernel; /* OpenCL kernel. */
cl_mem memObject_input; /* OpenCL memory buffer objects. */
cl_mem memObject_output; /* OpenCL memory buffer objects. */
cl_event eventFlops; /* OpenCL event. */
cl_event eventWrite;
long time0=0;
long time1=0;
size_t nSize=0;
char *buf;
size_t wg_size=0;
size_t wg_multiple=0;
size_t globalWorkSize[1] = {16};
size_t localWorkSize[1] = {1};
double flopCount=0;
double consumed_time_in_nano_seconds=0;
double consumed_time_in_seconds=0;
double flop=0;
double Gflop=0;
cl_long maxAllocSizeBytes = 0;
cl_long maxComputeUnits = 0;
cl_long maxWorkGroupSize = 0;
cl_ulong end=0;
cl_ulong start=0;
cl_ulong private_usage=0;
cl_ulong local_usage=0;
printf("Initializing OpenCL...\n");
/* Get the available platform. */
errNum = clGetPlatformIDs(1, &platform, NULL);
clmCHECKERROR(errNum, CL_SUCCESS);
/* Get a GPU device. */
errNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
clmCHECKERROR(errNum, CL_SUCCESS);
/* Create the context. */
context = clCreateContext(0, 1, &device, NULL, NULL, &errNum);
clmCHECKERROR(errNum, CL_SUCCESS);
/* Create a command-queue. */
commandQueue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &errNum);
clmCHECKERROR(errNum, CL_SUCCESS);
printf("Creating program...\n");
printf("Creating program with binary...\n");
nSize = kernel_binary_program_size;
buf = (char *)&kernel_binary_program;
program = clCreateProgramWithBinary(context, 1, &device, &nSize, (const unsigned char**)&buf, NULL, NULL);
if (program == NULL)
{
printf("program creation failed \n");
}
printf("Program created successfully! \n");
kernel = clCreateKernel (program, "flops", &errNum);
clmCHECKERROR(errNum, CL_SUCCESS);
printf("Kernel created successfully! \n");
// Identify maximum size of the global memory on the device side
clGetDeviceInfo (device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
sizeof(cl_long), &maxAllocSizeBytes, NULL);
clGetDeviceInfo (device, CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(cl_long), &maxComputeUnits, NULL);
clGetDeviceInfo (device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(cl_long), &maxWorkGroupSize, NULL);
printf("Max compute unit is %d \n", maxComputeUnits);
printf("Max Work Group size is %d \n", maxWorkGroupSize);
// Creating buffer on the device side
float *input_buffer =(float *) malloc (sizeof(float));
float *output_buffer =(float *) malloc (sizeof(float));
input_buffer[0] = 5.36;
output_buffer[0] = 0;
printf("Host buffer been prepared! \n");
memObject_input = clCreateBuffer (context, CL_MEM_READ_ONLY,
sizeof(float), NULL, &errNum);
clmCHECKERROR(errNum,CL_SUCCESS);
memObject_output = clCreateBuffer (context, CL_MEM_WRITE_ONLY,
sizeof(float), NULL, &errNum);
clmCHECKERROR(errNum,CL_SUCCESS);
if ((memObject_input == NULL) || (memObject_output == NULL)) {
printf ("Error creating memory objects \n");
return false;
}
errNum = clEnqueueWriteBuffer (commandQueue, memObject_input, CL_FALSE, 0, sizeof(float),
input_buffer, 0, NULL, &eventWrite);
clmCHECKERROR(errNum,CL_SUCCESS);
errNum = clWaitForEvents (1, &eventWrite);
clmCHECKERROR(errNum,CL_SUCCESS);
clReleaseEvent(eventWrite);
errNum = clEnqueueWriteBuffer (commandQueue, memObject_output, CL_FALSE, 0, sizeof(float),
output_buffer, 0, NULL, &eventWrite);
clmCHECKERROR(errNum,CL_SUCCESS);
errNum = clWaitForEvents (1, &eventWrite);
clmCHECKERROR(errNum,CL_SUCCESS);
clReleaseEvent(eventWrite);
errNum = clSetKernelArg (kernel, 0, sizeof(cl_mem), &memObject_input);
clmCHECKERROR(errNum, CL_SUCCESS);
errNum = clSetKernelArg (kernel, 1, sizeof(cl_mem), &memObject_output);
clmCHECKERROR(errNum, CL_SUCCESS);
errNum = clGetKernelWorkGroupInfo (kernel, device,
CL_KERNEL_WORK_GROUP_SIZE,
sizeof (wg_size), &wg_size, NULL);
clmCHECKERROR (errNum, CL_SUCCESS);
errNum = clGetKernelWorkGroupInfo (kernel, device,
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
sizeof (wg_multiple), &wg_multiple, NULL);
clmCHECKERROR (errNum, CL_SUCCESS);
errNum = clGetKernelWorkGroupInfo (kernel, device,
CL_KERNEL_LOCAL_MEM_SIZE,
sizeof (local_usage), &local_usage, NULL);
clmCHECKERROR (errNum, CL_SUCCESS);
errNum = clGetKernelWorkGroupInfo (kernel, device,
CL_KERNEL_PRIVATE_MEM_SIZE,
sizeof (private_usage), &private_usage, NULL);
clmCHECKERROR (errNum, CL_SUCCESS);
printf("Work Group size is %d \n",wg_size);
printf("Preferred Work Group size is %d \n",wg_multiple);
printf("Local memory size is %d \n",local_usage);
printf("Private memory size is %d \n",private_usage);
errNum = clEnqueueNDRangeKernel (commandQueue, kernel, 1, NULL,
globalWorkSize, localWorkSize,
0, NULL, &eventFlops);
clmCHECKERROR (errNum, CL_SUCCESS);
printf("Waiting for execution to finish \n");
errNum = clWaitForEvents(1, &eventFlops);
clmCHECKERROR(errNum, CL_SUCCESS);
errNum = clGetEventProfilingInfo(eventFlops, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, 0);
errNum |= clGetEventProfilingInfo(eventFlops, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, 0);
clmCHECKERROR(errNum, CL_SUCCESS);
printf("Kernel execution terminated successfully! \n");
flopCount = 62500000 * 16; // 16 work item
consumed_time_in_nano_seconds = (double)(end - start);
consumed_time_in_seconds = consumed_time_in_nano_seconds/1000000000;
printf("consumed_time_in_nano_seconds : %f \n",consumed_time_in_nano_seconds);
printf("consumed_time_in_seconds : %f \n",consumed_time_in_seconds);
flop = flopCount / consumed_time_in_seconds;
Gflop = flop / 1000000000;
printf("Floating Operations per second : %f \n",flop);
printf("Giga Floating Operations per second : %f \n",Gflop);
cleanup(context,program,kernel,memObject_input,memObject_output,eventFlops);
printf ("Program executed successfully! \n");
}
void cleanup (cl_context context,
cl_program program,
cl_kernel kernel,
cl_mem memObject_input,
cl_mem memObject_output,
cl_event eventFlops) {
if ( eventFlops != NULL)
clReleaseEvent(eventFlops);
if (memObject_input != NULL)
clReleaseMemObject (memObject_input);
if (memObject_output != NULL)
clReleaseMemObject (memObject_output);
if (kernel != NULL)
clReleaseKernel (kernel);
if (program != NULL)
clReleaseProgram (program);
}
void checkError(cl_int Value,cl_int Reference,const char* FileName,const
int LineNumber)
{
if (Reference != Value)
{
printf("\n !!! Error # %i at line %i , in file %s !!!\n\n",
Value, LineNumber, FileName);
printf("Exiting...\n");
exit(EXIT_FAILURE);
}
}
void gpu_intialize()
{
Error error;
printf("[GalCore] GalCore_TaskInit\n");
error = GalCore_TaskInit();
if (error == 0)
printf("[GalCore] GalCore_TaskInit successful \ne");
else
printf("[GalCore] GalCore_TaskInit Failed \n");
printf("[GalCore] VivanteInit\n");
VivanteInit();
printf("[GalCore] VivanteInit successful \n");
}
So i was wondering what might be the problem of that implementation