Measuring FLOPS on Vivante G7200L on IMX8

Question

Hello i'am new to Open Cl and i was trying to Measure GFLOPS on GPU Vivante7200 Lite on board IMX8

The output using this code is 0.127989 GFLOPS while the advertised FLOPS is 32 So what may be the wrong in this code.

Here is the kernel Code

__kernel void flops(__global float* input , __global float* output) {

  int gid = get_global_id(0);
  float scale = 2.35;
 float x = *input;
 int i=0;
 for(;i<62500000;++i)
 {
    scale = scale * (x);
  }
  output[gid] = scale;
}

and here is the Host Program

  #include "gpu_test.h"


 int  main ()
 {


gpu_intialize();

cl_int errNum;
cl_platform_id      platform;            /* OpenCL platform. */
cl_device_id        device;              /* OpenCL device. */
cl_context          context;             /* OpenCL context. */
cl_command_queue    commandQueue;        /* OpenCL command queue. */
cl_program          program;                 /* OpenCL program. */
cl_kernel           kernel;              /* OpenCL kernel. */
cl_mem              memObject_input;         /* OpenCL memory buffer objects. */
cl_mem              memObject_output;        /* OpenCL memory buffer objects. */
cl_event            eventFlops;          /* OpenCL event. */
cl_event        eventWrite;


long time0=0;
long time1=0;
size_t nSize=0;
char *buf;
size_t wg_size=0;
size_t wg_multiple=0;
size_t globalWorkSize[1] = {16};
size_t localWorkSize[1] = {1};

double flopCount=0;  
double consumed_time_in_nano_seconds=0;
double consumed_time_in_seconds=0;
double flop=0;
double Gflop=0;

cl_long maxAllocSizeBytes = 0;
cl_long maxComputeUnits = 0;
cl_long maxWorkGroupSize = 0;
cl_ulong end=0;
cl_ulong start=0;
cl_ulong private_usage=0;
cl_ulong local_usage=0;

printf("Initializing OpenCL...\n");

/* Get the available platform. */
errNum = clGetPlatformIDs(1, &platform, NULL);
clmCHECKERROR(errNum, CL_SUCCESS);

/* Get a GPU device. */
errNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
clmCHECKERROR(errNum, CL_SUCCESS);

/* Create the context. */
context = clCreateContext(0, 1, &device, NULL, NULL, &errNum);
clmCHECKERROR(errNum, CL_SUCCESS);

/* Create a command-queue. */
commandQueue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &errNum);
clmCHECKERROR(errNum, CL_SUCCESS);

printf("Creating program...\n");
printf("Creating program with binary...\n");

nSize = kernel_binary_program_size;
buf = (char *)&kernel_binary_program;

program = clCreateProgramWithBinary(context, 1, &device, &nSize, (const unsigned char**)&buf, NULL, NULL);
if (program == NULL) 
{
    printf("program creation failed \n");
}

printf("Program created successfully! \n");

kernel = clCreateKernel (program, "flops", &errNum);
clmCHECKERROR(errNum, CL_SUCCESS);

printf("Kernel created successfully! \n");


// Identify maximum size of the global memory on the device side


clGetDeviceInfo (device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                     sizeof(cl_long), &maxAllocSizeBytes, NULL);
clGetDeviceInfo (device, CL_DEVICE_MAX_COMPUTE_UNITS,
                     sizeof(cl_long), &maxComputeUnits, NULL);
clGetDeviceInfo (device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                     sizeof(cl_long), &maxWorkGroupSize, NULL);


printf("Max compute unit is %d \n", maxComputeUnits);
printf("Max Work Group size is %d \n", maxWorkGroupSize);



// Creating buffer on the device side



float *input_buffer =(float *) malloc (sizeof(float));
float *output_buffer =(float *) malloc (sizeof(float));

input_buffer[0] = 5.36;
output_buffer[0] = 0;

printf("Host buffer been prepared! \n");

memObject_input = clCreateBuffer (context, CL_MEM_READ_ONLY,
                            sizeof(float), NULL, &errNum);
clmCHECKERROR(errNum,CL_SUCCESS);


memObject_output = clCreateBuffer (context, CL_MEM_WRITE_ONLY,
                            sizeof(float), NULL, &errNum);
clmCHECKERROR(errNum,CL_SUCCESS);

if ((memObject_input == NULL) || (memObject_output == NULL)) {
    printf ("Error creating memory objects \n");
    return false;
}

errNum = clEnqueueWriteBuffer (commandQueue, memObject_input, CL_FALSE, 0,  sizeof(float),
            input_buffer, 0, NULL, &eventWrite);
clmCHECKERROR(errNum,CL_SUCCESS);
    errNum = clWaitForEvents (1, &eventWrite);
    clmCHECKERROR(errNum,CL_SUCCESS);
    clReleaseEvent(eventWrite);

errNum = clEnqueueWriteBuffer (commandQueue, memObject_output, CL_FALSE, 0,  sizeof(float),
                output_buffer, 0, NULL, &eventWrite);
clmCHECKERROR(errNum,CL_SUCCESS);
errNum = clWaitForEvents (1, &eventWrite);
clmCHECKERROR(errNum,CL_SUCCESS);
clReleaseEvent(eventWrite);


errNum = clSetKernelArg (kernel, 0, sizeof(cl_mem), &memObject_input);
    clmCHECKERROR(errNum, CL_SUCCESS);

errNum = clSetKernelArg (kernel, 1, sizeof(cl_mem), &memObject_output);
    clmCHECKERROR(errNum, CL_SUCCESS);



errNum = clGetKernelWorkGroupInfo (kernel, device,
                                   CL_KERNEL_WORK_GROUP_SIZE,
                                   sizeof (wg_size), &wg_size, NULL);
    clmCHECKERROR (errNum, CL_SUCCESS);

errNum = clGetKernelWorkGroupInfo (kernel, device,
                                   CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
                                  sizeof (wg_multiple), &wg_multiple, NULL);
clmCHECKERROR (errNum, CL_SUCCESS);

errNum = clGetKernelWorkGroupInfo (kernel, device,
                                   CL_KERNEL_LOCAL_MEM_SIZE,
                                   sizeof (local_usage), &local_usage, NULL);
clmCHECKERROR (errNum, CL_SUCCESS);

errNum = clGetKernelWorkGroupInfo (kernel, device,
                                   CL_KERNEL_PRIVATE_MEM_SIZE,
                                   sizeof (private_usage), &private_usage, NULL);
clmCHECKERROR (errNum, CL_SUCCESS);


printf("Work Group size is %d \n",wg_size); 
printf("Preferred Work Group size is %d \n",wg_multiple);
printf("Local memory size is %d \n",local_usage);
printf("Private memory size is %d \n",private_usage);       


errNum = clEnqueueNDRangeKernel (commandQueue, kernel, 1, NULL,
                                globalWorkSize, localWorkSize,
                                0, NULL, &eventFlops);
         clmCHECKERROR (errNum, CL_SUCCESS);

printf("Waiting for execution to finish \n");
errNum = clWaitForEvents(1, &eventFlops);
clmCHECKERROR(errNum, CL_SUCCESS);

errNum = clGetEventProfilingInfo(eventFlops, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, 0);
errNum |= clGetEventProfilingInfo(eventFlops, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, 0);
clmCHECKERROR(errNum, CL_SUCCESS);

printf("Kernel execution terminated successfully! \n");

flopCount = 62500000  * 16; // 16 work item
consumed_time_in_nano_seconds = (double)(end - start);
consumed_time_in_seconds = consumed_time_in_nano_seconds/1000000000;

printf("consumed_time_in_nano_seconds : %f \n",consumed_time_in_nano_seconds);
printf("consumed_time_in_seconds : %f \n",consumed_time_in_seconds);

flop = flopCount / consumed_time_in_seconds;
Gflop = flop / 1000000000;

printf("Floating Operations per second : %f \n",flop);
printf("Giga Floating Operations per second : %f \n",Gflop);

cleanup(context,program,kernel,memObject_input,memObject_output,eventFlops);
printf ("Program executed successfully! \n");
 }


void cleanup (cl_context context,
      cl_program program,
      cl_kernel kernel, 
      cl_mem memObject_input, 
      cl_mem memObject_output,
      cl_event eventFlops) {

 if (   eventFlops != NULL)
 clReleaseEvent(eventFlops);

 if (memObject_input != NULL)
 clReleaseMemObject (memObject_input);

 if (memObject_output != NULL)
 clReleaseMemObject (memObject_output);

 if (kernel != NULL)
 clReleaseKernel (kernel);

 if (program != NULL)
 clReleaseProgram (program);

  }



void checkError(cl_int Value,cl_int Reference,const char* FileName,const 
int LineNumber)
{
   if (Reference != Value)
  {
    printf("\n !!! Error # %i at line %i , in file %s !!!\n\n",
                 Value, LineNumber, FileName);

     printf("Exiting...\n");
      exit(EXIT_FAILURE);
  }
}  

 void gpu_intialize()
 {

   Error error;
   printf("[GalCore] GalCore_TaskInit\n");
   error = GalCore_TaskInit();

    if (error == 0) 
    printf("[GalCore] GalCore_TaskInit successful \ne");
   else
    printf("[GalCore] GalCore_TaskInit Failed \n"); 

  printf("[GalCore] VivanteInit\n");
   VivanteInit();
    printf("[GalCore] VivanteInit successful \n");
    }

So i was wondering what might be the problem of that implementation

score 1 · Answer 1 · answered Jul 16 '19 at 16:45

in no particular order:

do not use host timers (getTime()) to measure performance of on-device code. Use clGetEventProfilingInfo() on the eventFlops.
you need to release the eventFlops event to avoid memleaks
The 32GFlops theoretical performance is with heavily arithmetic code. Your code does one Flop per 1 load + 1 store, IOW it's limited by memory. If you wanted such kernel to run at 32GFlops, you'd need 2x 128GB/s memory bandwidth.
many of the low-power chip GPUs have vectorized units, and for best performance the code must use vectors (e.g. float4). Sometimes the compiler manages to vectorize the code, but quite often it doesn't.

Measuring FLOPS on Vivante G7200L on IMX8

1 Answers1