0

I am trying to implement vector addition in two ways: with OpenMP on the host, and with OpenMP offloading to a device. However, the offloaded version takes longer than the host version. Why is that?

openmp-host.c

#include <assert.h>
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Host-side vector addition: c[i] = a[i] + b[i], with the index range split
 * by hand across the threads of a single OpenMP parallel region.
 *
 * argv[1] (optional) is the element count; the default is 1000000.
 * Prints N, the thread count and the time thread 0 spent on its own slice.
 * Returns 0 on success, EXIT_FAILURE if the arrays cannot be allocated.
 */
int main(int argc, char *argv[]) {
  unsigned N = (argc > 1 ? atoi(argv[1]) : 1000000);

  float *a = calloc(N, sizeof *a);
  float *b = calloc(N, sizeof *b);
  float *c = calloc(N, sizeof *c);
  /* calloc(0, ...) may legitimately return NULL, so only treat NULL as a
   * failure when elements were actually requested. */
  if (N > 0 && (!a || !b || !c)) {
    fprintf(stderr, "failed to allocate 3 arrays of %u floats\n", N);
    free(a);
    free(b);
    free(c);
    return EXIT_FAILURE;
  }

  /* Unsigned index to match N: a signed counter would overflow before
   * reaching N when N exceeds INT_MAX. */
  for (unsigned i = 0; i < N; i++) {
    a[i] = i;
    b[i] = N - i;
  }

#pragma omp parallel
  {
    unsigned thrds = omp_get_num_threads(), tid = omp_get_thread_num();

    /* Block partition: every thread gets N / thrds elements and the first
     * `rem` threads take one extra, so the slices tile [0, N) exactly. */
    unsigned size = N / thrds, rem = N - size * thrds;
    size += (tid < rem);
    unsigned s = (tid < rem ? size * tid : (tid * size + rem)), e = s + size;

    /* t is private to each thread: it times this thread's slice only. */
    double t = omp_get_wtime();

    for (unsigned i = s; i < e; i++) {
      c[i] = a[i] + b[i];
    }

    t = omp_get_wtime() - t;

    /* Only thread 0 reports, so the printed value is thread 0's slice time;
     * there is no barrier before the second timestamp. */
    if (tid == 0)
      printf("N: %u # threads: %u time: %e\n", N, thrds, t);
  }

  /* Every element should be i + (N - i) == N. */
  for (unsigned i = 0; i < N; i++)
    assert(fabs(c[i] - N) < 1e-8);

  free(a);
  free(b);
  free(c);

  return 0;
}

openmp-device.c

#include <assert.h>
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Device-side vector addition: c[i] = a[i] + b[i], executed in an OpenMP
 * target region.
 *
 * argv[1] (optional) is the element count; the default is 1000000.
 * Prints the wall-clock time of the target region alone: the enter/exit data
 * directives sit outside the timed interval.
 * Returns 0 on success, EXIT_FAILURE on a negative count or failed allocation.
 */
int main(int argc, char *argv[]) {
  int N = (argc > 1 ? atoi(argv[1]) : 1000000);
  if (N < 0) {
    fprintf(stderr, "element count must be non-negative, got %d\n", N);
    return EXIT_FAILURE;
  }

  int *a = calloc((size_t)N, sizeof *a);
  int *b = calloc((size_t)N, sizeof *b);
  int *c = calloc((size_t)N, sizeof *c);
  /* calloc(0, ...) may legitimately return NULL, so only treat NULL as a
   * failure when elements were actually requested. */
  if (N > 0 && (!a || !b || !c)) {
    fprintf(stderr, "failed to allocate 3 arrays of %d ints\n", N);
    free(a);
    free(b);
    free(c);
    return EXIT_FAILURE;
  }

  for (int i = 0; i < N; i++) {
    a[i] = i;
    b[i] = N - i;
  }

  /* Inputs are copied to the device; c is only written by the kernel, so it
   * is allocated there without copying its (all-zero) host contents. */
#pragma omp target enter data map(to: a[0:N], b[0:N]) map(alloc: c[0:N])

  /* NOTE(review): the first target region a program launches may also pay
   * one-time runtime/device start-up cost - confirm by timing a second
   * launch of the same loop. */
  double t = omp_get_wtime();
#pragma omp target teams distribute parallel for simd
  for (int i = 0; i < N; i++) {
    c[i] = a[i] + b[i];
  }
  t = omp_get_wtime() - t;

  /* Copy the result back and drop the input mappings as well, so that no
   * device allocation outlives its host array. */
#pragma omp target exit data map(from: c[0:N]) map(release: a[0:N], b[0:N])

  printf("time: %e \n", t);

  /* Every element should be i + (N - i) == N. */
  for (int i = 0; i < N; i++)
    assert(c[i] == N);

  free(a);
  free(b);
  free(c);

  return 0;
}

I compiled both programs with the two commands below, and they build and run fine. I have installed the oneAPI toolkit and Level Zero. Device version: `icx -qopenmp -fopenmp-targets=spir64 openmp-device.c -o omp_device`. Host version: `icx -qopenmp openmp-host.c -o omp_host`.

Why does OpenMP offloading take more time than OpenMP on the host?

  • 2
    Offloading involves copying the input data to the device and copying back the result. Since the calculations you are doing are extremely simple, the time needed to copy the data rules the total runtime. – PierU Nov 16 '22 at 17:25
  • Thank you for your response. Is there a better way to solve this problem? (The offloaded version takes roughly 10 times as long as the OpenMP CPU version.) – Ravindu Hirimuthugoda Nov 16 '22 at 17:37
  • There are GPU programming techniques for overlapping data transfers with computation. I don't know whether this is possible with OpenMP (I guess it is, but I don't know OpenMP offloading well enough to say). – PierU Nov 16 '22 at 18:09

0 Answers0