I am trying to implement vector addition in two ways: with OpenMP on the host and with OpenMP offloading to a device. However, the offloaded version takes more time than the host version. Why is that?
openmp-host.c
#include <assert.h>
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Host benchmark: computes c[i] = a[i] + b[i] over N floats, with the index
 * range split by hand across the threads of one OpenMP parallel region.
 *
 * argv[1] (optional) is the vector length; the default is 1000000.
 * Prints the length, the thread count and the elapsed wall-clock time of the
 * addition, then asserts that every c[i] equals N.
 * Returns 0 on success, EXIT_FAILURE on a negative length or a failed
 * allocation.
 */
int main(int argc, char *argv[]) {
    int requested = (argc > 1 ? atoi(argv[1]) : 1000000);
    if (requested < 0) {
        /* A negative value would otherwise wrap to a huge unsigned length. */
        fprintf(stderr, "vector length must be non-negative\n");
        return EXIT_FAILURE;
    }
    unsigned N = (unsigned)requested;

    float *a = calloc(N, sizeof *a);
    float *b = calloc(N, sizeof *b);
    float *c = calloc(N, sizeof *c);
    /* calloc(0, ...) may legitimately return NULL, so only N > 0 is an error. */
    if (N != 0 && (!a || !b || !c)) {
        fprintf(stderr, "failed to allocate three vectors of %u floats\n", N);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    /* Unsigned index: avoids comparing a signed counter against unsigned N. */
    for (unsigned i = 0; i < N; i++) {
        a[i] = (float)i;
        b[i] = (float)(N - i);
    }

#pragma omp parallel
    {
        unsigned thrds = (unsigned)omp_get_num_threads();
        unsigned tid = (unsigned)omp_get_thread_num();

        /* Contiguous chunks; the first `rem` threads take one extra element. */
        unsigned base = N / thrds;
        unsigned rem = N % thrds;
        unsigned len = base + (tid < rem ? 1u : 0u);
        unsigned s = tid * base + (tid < rem ? tid : rem);
        unsigned e = s + len;

        /*
         * Barriers on both sides of the timed loop make the interval measured
         * by thread 0 cover the work of every thread, not just its own chunk.
         */
#pragma omp barrier
        double t = omp_get_wtime();
        for (unsigned i = s; i < e; i++) {
            c[i] = a[i] + b[i];
        }
#pragma omp barrier
        t = omp_get_wtime() - t;

        if (tid == 0) {
            printf("N: %u # threads: %u time: %e\n", N, thrds, t);
        }
    }

    /*
     * NOTE(review): assuming IEEE-754 single precision, integers are exact in
     * float only up to 2^24, so this check may fail for larger N.
     */
    for (unsigned i = 0; i < N; i++) {
        assert(fabs(c[i] - N) < 1e-8);
    }

    free(a);
    free(b);
    free(c);
    return 0;
}
openmp-device.c
#include <assert.h>
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Offload benchmark: computes c[i] = a[i] + b[i] over N ints inside an OpenMP
 * target region.
 *
 * argv[1] (optional) is the vector length; the default is 1000000.
 * Prints the wall-clock time of one kernel launch (host<->device transfers
 * are outside the timed interval), then asserts that every c[i] equals N.
 * Returns 0 on success, EXIT_FAILURE on a negative length or a failed
 * allocation.
 */
int main(int argc, char *argv[]) {
    int N = (argc > 1 ? atoi(argv[1]) : 1000000);
    if (N < 0) {
        fprintf(stderr, "vector length must be non-negative\n");
        return EXIT_FAILURE;
    }

    int *a = calloc((size_t)N, sizeof *a);
    int *b = calloc((size_t)N, sizeof *b);
    int *c = calloc((size_t)N, sizeof *c);
    /* calloc(0, ...) may legitimately return NULL, so only N > 0 is an error. */
    if (N != 0 && (!a || !b || !c)) {
        fprintf(stderr, "failed to allocate three vectors of %d ints\n", N);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = N - i;
    }

    /*
     * a and b are inputs and are copied to the device. c is written in full
     * by the kernel, so device storage is only allocated for it (no copy-in).
     */
#pragma omp target enter data map(to: a[0:N], b[0:N]) map(alloc: c[0:N])

    /*
     * Untimed warm-up launch. The first execution of a target region may
     * include one-time costs (runtime initialisation and, for JIT-compiled
     * targets, kernel compilation) that would otherwise dominate the timing.
     */
#pragma omp target teams distribute parallel for simd
    for (int i = 0; i < N; i++) {
        c[i] = a[i] + b[i];
    }

    double t = omp_get_wtime();
#pragma omp target teams distribute parallel for simd
    for (int i = 0; i < N; i++) {
        c[i] = a[i] + b[i];
    }
    t = omp_get_wtime() - t;

    /* Copy the result back and drop the device copies of the inputs too. */
#pragma omp target exit data map(from: c[0:N]) map(release: a[0:N], b[0:N])

    printf("time: %e \n", t);

    /* The data is integral, so compare exactly rather than with a tolerance. */
    for (int i = 0; i < N; i++) {
        assert(c[i] == N);
    }

    free(a);
    free(b);
    free(c);
    return 0;
}
I compiled the programs with the following two commands, and both work fine. I have also installed the oneAPI toolkit and Level Zero.
icx -qopenmp -fopenmp-targets=spir64 openmp-device.c -o omp_device
icx -qopenmp openmp-host.c -o omp_host
Why does OpenMP offloading take more time than OpenMP on the host?