I am very new to openacc and have just high-level knowledge so any help and explanation of what I am doing wrong would be appreciated.
I am trying to accelerate(parallelize) a not so straightforward nested loop that updates a flattened (3D to 1D) array using openacc directives. I have posted a simplified sample code below that when compiled using
pgcc -acc -Minfo=accel test.c
gives the following error:
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution
Code:
#include <stdio.h>
#include <stdlib.h>
#define min(a,b) (a > b) ? b : a
#define max(a,b) (a < b) ? b : a
#define NX 10
#define NY 10
#define NZ 10
struct phiType {
double dx, dy, dz;
double * distance;
};
typedef struct phiType Phi;
#pragma acc routine seq
double solve(Phi *p, int index) {
// for simplicity just returning a value
return 2;
}
void fast_sweep(Phi *p) {
// removing boundaries
int x = NX - 2;
int y = NY - 2;
int z = NZ - 2;
int startLevel = 3;
int endLevel = x + y + z;
#pragma acc data copy(p->distance[0:NX*NY*NZ])
for(int level = startLevel; level <= endLevel; level++){
int ks = max(1, level-(y + z));
int ke = min(x, level-2);
int js = max(1, level-(x + z));
int je = min(y, level-2);
#pragma acc region
{
#pragma acc loop independent
for(int k = ks; k <= ke; k++){
#pragma acc loop independent
for(int j = js; j <= je; j++){
int i = level - (k + j);
if(i > 0 && i <= z){
int index = i * NX * NY + j * NX + k;
p->distance[index] = solve(p, index);
}
}
}
}
}
}
void create_phi(Phi *p){
p->dx = 1;
p->dy = 1;
p->dz = 1;
p->distance = (double *) malloc(sizeof(double) * NX * NY * NZ);
for(int i = 0; i < NZ; i++){
for(int j = 0; j < NY; j++){
for(int k = 0; k < NX; k++){
int index = i * NX * NY + j * NX + k;
p->distance[index] = (i*j*k == 0) ? 0 : 1;
}
}
}
}
int main()
{
printf("start \n");
Phi *p = (Phi *) malloc(sizeof(Phi));
create_phi(p);
printf("calling fast sweep \n");
fast_sweep(p);
printf(" print the results \n");
for(int i = 0; i < NZ; i++){
for(int j = 0; j < NY; j++){
for(int k = 0; k < NX; k++){
int index = i * NX * NY + j * NX + k;
printf("%f ", p->distance[index]);
}
printf("\n");
}
printf("\n");
}
return 0;
}
Instead of using the region
and loop
directives, using the
#pragma acc kernels
produces the following error:
solve:
19, Generating acc routine seq
fast_sweep:
34, Generating copy(p->distance[:1000])
42, Generating copy(p[:1])
45, Loop carried dependence due to exposed use of p[:1] prevents parallelization
Accelerator scalar kernel generated
47, Loop carried dependence due to exposed use of p[:i1+1] prevents parallelization
I am running this code on
GNU/Linux
CentOS release 6.7 (Final)
GeForce GTX Titan
pgcc 15.7-0 64-bit target on x86-64 Linux -tp sandybridge