zero output when OpenACC is used

Question

I use PGI Community Edition 17.10 to compile and run the following code. Why is the output wrong (all zeros) when I add the OpenACC directives? Could you help me understand why this happens? Thanks in advance, Sajad

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include<time.h>
#include <string.h>
#include <malloc.h>
#include <cuda_runtime_api.h>

#define NX 2
#define NY 2
#define NZ 2
/* Fills C, D and E inside an OpenACC kernels region, then writes every
 * (C[i], D[j], E[k]) combination to the file "BB-and-A.csv". */
int main(void)
{
/* The loop indices are declared static, i.e. with static storage duration
 * rather than as automatic locals. */
static int  i, j, k;
/* NOTE(review): each array is initialized from a bare scalar instead of a
 * braced initializer list. A and B are never used after this declaration
 * except in the data clause below. */
static double A[NX][NY][NZ]=2 ,B[NX][NY][NZ]=10.,C[NX]=10.,D[NY]=10.,E[NZ]=10.;

FILE *file;
/* NOTE(review): the fopen result is not checked before it is used below. */
file = fopen("BB-and-A.csv", "w");
/* Data region: the copy clause lists all five arrays and also the loop
 * indices i, j, k. */
#pragma acc  data copy( A ,B,C,D,E,i, j, k)
      {
/* NOTE(review): `<=` makes each index run from 0 through N inclusive
 * (N+1 values), but C, D and E are declared with only N elements, so
 * index N is past the end of each array. Every (i, j, k) iteration also
 * stores to C[i], D[j] and E[k], so each element is written by many
 * different iterations of the nest. */
#pragma acc   kernels loop private(i, j, k)

            for (i = 0; i <= NX; i++) {
                for (j =0; j <= NY ; j++) {
                    for (k =0; k <= NZ ; k++) {
                            C[i]=i;
                            D[j]=j;
                            E[k]=k;
                    }
                }
            }
}
    /* Host-side dump: one CSV row per (i, j, k) combination. The same
     * inclusive bounds are used here, so index N is read as well. */
    for (i = 0; i <= NX; i++) {
                for (j =0; j <= NY ; j++) {
                    for (k =0; k <= NZ ; k++) {
                        fprintf(file, "%e, %e, %e \n", C[i], D[j],E[k] );
                    }
                }
    }
fclose(file);
}

score 2 · Accepted Answer · answered Mar 18 '18 at 22:05

You have a number of issues with this code.

1) Your array bounds are incorrect. Since the loops go from 0 to <= N (that is, N+1 iterations) but the arrays only have N members, you're writing off the end of the array.

2) Your loop isn't parallelizable since you're writing to each element from multiple loop iterations. To fix, I'd make these three separate loops.

3) The loop index variables shouldn't be made static. This puts them in global storage, where they are shared by all iterations, and thus causes a dependency. While you can fix this by putting them in a private clause, it's better to remove the static and let the compiler implicitly privatize them.

4) No need to copy the loop index variables.

Try something like the following:

% cat test2.c 
 #include <stdio.h> 
  #include <math.h> 
  #include <stdlib.h> 
  #include <assert.h> 
  #include <openacc.h> 
  #include<time.h> 
  #include <string.h> 
  #include <malloc.h> 
 // #include <cuda_runtime_api.h> 

  #define NX 2 
  #define NY 2 
  #define NZ 2 


  int main(void)  // Fill C, D, E on the accelerator, then write every (C[i], D[j], E[k]) triple to BB-and-A.csv; returns 0, or 1 if the file cannot be opened.
  {
  int i, j, k;  // automatic, not static, so the compiler can privatize the loop indices implicitly
  // Arrays have N+1 elements because the loops below run 0..N inclusive. Braced initializers replace bare scalars (`=2`), which standard C rejects for arrays.
  static double A[NX+1][NY+1][NZ+1]={{{2}}} ,B[NX+1][NY+1][NZ+1]={{{10.}}},C[NX+1]={10.},D[NY+1]={10.},E[NZ+1]={10.};
  FILE *file = fopen("BB-and-A.csv", "w");
  if (file == NULL) { perror("BB-and-A.csv"); return 1; }  // fopen can fail; never hand a NULL stream to fprintf
  #pragma acc data copy(A,B,C,D,E)
  {
  #pragma acc kernels
  {
  // Three independent 1-D loops: each element is written by exactly one
  // iteration, so every loop can be parallelized on its own.
  for (i = 0; i <= NX; i++) C[i]=i;
  for (j =0; j <= NY ; j++) D[j]=j;
  for (k =0; k <= NZ ; k++) E[k]=k;
  } }
  // Host-side dump after the data region has ended: one CSV row per
  // (i, j, k) combination, (NX+1)*(NY+1)*(NZ+1) rows in total.
  for (i = 0; i <= NX; i++) {
  for (j =0; j <= NY ; j++) {
  for (k =0; k <= NZ ; k++) {
    fprintf(file,"%e, %e, %e \n", C[i], D[j],E[k] );
  } } }

  fclose(file);
  return 0;
  }
 % pgcc test2.c -ta=tesla:cc60 -Minfo=accel 
 main: 
      23, Generating copy(A[:][:][:],B[:][:][:],C[:],E[:],D[:]) 
      27, Loop is parallelizable 
          Accelerator kernel generated 
          Generating Tesla code 
          27, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ 
      28, Loop is parallelizable 
          Accelerator kernel generated 
          Generating Tesla code 
          28, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ 
      29, Loop is parallelizable 
          Accelerator kernel generated 
          Generating Tesla code 
          29, #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */ 
 % a.out 
 % cat BB-and-A.csv 
 0.000000e+00, 0.000000e+00, 0.000000e+00 
 0.000000e+00, 0.000000e+00, 1.000000e+00 
 0.000000e+00, 0.000000e+00, 2.000000e+00 
 0.000000e+00, 1.000000e+00, 0.000000e+00 
 0.000000e+00, 1.000000e+00, 1.000000e+00 
 0.000000e+00, 1.000000e+00, 2.000000e+00 
 0.000000e+00, 2.000000e+00, 0.000000e+00 
 0.000000e+00, 2.000000e+00, 1.000000e+00 
 0.000000e+00, 2.000000e+00, 2.000000e+00 
 1.000000e+00, 0.000000e+00, 0.000000e+00 
 1.000000e+00, 0.000000e+00, 1.000000e+00 
 1.000000e+00, 0.000000e+00, 2.000000e+00 
 1.000000e+00, 1.000000e+00, 0.000000e+00 
 1.000000e+00, 1.000000e+00, 1.000000e+00 
 1.000000e+00, 1.000000e+00, 2.000000e+00 
 1.000000e+00, 2.000000e+00, 0.000000e+00 
 1.000000e+00, 2.000000e+00, 1.000000e+00 
 1.000000e+00, 2.000000e+00, 2.000000e+00 
 2.000000e+00, 0.000000e+00, 0.000000e+00 
 2.000000e+00, 0.000000e+00, 1.000000e+00 
 2.000000e+00, 0.000000e+00, 2.000000e+00 
 2.000000e+00, 1.000000e+00, 0.000000e+00 
 2.000000e+00, 1.000000e+00, 1.000000e+00 
 2.000000e+00, 1.000000e+00, 2.000000e+00 
 2.000000e+00, 2.000000e+00, 0.000000e+00 
 2.000000e+00, 2.000000e+00, 1.000000e+00 
 2.000000e+00, 2.000000e+00, 2.000000e+00

zero output when OpenACC is used

1 Answer