1

Is there a reason why my code runs slower when I enable OpenACC while compiling with GCC? I'm currently using GCC 6.3.0 on Windows 10, and I'm really unsure why this is happening.

This is the command I'm compiling with: g++ -fopenacc -o a Example.cpp

And here is my C++ code:

#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <chrono>


// Matrix buffers shared with main, which allocates N * N doubles for each:
// A and B are filled with input values, C receives the computed results.
double* A = nullptr;
double* B = nullptr;
double* C = nullptr;


int main(int argc, char* argv[]) {

    long long N = 100;

    A = new double[N * N];
    B = new double[N * N];
    C = new double[N * N];

    srand(42);

    for (int i = 0; i < N; i++) {

        for (int j = 0; j < N; j++) {
            A[i * N + j] = rand();
            B[i * N + j] = rand();
        }
    }

    
    for (int x = 0; x < 10; x++) {
        auto start_time = std::chrono::high_resolution_clock::now();
        #pragma acc kernels 
        {
            #pragma acc loop independent
            for (int i = 0; i < N; i++) {
                #pragma acc loop independent
                for (int j = 0; j < N; j++) {
                    double total = 0;
                    #pragma acc loop independent reduction (+: total)
                    for (int k = 0; k < N; k++) {
                        total += A[i * N + j] * B[k * N + j];
                    }
                    C[i * N + j] = total;
                }
            }
        }
        


        auto end_time = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> duration = end_time - start_time;
        printf("%f seconds\n", duration.count());
    
    }

    

    return 0;
}
newang
  • 179
  • 2
  • 2
  • 7

1 Answer

3

I'm not sure GNU 6.3 supported OpenACC, at least not well, nor do I know whether it was supported on Windows. I'm using GNU 10.2 on Linux, where OpenACC support is much better.

However, GNU still doesn't handle the "kernels" directive well, so I'd suggest using "parallel" instead. Also, you're missing a data region, so you'd get a runtime error if this code were offloaded.

For example:

% g++ --version
g++ (GCC) 10.2.0
Copyright (C) 2020 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
%
% cat mm.cpp
#include <stdlib.h>
#include <stdio.h>
#include <cassert>
#include <chrono>

// Selects the OpenACC compute construct substituted into the
// `#pragma acc ACC_TYPE` line in main: `parallel` when the program is built
// with -DUSE_PARALLEL, otherwise `kernels`.
#ifdef USE_PARALLEL
#define ACC_TYPE parallel
#else
#define ACC_TYPE kernels
#endif

// Global matrix buffers; main allocates N * N doubles for each one.
// A and B hold the inputs, C is written by the timed loop nest.
double* A = nullptr;
double* B = nullptr;
double* C = nullptr;


int main(int argc, char* argv[]) {

    long long N = 100;

    A = new double[N * N];
    B = new double[N * N];
    C = new double[N * N];

    srand(42);

    for (int i = 0; i < N; i++) {

        for (int j = 0; j < N; j++) {
            A[i * N + j] = rand();
            B[i * N + j] = rand();
        }
    }

    #pragma acc data copyin(A[:N*N],B[:N*N]) copyout(C[:N*N])
    for (int x = 0; x < 10; x++) {
        auto start_time = std::chrono::high_resolution_clock::now();
        #pragma acc ACC_TYPE
        {
            #pragma acc loop independent
            for (int i = 0; i < N; i++) {
                #pragma acc loop independent
                for (int j = 0; j < N; j++) {
                    double total = 0;
                    #pragma acc loop independent reduction (+: total)
                    for (int k = 0; k < N; k++) {
                        total += A[i * N + j] * B[k * N + j];
                    }
                    C[i * N + j] = total;
                }
            }
        }
        auto end_time = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double> duration = end_time - start_time;
        printf("%f seconds\n", duration.count());
    }
    return 0;
}
% g++ -fopenacc mm.cpp -Ofast ; a.out
0.020057 seconds
0.020025 seconds
0.020022 seconds
0.020021 seconds
0.019538 seconds
0.018271 seconds
0.018270 seconds
0.018264 seconds
0.018274 seconds
0.018270 seconds
% g++ -fopenacc mm.cpp -Ofast -DUSE_PARALLEL ; a.out
0.000123 seconds
0.000086 seconds
0.000081 seconds
0.000078 seconds
0.000078 seconds
0.000077 seconds
0.000077 seconds
0.000076 seconds
0.000076 seconds
0.000076 seconds
Mat Colgrove
  • 5,441
  • 1
  • 10
  • 11