How can I check I am actually using GPU parallelization with Openacc(or OpenMP)?

Question

I am trying to use GPU parallelization with Openacc. (I am using GCC in windows)

g++ -fopenacc -o D:\Dropbox\cpp_files\Practice_openmp_denser\testing.exe D:\Dropbox\cpp_files\Practice_openmp_denser\temp.cpp

Then I could run testing.exe but I am not sure whether Openacc actually works with my code or not because I don't see any change in speed.

So I was wondering if there is any command that I can check whether Openacc is working.

Any help? Thanks in advance.

I put my code just in case.

#include <iostream>
#include <cstdio>
#include <chrono>
#include <vector>
#include <math.h>       // power
#include <cmath>        // abs
#include <fstream>
#include <omp.h>

using namespace std;
using namespace chrono;

// Dynamically allocation with values(float)
void dallo_fn(float**** pMat, int Na, int Nd, int Ny) {
    float*** Mat = new float** [Na];
    for (int i = 0; i < Na; i++) {
        Mat[i] = new float* [Nd];
        for (int j = 0; j < Nd; j++) {
            Mat[i][j] = new float[Ny];
            fill_n(Mat[i][j], Ny, 1);
        }
    }
    *pMat = Mat;
}

// Dynamically allocation without values(float)
void dallo_fn0(float**** pMat, int Na, int Nd, int Ny) {
    float*** Mat = new float** [Na];
    for (int i = 0; i < Na; i++) {
        Mat[i] = new float* [Nd];
        for (int j = 0; j < Nd; j++) {
            Mat[i][j] = new float[Ny];
        }
    }
    *pMat = Mat;
}

// Dynamically allocation without values(int)
void dallo_fn1(int**** pMat, int Na, int Nd, int Ny) {
    int*** Mat = new int** [Na];
    for (int i = 0; i < Na; i++) {
        Mat[i] = new int* [Nd];
        for (int j = 0; j < Nd; j++) {
            Mat[i][j] = new int[Ny];
        }
    }
    *pMat = Mat;
}

// Utility function
float utility(float a, float a_f, float d, float d_f, float y, double sig, double psi, double delta, double R) {
    float C;
    C = y + a - a_f / R - (d_f - (1 - delta) * d);
    float result;
    if (C > 0) {
        result = 1 / (1 - 1 / sig) * pow(pow(C, psi) * pow(d_f, 1 - psi), (1 - 1 / sig));
    }
    else {
        result = -999999;
    }
    return result;
}


int main()
{
    
#if defined _OPENMP
    omp_set_num_threads(8);
#endif

    float duration;

    // Iteration Parameters
    double tol = 0.000001;
    int itmax = 200;
    int H = 15;

    // Model Parameters and utility function
    double sig = 0.75;
    double beta = 0.95;
    double psi = 0.5;
    double delta = 0.1;
    double R = 1 / beta - 0.00215;

    // =============== 2. Discretizing the state space =========================

    // Size of arrays
    const int Na = 1 * 91;
    const int Nd = 1 * 71;
    const int Ny = 3;

    // Variables for discretization of state space
    const float amin = -2;
    const float amax = 7;
    const float dmin = 0.01;
    const float dmax = 7;
    const float ymin = 0.5;
    const float ymax = 1.5;
    const float Ptrans[3] = { 0.2, 0.6, 0.2 };

    // Discretization of state space
    float ca = (amax - amin) / (Na - 1.0);
    float cd = (dmax - dmin) / (Nd - 1.0);
    float cy = (ymax - ymin) / (Ny - 1.0);

    float* A = new float[Na];
    float* Y = new float[Ny];
    float* D = new float[Nd];

    for (int i = 0; i < Na; i++) {
        A[i] = amin + i * ca;
    }
    for (int i = 0; i < Nd; i++) {
        D[i] = dmin + i * cd;
    }
    for (int i = 0; i < Ny; i++) {
        Y[i] = ymin + i * cy;
    }

    // === 3. Initial guesses, Variable initialization and Transition matrix ===

    // Initial guess for value function
    float*** V;
    dallo_fn(&V, Na, Nd, Ny);
    float*** Vnew;
    dallo_fn(&Vnew, Na, Nd, Ny);

    // Initialization of other variables
    float Val[Na][Nd];
    float** Vfuture = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        Vfuture[i] = new float[Nd];
    }
    float** temphoward = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        temphoward[i] = new float[Nd];
    }

    float*** Vhoward;
    dallo_fn0(&Vhoward, Na, Nd, Ny);
    float*** tempdiff;
    dallo_fn0(&tempdiff, Na, Nd, Ny);
    int*** maxposition_a;
    dallo_fn1(&maxposition_a, Na, Nd, Ny);
    int*** maxposition_d;
    dallo_fn1(&maxposition_d, Na, Nd, Ny);

    float** mg_A_v = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        mg_A_v[i] = new float[Nd];
    }
    for (int j = 0; j < Nd; j++) {
        for (int i = 0; i < Na; i++) {
            mg_A_v[i][j] = A[i];
        }
    }

    float** mg_D_v = new float* [Na];
    for (int i = 0; i < Na; i++)
    {
        mg_D_v[i] = new float[Nd];
    }
    for (int j = 0; j < Nd; j++) {
        for (int i = 0; i < Na; i++) {
            mg_D_v[i][j] = D[j];
        }
    }

    float***** Uvec = new float**** [Na];
    for (int i = 0; i < Na; i++) {
        Uvec[i] = new float*** [Nd];
        for (int j = 0; j < Nd; j++) {
            Uvec[i][j] = new float** [Ny];
            for (int k = 0; k < Ny; k++) {
                Uvec[i][j][k] = new float* [Na];
                for (int l = 0; l < Na; l++) {
                    Uvec[i][j][k][l] = new float[Nd];
                }
            }
        }
    }

    for (int i = 0; i < Na; i++) {
        for (int j = 0; j < Nd; j++) {
            for (int k = 0; k < Ny; k++) {
                for (int l = 0; l < Na; l++) {
                    for (int m = 0; m < Nd; m++) {
                        Uvec[i][j][k][l][m] = utility(A[i], mg_A_v[l][m], D[j], mg_D_v[l][m], Y[k], sig, psi, delta, R);
                    }
                }
            }
        }
    }

    // Value function iteration
    int it;
    float dif;
    float max;
    it = 0;
    dif = 1;

    // ================ 4. Value function iteration ============================
    #pragma acc enter data copyin(Vnew, Ptrans) create(Vfuture, V, maxposition_a, maxposition_d, Vhoward)
    while (dif >= tol && it <= itmax) {
        system_clock::time_point start = system_clock::now();
        it = it + 1;
        // V = Vnew;
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    V[i][j][k] = Vnew[i][j][k];
                }
            }
        }

        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                Vfuture[i][j] = 0;
                for (int k = 0; k < Ny; k++) {
                    Vfuture[i][j] += beta * Ptrans[k] * Vnew[i][j][k]; // + beta * Ptrans[1] * Vnew[i][j][1] + beta * Ptrans[2] * Vnew[i][j][2]; // Why is this different from Vfuture[i][j] += beta * Vnew[i][j][k] * Ptrans[k]; with for k
                }
            }
        }
        
        #pragma acc parallel loop
        //#pragma omp target teams distribute parallel for private(Val)
        for (int a = 0; a < Na; a++) {
            for (int b = 0; b < Nd; b++) {
                for (int c = 0; c < Ny; c++) {
                    max = -99999;
                    for (int d = 0; d < Na; d++) {
                        for (int e = 0; e < Nd; e++) {
                            Val[d][e] = Uvec[a][b][c][d][e] + Vfuture[d][e];
                            if (max < Val[d][e]) {
                                max = Val[d][e];
                                maxposition_a[a][b][c] = d;
                                maxposition_d[a][b][c] = e;
                            }
                        }
                    }
                    Vnew[a][b][c] = max;
                }
            }
        }

        // Howard improvement
        for (int h = 0; h < H; h++) {
            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        Vhoward[i][j][k] = Vnew[i][j][k];
                    }
                }
            }

            for (int i = 0; i < Na; i++) {
                for (int j = 0; j < Nd; j++) {
                    for (int k = 0; k < Ny; k++) {
                        temphoward[i][j] = beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][0] * Ptrans[0]
                            + beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][1] * Ptrans[1]
                            + beta * Vhoward[maxposition_a[i][j][k]][maxposition_d[i][j][k]][2] * Ptrans[2];
                        Vnew[i][j][k] = temphoward[i][j] + Uvec[i][j][k][maxposition_a[i][j][k]][maxposition_d[i][j][k]];
                    }
                }
            }
        }


        // Calculate Diff
        dif = -100000;
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                for (int k = 0; k < Ny; k++) {
                    tempdiff[i][j][k] = abs(V[i][j][k] - Vnew[i][j][k]);
                    if (tempdiff[i][j][k] > dif) {
                        dif = tempdiff[i][j][k];
                    }
                }
            }
        }

        system_clock::time_point end = system_clock::now();
        std::chrono::duration<float> sec = end - start;


        cout << dif << endl;
        cout << it << endl;
        cout << sec.count() << endl;
    }
    
    /*
    for (int k = 0; k < Ny; k++) {
        for (int i = 0; i < Na; i++) {
            for (int j = 0; j < Nd; j++) {
                cout << Vnew[i][j][k];
            }
            cout << '\n';
        }
    }
    */
//    cout << omp_get_max_threads() << endl;

}

After I run the code, I don't see any GPU usage increase. So I guess I am not using GPU parallelization at all... — Changseok Ma, Mar 08 '21 at 23:09
Take an example from the documentation and get that to work then try adding the same to your current code and see it change the GPU usage. On the other hand if the GPU usage is zero, your PC might not register usage correctly. — Surt, Mar 09 '21 at 09:36
@Surt: Thanks for the comments. I actually found a sample code online using OpenACC https://stackoverflow.com/questions/66552182/how-to-offload-to-the-gpu-with-openacc-in-windows But still, I don't see any change in GPU usage. Is there any prerequisite in order to use OpenACC in Windows? — Changseok Ma, Mar 09 '21 at 17:50

score 0 · Answer 1 · answered Aug 20 '21 at 18:20

I have never used OpenACC on windows, so I don't know how to run GPU code on Windows using OpenACC.

To test whether your code is running on GPU, I believe the following snippet will help:

int main() {
    int NUM = 1000;
    int buf[NUM] = {};    // initialize the buffer to 0s
#pragma acc parallel create(buf[NUM])
    for (int i = 0; i < NUM; ++i) {
        buf[i] = 1;
    }
    if (buf[0] == 1) {
        printf("Running on CPU\n");
    } else {
        printf("Running on GPU\n");
    }
}

In this snippet code, if the code is running on GPU, the buf will not be copied back to the host, so on the host, buf[0] is still 0, and on GPU, its value is set to 1.

If the code runs on CPU, the buf[0] will be changed to 1.

Using the command line under Linux with NVHPC compiler, we can find whether the code is running on GPU using nvidia-smi command when it's running for NVIDIA GPUs.

Also, if there are data transfers in your code ( mostly it should have ), set NV_ACC_TIME=1 should print out a data trace.

During compiling time, for the NVHPC compilers, we can use -acc -Minfo=accel to see whether it shows generating Tesla code somewhere, which means a GPU code is generated.

However, I'm not sure ways on Windows to test except using the above code snippet.

How can I check I am actually using GPU parallelization with Openacc(or OpenMP)?

1 Answers1