I'm trying to run the OpenACC tutorial at https://gcc.gnu.org/wiki/OpenACC#OpenACC_kernels_Construct_Optimization_Tutorial
The compiler is the 64-bit g++ 9.2.0 from the MSYS2 MinGW-w64 package.
C:\Users\TJ\Documents\GpuDemo>where g++
C:\msys64\mingw64\bin\g++.exe
C:\Users\TJ\Documents\GpuDemo>g++ --version
g++ (Rev2, Built by MSYS2 project) 9.2.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
Here's the command that builds my code:
g++ -m64 -std=c++17 gpudemo.cpp -o gpudemo.exe -fopenmp -fopenacc
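(If it's relevant: my understanding, which may be wrong, is that -fopenacc can only offload if GCC itself was configured with an offload target, which should appear in the "Configured with:" line printed by

C:\Users\TJ\Documents\GpuDemo>g++ -v

as something like --enable-offload-targets=nvptx-none. I'm not sure whether the MSYS2 build is configured that way.)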
The single-thread and OpenMP multi-thread calls work fine, but the OpenACC code is not going to the GPU; it runs on the CPU, and its run time is the same as the single-thread run time. My computer is a Lenovo D20 with dual Intel Xeon 5675 processors (6 cores each) and an NVIDIA GeForce GTX 970 video card, running Windows 7 Pro SP1 64-bit.
Program output:
C:\Users\TJ\Documents\GpuDemo>gpudemo
Multiply a 2000x2000 matrix.
single thread: 54104.1 milliseconds
multi thread: 5036.29 milliseconds
GPU: 54371.1 milliseconds
If I set the environment variable ACC_DEVICE_TYPE=NVIDIA, it gives an error "libgomp: device type NVIDIA not supported."
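In case it helps with diagnosis, here's a minimal check I can build with the same compiler (a sketch assuming GCC's openacc.h and the standard acc_get_num_devices() / acc_device_nvidia names; I'm not certain this is the recommended way to query the runtime):

// check_acc.cpp - ask the OpenACC runtime which devices it can see
#include <openacc.h>
#include <iostream>

int main()
{
    // Number of NVIDIA devices libgomp knows about; 0 means no NVIDIA offload target is available.
    std::cout << "NVIDIA devices: " << acc_get_num_devices(acc_device_nvidia) << "\n";
    // The host "device" is always available; it's what the CPU fallback runs on.
    std::cout << "host devices:   " << acc_get_num_devices(acc_device_host) << "\n";
    return 0;
}

(compiled with g++ -fopenacc check_acc.cpp -o check_acc.exe)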
How can I get this tutorial code to use the GPU? Here's the full program:
// https://gcc.gnu.org/wiki/OpenACC
#include <iostream>
#include <chrono>
#include <cstdlib>   // srand, rand
#include <ctime>     // time

#define N 2000
void matrix_multiply_single_thread (float r[N][N], const float a[N][N], const float b[N][N])
{
    for (int j = 0; j < N; j++)
    {
        for (int i = 0; i < N; i++)
        {
            float sum = 0;
            for (int k = 0; k < N; k++)
                sum += a[i][k] * b[k][j];
            r[i][j] = sum;
        }
    }
}
void matrix_multiply_multi_thread (float r[N][N], const float a[N][N], const float b[N][N])
{
    #pragma omp parallel for
    for (int j = 0; j < N; j++)
    {
        for (int i = 0; i < N; i++)
        {
            float sum = 0;
            for (int k = 0; k < N; k++)
                sum += a[i][k] * b[k][j];
            r[i][j] = sum;
        }
    }
}
void matrix_multiply_gpu (float r[N][N], const float a[N][N], const float b[N][N])
{
    #pragma acc kernels \
        copy(r[0:N][0:N], a[0:N][0:N], b[0:N][0:N])
    {
        #pragma acc loop independent
        for (int j = 0; j < N; j++)
        {
            #pragma acc loop independent
            for (int i = 0; i < N; i++)
            {
                float sum = 0;
                // #pragma acc loop seq
                #pragma acc loop independent reduction(+: sum)
                for (int k = 0; k < N; k++)
                    sum += a[i][k] * b[k][j];
                r[i][j] = sum;
            }
        }
    }
}
static float a[N][N], b[N][N], r[N][N];

int main()
{
    std::cout << "Multiply a " << N << "x" << N << " matrix.\n\n";

    srand(time(0));
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            a[i][j] = rand();
            b[i][j] = rand();
        }
    }

    auto start = std::chrono::high_resolution_clock::now();
    matrix_multiply_single_thread(r, a, b);
    auto finish = std::chrono::high_resolution_clock::now();
    auto microseconds = std::chrono::duration_cast<std::chrono::microseconds>(finish - start);
    double milliseconds = (double)microseconds.count() / 1000;
    std::cout << "\nsingle thread: " << milliseconds << " milliseconds\n";

    start = std::chrono::high_resolution_clock::now();
    matrix_multiply_multi_thread(r, a, b);
    finish = std::chrono::high_resolution_clock::now();
    microseconds = std::chrono::duration_cast<std::chrono::microseconds>(finish - start);
    milliseconds = (double)microseconds.count() / 1000;
    std::cout << "multi thread: " << milliseconds << " milliseconds\n";

    start = std::chrono::high_resolution_clock::now();
    matrix_multiply_gpu(r, a, b);
    finish = std::chrono::high_resolution_clock::now();
    microseconds = std::chrono::duration_cast<std::chrono::microseconds>(finish - start);
    milliseconds = (double)microseconds.count() / 1000;
    std::cout << "GPU: " << milliseconds << " milliseconds\n";

    return 0;
}