For a small benchmark of OpenMP on an i7-6700K I wrote the following code:
#include <iostream>
#include <omp.h>
#include <vector>
#include <chrono>

constexpr int bench_rounds = 32;

int main(void) {
    using std::chrono::high_resolution_clock;
    using std::chrono::duration_cast;
    using std::chrono::duration;
    using std::chrono::milliseconds;

    size_t vec_size = 16;
    size_t large_vec_size = 1024*16;

    std::vector<double> test_vec(large_vec_size * large_vec_size, 0);

    auto t1 = high_resolution_clock::now();
    for(int k = 0; k < bench_rounds; ++k) {
        #pragma omp parallel for collapse(2)
        for(int j = 0; j < large_vec_size; ++j) {
            for(int i = 0; i < large_vec_size; ++i) {
                test_vec[j * large_vec_size + i] = i + j + test_vec[j * large_vec_size + i] * 1e-13;
            }
        }
    }
    auto t2 = high_resolution_clock::now();

    duration<double, std::milli> ms_double = t2 - t1;
    std::cout << ms_double.count() << "ms\n";
    return 0;
}
I compiled it using four different approaches:
- With the latest Intel compiler, using
icpc main.cpp -o test
- With the latest Intel compiler, using
icpc -qopenmp main.cpp -o test -liomp5
- With GCC 11.2, using
g++ main.cpp -o test
- With GCC 11.2, using
g++ -fopenmp main.cpp -o test -lgomp
The results I obtained were:
- Warning "unrecognized OpenMP #pragma #pragma omp parallel for collapse(2)", runtime: 2490 ms
- No warning, runtime: 14080 ms
- No warning, runtime: 45550 ms
- No warning, runtime: 13400 ms
The result for GCC is more or less as expected: I am running on four cores, and my speed-up is slightly larger than three. But for the Intel compiler I do not understand the result: why is it so much faster, especially when it disregards the OpenMP pragma?
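As a sanity check on the threading side (not a measurement from the runs above, just an assumption I wanted to rule out), a minimal sketch to confirm how many threads the OpenMP runtime actually uses could look like this:

    #include <cstdio>
    #include <omp.h>

    // Minimal sanity check (separate from the benchmark above): print how many
    // threads the OpenMP runtime spawns for a parallel region.
    int main(void) {
        #pragma omp parallel
        {
            #pragma omp single
            std::printf("OpenMP is using %d thread(s)\n", omp_get_num_threads());
        }
        return 0;
    }

Built with g++ -fopenmp or icpc -qopenmp, this reports the team size used for the parallel regions.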
To extend the benchmark data, as requested in the comments, my compile lines are:
g++ main.cpp -o test_gcc_clean
g++ -fopenmp main.cpp -o test_gcc_omp -lgomp
g++ -fopenmp -march=native -mavx -O3 main.cpp -o test_gcc_opt -lgomp
icpc main.cpp -o test_icc_clean
icpc -qopenmp main.cpp -o test_icc_omp -liomp5
icpc -qopenmp -march=native -mavx -O3 main.cpp -o test_icc_opt -liomp5
and my run file is:
echo "Clean GCC"
./test_gcc_clean
echo "GCC with OpenMP"
./test_gcc_omp
echo "Optimized GCC"
./test_gcc_opt
echo "Clean ICC"
./test_icc_clean
echo "ICC with OpenMP"
./test_icc_omp
echo "Optimized ICC"
./test_icc_opt
The output then is:
Clean GCC
45641.6ms
GCC with OpenMP
13358.6ms
Optimized GCC
4949.53ms
Clean ICC
2471.96ms
ICC with OpenMP
14014.6ms
Optimized ICC
13662.9ms
reflecting my earlier results. Interestingly, a program compiled with
icpc -march=native -mavx -O3 main.cpp -o test_icc_nomp
is even faster, with a runtime of 1286 ms, but emits a warning during compilation stating that it does not recognize the OpenMP pragma.
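One explanation I wanted to rule out (my own assumption, which is also why the edit below adds result verification) is that the serial ICC build drops part of the work as dead code. A minimal sketch to keep the result observable is to reduce the vector to a checksum and print it; consume is an illustrative helper name, not part of the benchmark above:

    #include <iostream>
    #include <numeric>
    #include <vector>

    // Hedged sketch: reduce the result vector to a single value and print it,
    // so the compiler cannot legally discard the benchmarked loop nest as dead code.
    // Intended to be called on test_vec right after the timing output in main().
    void consume(const std::vector<double>& v) {
        const double checksum = std::accumulate(v.begin(), v.end(), 0.0);
        std::cout << "checksum: " << checksum << "\n";
    }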
Edit:
Following the suggestions in the answer, I decided to extend the question for the sake of completeness. The extended code tests the assumption that the mixed comparison between size_t and int slows down the code, and adds a final verification of the result vectors to keep the compiler from optimizing the work away. Therefore, I used the following code:
#include <iostream>
#include <omp.h>
#include <vector>
#include <chrono>
#include <algorithm>

constexpr int bench_rounds = 32;

int main(void) {
    using std::chrono::high_resolution_clock;
    using std::chrono::duration_cast;
    using std::chrono::duration;
    using std::chrono::milliseconds;

    size_t vec_size = 16;
    size_t large_vec_size = 1024*16;

    std::vector<double> test_vec(large_vec_size * large_vec_size, 0),
        test_vec_II(large_vec_size * large_vec_size, 0),
        test_vec_III(large_vec_size * large_vec_size, 0),
        test_vec_IV(large_vec_size * large_vec_size, 0);

    // "Coll": collapse(2) with int loop indices
    auto t1 = high_resolution_clock::now();
    for(int k = 0; k < bench_rounds; ++k) {
        #pragma omp parallel for collapse(2)
        for(int j = 0; j < large_vec_size; ++j) {
            for(int i = 0; i < large_vec_size; ++i) {
                test_vec[j * large_vec_size + i] = i + j + test_vec[j * large_vec_size + i] * 1e-13;
            }
        }
    }
    auto t2 = high_resolution_clock::now();

    // "SIMD": parallel for + inner simd with int loop indices
    auto t3 = high_resolution_clock::now();
    for(int k = 0; k < bench_rounds; ++k) {
        #pragma omp parallel for
        for(int j = 0; j < large_vec_size; ++j) {
            #pragma omp simd
            for(int i = 0; i < large_vec_size; ++i) {
                test_vec_II[j * large_vec_size + i] = i + j + test_vec_II[j * large_vec_size + i] * 1e-13;
            }
        }
    }
    auto t4 = high_resolution_clock::now();

    // "CoST": collapse(2) with size_t loop indices
    auto t5 = high_resolution_clock::now();
    for(int k = 0; k < bench_rounds; ++k) {
        #pragma omp parallel for collapse(2)
        for(size_t j = 0; j < large_vec_size; ++j) {
            for(size_t i = 0; i < large_vec_size; ++i) {
                test_vec_III[j * large_vec_size + i] = i + j + test_vec_III[j * large_vec_size + i] * 1e-13;
            }
        }
    }
    auto t6 = high_resolution_clock::now();

    // "SIST": parallel for + inner simd with size_t loop indices
    auto t7 = high_resolution_clock::now();
    for(int k = 0; k < bench_rounds; ++k) {
        #pragma omp parallel for
        for(size_t j = 0; j < large_vec_size; ++j) {
            #pragma omp simd
            for(size_t i = 0; i < large_vec_size; ++i) {
                test_vec_IV[j * large_vec_size + i] = i + j + test_vec_IV[j * large_vec_size + i] * 1e-13;
            }
        }
    }
    auto t8 = high_resolution_clock::now();

    duration<double, std::milli> ms_double = t2 - t1,
        ms_double_simd = t4 - t3,
        ms_double_sizet = t6 - t5,
        ms_double_simd_sizet = t8 - t7;

    std::cout << "Coll: " << ms_double.count() << " ms\n";
    std::cout << "SIMD: " << ms_double_simd.count() << " ms\n";
    std::cout << "CoST: " << ms_double_sizet.count() << " ms\n";
    std::cout << "SIST: " << ms_double_simd_sizet.count() << " ms\n";

    // Verify that all variants computed the same result
    std::cout << "Vectors are equal: ";
    if(std::equal(test_vec.begin(), test_vec.begin() + large_vec_size * large_vec_size, test_vec_II.begin())) {
        std::cout << "True\n";
    } else {
        std::cout << "False\n";
    }
    std::cout << "Vectors are equal: ";
    if(std::equal(test_vec.begin(), test_vec.begin() + large_vec_size * large_vec_size, test_vec_III.begin())) {
        std::cout << "True\n";
    } else {
        std::cout << "False\n";
    }
    std::cout << "Vectors are equal: ";
    if(std::equal(test_vec.begin(), test_vec.begin() + large_vec_size * large_vec_size, test_vec_IV.begin())) {
        std::cout << "True\n";
    } else {
        std::cout << "False\n";
    }
    return 0;
}
and obtained the following results:
Clean GCC
Coll: 46281.8 ms
SIMD: 47917.9 ms
CoST: 44322 ms
SIST: 44275.4 ms
Vectors are equal: True
Vectors are equal: True
Vectors are equal: True
GCC with OpenMP
Coll: 13799.6 ms
SIMD: 14546 ms
CoST: 12913.8 ms
SIST: 13113.1 ms
Vectors are equal: True
Vectors are equal: True
Vectors are equal: True
Optimized GCC
Coll: 4955.54 ms
SIMD: 5080.45 ms
CoST: 5203.64 ms
SIST: 5011.17 ms
Vectors are equal: True
Vectors are equal: True
Vectors are equal: True
Optimized GCC, no OpenMP
Coll: 5201.49 ms
SIMD: 5198.48 ms
CoST: 6148.23 ms
SIST: 6279.94 ms
Vectors are equal: True
Vectors are equal: True
Vectors are equal: True
Clean ICC
Coll: 2579.12 ms
SIMD: 5315.75 ms
CoST: 5296.52 ms
SIST: 6892.02 ms
Vectors are equal: True
Vectors are equal: True
Vectors are equal: True
ICC with OpenMP
Coll: 14466 ms
SIMD: 4974.81 ms
CoST: 13539.5 ms
SIST: 4963.63 ms
Vectors are equal: True
Vectors are equal: True
Vectors are equal: True
Optimized ICC
Coll: 15753.4 ms
SIMD: 5114.96 ms
CoST: 13509.4 ms
SIST: 5100.88 ms
Vectors are equal: True
Vectors are equal: True
Vectors are equal: True
Optimized ICC, no OpenMP
Coll: 1302.34 ms
SIMD: 5200.3 ms
CoST: 5535.02 ms
SIST: 5565.15 ms
Vectors are equal: True
Vectors are equal: True
Vectors are equal: True
At least on my platform I get some interesting results:
- A loop mixing int and size_t can be optimized better by ICC (and in some cases by GCC) than a loop using only size_t.
- #pragma omp simd on the inner loop is faster than #pragma omp parallel for collapse(2), but the version without OpenMP is still significantly faster, especially for ICC (see the sketch below).
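As a possible follow-up (my own idea, not measured above), both observations could be combined by making the loop bound an int as well, so the comparison is int vs. int, and by using the combined OpenMP construct parallel for simd. A minimal sketch, with bench_kernel, v, n and rounds as illustrative names:

    #include <vector>
    #include <cstddef>

    // Hedged sketch (untested): int loop indices and an int loop bound remove the
    // mixed int/size_t comparison, and the combined "parallel for simd" construct
    // requests both threading and vectorization for the collapsed loop nest.
    void bench_kernel(std::vector<double>& v, std::size_t large_vec_size, int rounds) {
        const int n = static_cast<int>(large_vec_size);  // int bound, int comparison
        for(int k = 0; k < rounds; ++k) {
            #pragma omp parallel for simd collapse(2)
            for(int j = 0; j < n; ++j) {
                for(int i = 0; i < n; ++i) {
                    v[j * large_vec_size + i] = i + j + v[j * large_vec_size + i] * 1e-13;
                }
            }
        }
    }

Whether this variant actually beats the serial ICC build would still need to be measured.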