I'm working on a complex framework which uses std::function<>
as the argument type of many functions. While profiling, I found that the following is one of the performance problems.
Can somebody explain to me why Loop3a is so slow? I expected that inlining would be applied and that the timings would be the same, and likewise for the generated assembly. Is there any way to improve the performance, or a different approach? Does C++17 change anything in this respect?
#include <iostream>
#include <functional>
#include <chrono>
#include <cmath>
/// Upper bound (inclusive) of each of the three nested loop indices;
/// every benchmark variant therefore evaluates N * N * N terms.
static constexpr unsigned N = 300u;
/// Benchmark variant that evaluates the summed term through a std::function member.
///
/// Each evaluation of `fn` is dispatched through std::function's type-erased
/// call operator, so the optimizer generally cannot inline the lambda body
/// into the loop.
struct Loop3a
{
    /// Resets `sum` to zero, then adds fn(i, j, k) for every i, j, k in [1, N].
    ///
    /// @throws std::bad_function_call if `fn` has been replaced by an empty
    ///         std::function.
    void impl()
    {
        sum = 0.0;
        for (unsigned i = 1; i <= N; ++i) {
            for (unsigned j = 1; j <= N; ++j) {
                for (unsigned k = 1; k <= N; ++k) {
                    // The unsigned indices are converted to double at the call.
                    sum += fn(i, j, k);
                }
            }
        }
    }

    /// Term being accumulated; defaults to sin(a) + log(b / (c + 1)).
    std::function<double(double, double, double)> fn = [](double a, double b, double c) {
        const auto ratio = [](double x, double y) { return x / (y + 1); };
        // std:: qualification: <cmath> only guarantees the std:: overloads.
        return std::sin(a) + std::log(ratio(b, c));
    };

    /// Total produced by the most recent impl() call; 0.0 before the first call.
    double sum = 0.0;
};
/// Benchmark variant that evaluates the summed term directly inside the loop,
/// without any callable indirection.
struct Loop3b
{
    /// Resets `sum` to zero, then adds sin(i) + log(j / (k + 1)) for every
    /// i, j, k in [1, N].
    void impl()
    {
        sum = 0.0;
        for (unsigned i = 1; i <= N; ++i) {
            for (unsigned j = 1; j <= N; ++j) {
                for (unsigned k = 1; k <= N; ++k) {
                    // (k + 1) is computed in unsigned arithmetic and only then
                    // promoted to double by the division, as in the original.
                    sum += std::sin(static_cast<double>(i))
                         + std::log(static_cast<double>(j) / (k + 1));
                }
            }
        }
    }

    /// Total produced by the most recent impl() call; 0.0 before the first call.
    double sum = 0.0;
};
int main()
{
using Clock = std::chrono::high_resolution_clock;
using TimePoint = std::chrono::time_point<Clock>;
TimePoint start, stop;
Loop3a a;
Loop3b b;
start = Clock::now();
a.impl();
stop = Clock::now();
std::cout << "A: " << std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count();
std::cout << "ms\n";
start = Clock::now();
b.impl();
stop = Clock::now();
std::cout << "B: " << std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count();
std::cout << "ms\n";
return a.sum == b.sum;
}
Sample output using g++5.4 with "-O2 -std=c++14":
A: 1794ms
B: 906ms
In the profiler I can see many of these internals:
double&& std::forward<double>(std::remove_reference<double>::type&)
std::_Function_handler<double (double, double, double), Loop3a::fn::{lambda(double, double, double)#1}>::_M_invoke(std::_Any_data const&, double, double, double)
Loop3a::fn::{lambda(double, double, double)#1}* const& std::_Any_data::_M_access<Loop3a::fn::{lambda(double, double, double)#1}*>() const