Assume the following relations:
a = M u
b = M v
c = M w
where a, b and c are three [6x1] vectors, M is a [6x15] matrix, and u, v and w are three [15x1] vectors.
These operations are performed for a set of several different M matrices. I thus have a stack of M matrices, namely M_stack, with dimensions ranging from [2000x15] to [3000x15] depending on the length of the stack. Multiplying u, v and w by M_stack gives a_stack, b_stack and c_stack, with dimensions ranging from [2000x1] to [3000x1].
a_stack = M_stack u
b_stack = M_stack v
c_stack = M_stack w
Now I was thinking that I could speed up this operation by stacking u, v and w in a matrix Z = [u v w] with dimensions [15x3], and obtaining D = [a_stack b_stack c_stack] with dimensions ranging from [2000x3] to [3000x3]:
D = M_stack Z
I thought that this would be faster, as Eigen should optimize and parallelize matrix-matrix multiplication. However, I found that the version where I compute a_stack, b_stack and c_stack individually is faster.
Here is an example using Google Benchmark.
#include <Eigen/Dense>
#include <iostream>
#include <benchmark/benchmark.h>
// Benchmark fixture comparing one matrix-matrix product (D = M_stack * Z)
// against three separate matrix-vector products (a/b/c stacks), each in
// four variants: plain assignment, .noalias() assignment, fully dynamic
// Eigen types, and types with compile-time-fixed inner/column dimensions.
struct Stack {
// Matrix-matrix form: a single [rows x 15] * [15 x 3] product.
void update()
{
D = M_stack*Z;
}
// Matrix-vector form: three [rows x 15] * [15 x 1] products.
void updateInSeries()
{
a_stack = M_stack*u;
b_stack = M_stack*v;
c_stack = M_stack*w;
}
// Same as update(), but .noalias() promises Eigen the destination does
// not alias the operands, letting it skip the temporary it would
// otherwise create for a product assignment.
void updateNoAlias()
{
D.noalias() = M_stack*Z;
}
// Same as updateInSeries(), with the .noalias() promise on each product.
void updateInSeriesNoAlias()
{
a_stack.noalias() = M_stack*u;
b_stack.noalias() = M_stack*v;
c_stack.noalias() = M_stack*w;
}
// Row count for the stacked matrices; declared before the matrix members
// so in-class initializers below may read it (members initialize in
// declaration order).
unsigned int rows { 2300 };
// Fully dynamic-size operands (MatrixXd everywhere, even for vectors).
Eigen::MatrixXd M_stack { Eigen::MatrixXd::Random(rows, 15) };
Eigen::MatrixXd Z { Eigen::MatrixXd::Random(15, 3) };
Eigen::MatrixXd D { Eigen::MatrixXd::Random(rows, 3) };
Eigen::MatrixXd u { Eigen::MatrixXd::Random(15, 1) };
Eigen::MatrixXd v { Eigen::MatrixXd::Random(15, 1) };
Eigen::MatrixXd w { Eigen::MatrixXd::Random(15, 1) };
Eigen::MatrixXd a_stack { Eigen::MatrixXd::Random(rows, 1) };
Eigen::MatrixXd b_stack { Eigen::MatrixXd::Random(rows, 1) };
Eigen::MatrixXd c_stack { Eigen::MatrixXd::Random(rows, 1) };
// "Sized" variants: identical operations, but the operand types carry
// compile-time column (and where possible row) dimensions so Eigen can
// specialize the product kernels at compile time.
void updateSized()
{
D_sized = M_stack_sized*Z_sized;
}
void updateInSeriesSized()
{
a_stack_sized = M_stack_sized*u_sized;
b_stack_sized = M_stack_sized*v_sized;
c_stack_sized = M_stack_sized*w_sized;
}
void updateNoAliasSized()
{
D_sized.noalias() = M_stack_sized*Z_sized;
}
void updateInSeriesNoAliasSized()
{
a_stack_sized.noalias() = M_stack_sized*u_sized;
b_stack_sized.noalias() = M_stack_sized*v_sized;
c_stack_sized.noalias() = M_stack_sized*w_sized;
}
Eigen::Matrix<double, Eigen::Dynamic, 15> M_stack_sized { Eigen::MatrixXd::Random(rows, 15) };
Eigen::Matrix<double, 15, 3> Z_sized { Eigen::MatrixXd::Random(15, 3) };
Eigen::Matrix<double, Eigen::Dynamic, 3> D_sized { Eigen::MatrixXd::Random(rows, 3) };
Eigen::Matrix<double, 15, 1> u_sized { Eigen::MatrixXd::Random(15, 1) };
Eigen::Matrix<double, 15, 1> v_sized { Eigen::MatrixXd::Random(15, 1) };
Eigen::Matrix<double, 15, 1> w_sized { Eigen::MatrixXd::Random(15, 1) };
Eigen::Matrix<double, Eigen::Dynamic, 1> a_stack_sized { Eigen::MatrixXd::Random(rows, 1) };
Eigen::Matrix<double, Eigen::Dynamic, 1> b_stack_sized { Eigen::MatrixXd::Random(rows, 1) };
Eigen::Matrix<double, Eigen::Dynamic, 1> c_stack_sized { Eigen::MatrixXd::Random(rows, 1) };
};
int main(int argc, char *argv[])
{
const unsigned int repetitions = 5;
::benchmark::Initialize(&argc, argv);
::benchmark::RegisterBenchmark("update", [](::benchmark::State &t_state){
Stack stack;
while(t_state.KeepRunning())
stack.update();
})->Repetitions(repetitions)->Unit(::benchmark::kMicrosecond);
::benchmark::RegisterBenchmark("updateInSeries", [](::benchmark::State &t_state){
Stack stack;
while(t_state.KeepRunning())
stack.updateInSeries();
})->Repetitions(repetitions)->Unit(::benchmark::kMicrosecond);
::benchmark::RegisterBenchmark("updateNoAlias", [](::benchmark::State &t_state){
Stack stack;
while(t_state.KeepRunning())
stack.updateNoAlias();
})->Repetitions(repetitions)->Unit(::benchmark::kMicrosecond);
::benchmark::RegisterBenchmark("updateInSeriesNoAlias", [](::benchmark::State &t_state){
Stack stack;
while(t_state.KeepRunning())
stack.updateInSeriesNoAlias();
})->Repetitions(repetitions)->Unit(::benchmark::kMicrosecond);
::benchmark::RegisterBenchmark("updateSized", [](::benchmark::State &t_state){
Stack stack;
while(t_state.KeepRunning())
stack.updateSized();
})->Repetitions(repetitions)->Unit(::benchmark::kMicrosecond);
::benchmark::RegisterBenchmark("updateInSeriesSized", [](::benchmark::State &t_state){
Stack stack;
while(t_state.KeepRunning())
stack.updateInSeriesSized();
})->Repetitions(repetitions)->Unit(::benchmark::kMicrosecond);
::benchmark::RegisterBenchmark("updateNoAliasSized", [](::benchmark::State &t_state){
Stack stack;
while(t_state.KeepRunning())
stack.updateNoAliasSized();
})->Repetitions(repetitions)->Unit(::benchmark::kMicrosecond);
::benchmark::RegisterBenchmark("updateInSeriesNoAliasSized", [](::benchmark::State &t_state){
Stack stack;
while(t_state.KeepRunning())
stack.updateInSeriesNoAliasSized();
})->Repetitions(repetitions)->Unit(::benchmark::kMicrosecond);
::benchmark::RunSpecifiedBenchmarks();
return 0;
}
Running this example gives the following output in my machine.
Run on (16 X 4800 MHz CPU s)
CPU Caches:
L1 Data 48 KiB (x8)
L1 Instruction 32 KiB (x8)
L2 Unified 1280 KiB (x8)
L3 Unified 24576 KiB (x1)
Load Average: 1.11, 0.48, 0.19
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
--------------------------------------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------------------------------------
update/repeats:5 25.4 us 25.4 us 28082
update/repeats:5 25.3 us 25.3 us 28082
update/repeats:5 25.1 us 25.1 us 28082
update/repeats:5 25.3 us 25.3 us 28082
update/repeats:5 25.3 us 25.3 us 28082
update/repeats:5_mean 25.3 us 25.3 us 5
update/repeats:5_median 25.3 us 25.3 us 5
update/repeats:5_stddev 0.117 us 0.117 us 5
update/repeats:5_cv 0.46 % 0.46 % 5
updateInSeries/repeats:5 15.2 us 15.2 us 46088
updateInSeries/repeats:5 15.2 us 15.2 us 46088
updateInSeries/repeats:5 15.2 us 15.2 us 46088
updateInSeries/repeats:5 15.2 us 15.2 us 46088
updateInSeries/repeats:5 15.1 us 15.1 us 46088
updateInSeries/repeats:5_mean 15.2 us 15.2 us 5
updateInSeries/repeats:5_median 15.2 us 15.2 us 5
updateInSeries/repeats:5_stddev 0.058 us 0.057 us 5
updateInSeries/repeats:5_cv 0.38 % 0.38 % 5
updateNoAlias/repeats:5 23.6 us 23.6 us 29484
updateNoAlias/repeats:5 23.7 us 23.7 us 29484
updateNoAlias/repeats:5 23.6 us 23.6 us 29484
updateNoAlias/repeats:5 23.5 us 23.5 us 29484
updateNoAlias/repeats:5 24.0 us 24.0 us 29484
updateNoAlias/repeats:5_mean 23.7 us 23.7 us 5
updateNoAlias/repeats:5_median 23.6 us 23.6 us 5
updateNoAlias/repeats:5_stddev 0.203 us 0.203 us 5
updateNoAlias/repeats:5_cv 0.86 % 0.86 % 5
updateInSeriesNoAlias/repeats:5 14.0 us 14.0 us 50044
updateInSeriesNoAlias/repeats:5 14.0 us 14.0 us 50044
updateInSeriesNoAlias/repeats:5 14.0 us 14.0 us 50044
updateInSeriesNoAlias/repeats:5 14.0 us 14.0 us 50044
updateInSeriesNoAlias/repeats:5 14.0 us 14.0 us 50044
updateInSeriesNoAlias/repeats:5_mean 14.0 us 14.0 us 5
updateInSeriesNoAlias/repeats:5_median 14.0 us 14.0 us 5
updateInSeriesNoAlias/repeats:5_stddev 0.020 us 0.021 us 5
updateInSeriesNoAlias/repeats:5_cv 0.15 % 0.15 % 5
updateSized/repeats:5 24.6 us 24.6 us 28335
updateSized/repeats:5 24.5 us 24.5 us 28335
updateSized/repeats:5 24.5 us 24.5 us 28335
updateSized/repeats:5 24.5 us 24.5 us 28335
updateSized/repeats:5 24.4 us 24.4 us 28335
updateSized/repeats:5_mean 24.5 us 24.5 us 5
updateSized/repeats:5_median 24.5 us 24.5 us 5
updateSized/repeats:5_stddev 0.062 us 0.062 us 5
updateSized/repeats:5_cv 0.25 % 0.25 % 5
updateInSeriesSized/repeats:5 14.7 us 14.7 us 47371
updateInSeriesSized/repeats:5 14.7 us 14.7 us 47371
updateInSeriesSized/repeats:5 14.7 us 14.7 us 47371
updateInSeriesSized/repeats:5 14.7 us 14.7 us 47371
updateInSeriesSized/repeats:5 14.7 us 14.7 us 47371
updateInSeriesSized/repeats:5_mean 14.7 us 14.7 us 5
updateInSeriesSized/repeats:5_median 14.7 us 14.7 us 5
updateInSeriesSized/repeats:5_stddev 0.011 us 0.011 us 5
updateInSeriesSized/repeats:5_cv 0.08 % 0.07 % 5
updateNoAliasSized/repeats:5 23.0 us 23.0 us 30493
updateNoAliasSized/repeats:5 23.0 us 23.0 us 30493
updateNoAliasSized/repeats:5 22.9 us 22.9 us 30493
updateNoAliasSized/repeats:5 23.0 us 23.0 us 30493
updateNoAliasSized/repeats:5 23.0 us 23.0 us 30493
updateNoAliasSized/repeats:5_mean 23.0 us 23.0 us 5
updateNoAliasSized/repeats:5_median 23.0 us 23.0 us 5
updateNoAliasSized/repeats:5_stddev 0.033 us 0.033 us 5
updateNoAliasSized/repeats:5_cv 0.14 % 0.14 % 5
updateInSeriesNoAliasSized/repeats:5 13.8 us 13.8 us 50879
updateInSeriesNoAliasSized/repeats:5 13.8 us 13.8 us 50879
updateInSeriesNoAliasSized/repeats:5 13.8 us 13.8 us 50879
updateInSeriesNoAliasSized/repeats:5 13.8 us 13.8 us 50879
updateInSeriesNoAliasSized/repeats:5 13.7 us 13.7 us 50879
updateInSeriesNoAliasSized/repeats:5_mean 13.8 us 13.8 us 5
updateInSeriesNoAliasSized/repeats:5_median 13.8 us 13.8 us 5
updateInSeriesNoAliasSized/repeats:5_stddev 0.012 us 0.012 us 5
updateInSeriesNoAliasSized/repeats:5_cv 0.09 % 0.09 % 5
The benchmark actually reports the opposite of my assumption: the matrix-matrix form is much slower than the sequence of matrix-vector multiplications.
Can someone explain this phenomenon?
Moreover, these computations are the bottleneck of my application. Does anyone have insight into how these operations can be made more efficient?
Here is my CMakeLists file
cmake_minimum_required(VERSION 3.5)
project(cpp_sandbox LANGUAGES CXX)
# Require C++20; do not allow a silent fallback to an older standard.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Both dependencies provide imported targets (benchmark::benchmark,
# Eigen3::Eigen) that carry their include paths and link flags.
find_package(benchmark REQUIRED)
find_package(Eigen3 REQUIRED)
add_executable(stack_alias stack_alias.cpp)
target_link_libraries(stack_alias
PRIVATE
Eigen3::Eigen
benchmark::benchmark
)
The code is compiled in Release mode. I do not specify any additional optimization flags, as I found that leaving the Release defaults untouched is the fastest approach.