3

Searching for the best algorithm I found there is a tradeoff: complexity to implement and big constant on the one hand, and runtime complexity on the other hand. I choose LU-decomposition-based algorithm, because it is quite simple to implement and have good enough performance.

#include <valarray>
#include <vector>
#include <utility>

#include <cmath>
#include <cstddef>
#include <cassert>

template< typename value_type >
struct math
{

    using size_type = std::size_t;

    size_type const dimension_;
    value_type const & eps;

    value_type const zero = value_type(0);
    value_type const one = value_type(1);

private :

    using vector = std::valarray< value_type >;
    using matrix = std::vector< vector >;

    matrix matrix_;
    matrix minor_;

public :

    math(size_type const _dimension,
         value_type const & _eps)
        : dimension_(_dimension)
        , eps(_eps)
        , matrix_(dimension_)
        , minor_(dimension_ - 1)
    { 
        assert(1 < dimension_);
        assert(!(eps < zero));
        for (size_type r = 0; r < dimension_; ++r) {
            matrix_[r].resize(dimension_);
        }
        size_type const minor_size = dimension_ - 1;
        for (size_type r = 0; r < minor_size; ++r) {
            minor_[r].resize(minor_size);
        }
    }

    template< typename rhs = matrix >
    void
    operator = (rhs const & _matrix)
    {
        auto irow = std::begin(matrix_);
        for (auto const & row_ : _matrix) {
            auto icol = std::begin(*irow);
            for (auto const & v : row_) {
                *icol = v;
                ++icol;
            }
            ++irow;
        }
    }

    value_type
    det(matrix & _matrix,
        size_type const _dimension)
    { // calculates lower unit triangular matrix and upper triangular
        assert(0 < _dimension);
        value_type det_ = one;
        for (size_type i = 0; i < _dimension; ++i) {
            vector & ri_ = _matrix[i];
            using std::abs;
            value_type max_ = abs(ri_[i]);
            size_type pivot = i;
            {
                size_type p = i;
                while (++p < _dimension) {
                    value_type y_ = abs(_matrix[p][i]);
                    if (max_ < y_) {
                        max_ = std::move(y_);
                        pivot = p;
                    }
                }
            }
            if (!(eps < max_)) { // regular?
                return zero; // singular
            }
            if (pivot != i) {
                det_ = -det_; // each permutation flips sign of det
                ri_.swap(_matrix[pivot]);
            }
            value_type & dia_ = ri_[i];
            det_ *= dia_; // det is multiple of diagonal elements
            for (size_type j = 1 + i; j < _dimension; ++j) {
                _matrix[j][i] /= dia_;
            }
            for (size_type a = 1 + i; a < _dimension; ++a) {
                vector & a_ = minor_[a - 1];
                value_type const & ai_ = _matrix[a][i];
                for (size_type b = 1 + i; b < _dimension; ++b) {
                    a_[b - 1] = ai_ * ri_[b];
                }
            }
            for (size_type a = 1 + i; a < _dimension; ++a) {
                vector const & a_ = minor_[a - 1];
                vector & ra_ = _matrix[a];
                for (size_type b = 1 + i; b < _dimension; ++b) {
                    ra_[b] -= a_[b - 1];
                }
            }
        }
        return det_;
    }

    value_type
    det(size_type const _dimension)
    {
        return det(matrix_, _dimension);
    }

    value_type
    det()
    {
        return det(dimension_);
    }

};

// main.cpp
#include <iostream>

#include <cstdlib>

int
main()
{
    using value_type = double;
    value_type const eps = std::numeric_limits< value_type >::epsilon();
    std::size_t const dimension_ = 3;
    math< value_type > m(dimension_, eps);
    m = { // example from https://en.wikipedia.org/wiki/Determinant#Laplace.27s_formula_and_the_adjugate_matrix
            {-2.0, 2.0, -3.0},
            {-1.0, 1.0,  3.0},
            { 2.0, 0.0, -1.0}
        };
    std::cout << m.det() << std::endl; // 18
    return EXIT_SUCCESS;
}

LIVE DEMO

det() function is hottest function in the algorithm, that uses it as a part. I sure det() is not as fast as it can be, because runtime performance comparisons (using google-pprof) to reference implementation of the whole algorithm shows a disproportion towards det().

How to improve performance of det() function? What are evident optimizations to apply immediately? Should I change the indexing and memory access order or something else? Container types? Prefetching?

Typical value of dimension_ is in the range of 3 to 10 (but can be 100, if value_type is mpfr or something else).

Community
  • 1
  • 1
Tomilov Anatoliy
  • 15,657
  • 10
  • 64
  • 169
  • Using mpfr types smells of poor performance. Did you compare to reference implementations with the same `value_type` (and dimensions)? – Walter Oct 29 '15 at 10:36
  • @Walter Yes, but I've tested it only against `std::is_floating_point` types. `O()` asymptotic complexity seems to be equal (with some multiplicative constant `>1`) to *qhull*'s one. I've tested it using the same `value_type` and against identical inputs. The trouble is in a disproportion only. – Tomilov Anatoliy Oct 29 '15 at 10:40
  • Some obvious optimisations is to use automatic temporary variables instead of references in inner loops (though the compiler should do that for you, but never rely on compilers): `const value_type dia_ = ri_[i];` instead of `value_type & dia_ = ri_[i];` and `const value_type ai_ = _matrix[a][i];` instead of `value_type const & ai_ = _matrix[a][i];`. Also, the last two `for` loops can be combined (I think) giving better cache usage in case of large matrices) – Walter Oct 29 '15 at 10:50
  • @Walter references may have sense right for mpfr-like `value_type`s, but not for fundamental types though. – Tomilov Anatoliy Oct 29 '15 at 11:10
  • Why should refernces make sense for mpfr `value_type`s? – Walter Oct 29 '15 at 11:13
  • @Walter Say, if I use non-lazy implementation of algebraic numbers, then each copying leads to additional memory usage. As opposite, for fundamental types I think it's judicious to leave the decision to the compiler. – Tomilov Anatoliy Oct 29 '15 at 11:22
  • [Final version](http://coliru.stacked-crooked.com/a/181f9d8ea837556e). – Tomilov Anatoliy Oct 30 '15 at 07:00

1 Answers1

1

Isn't your (snippet from det())

       for (size_type a = 1 + i; a < _dimension; ++a) {
            vector & a_ = minor_[a - 1];
            value_type const & ai_ = _matrix[a][i];
            for (size_type b = 1 + i; b < _dimension; ++b) {
                a_[b - 1] = ai_ * ri_[b];
            }
        }
        for (size_type a = 1 + i; a < _dimension; ++a) {
            vector const & a_ = minor_[a - 1];
            vector & ra_ = _matrix[a];
            for (size_type b = 1 + i; b < _dimension; ++b) {
                ra_[b] -= a_[b - 1];
            }
        }

doing the same as

       for (size_type a = 1 + i; a < _dimension; ++a) {
            vector & ra_ = _matrix[a];
            value_type ai_ = ra_[i];
            for (size_type b = 1 + i; b < _dimension; ++b) {
                ra_[b] -= ai_ * ri_[b];
            }
        }

without any need for minor_? Moreover, now the inner loop can easily be vectorised.

Walter
  • 44,150
  • 20
  • 113
  • 196