How about these C++20 unrolling helpers:
#pragma once
#include <concepts>
#include <cstddef>
#include <iterator>
#include <utility>
/// Calls `fn( i )` once for every index `i` in `[0, N)`, in ascending order.
/// The calls are produced by a pack expansion rather than a run-time loop.
///
/// This overload is selected when `fn( size_t )` returns exactly `void`.
///
/// @tparam N   number of invocations; must be at least 1
/// @param  fn  callable that receives the current index
template<size_t N, typename Fn>
requires (N >= 1) && requires( Fn fn, size_t i ) { { fn( i ) } -> std::same_as<void>; }
inline
void unroll( Fn fn )
{
    // Immediately-invoked helper: the index pack is deduced from the
    // sequence argument, and the comma fold evaluates the calls strictly
    // from left to right.
    [&]<size_t ... Is>( std::index_sequence<Is ...> )
    {
        (fn( Is ), ...);
    }( std::make_index_sequence<N>{} );
}
/// Calls `fn()` exactly N times in a row. The calls are produced by a pack
/// expansion rather than a run-time loop.
///
/// This overload is selected when `fn()` takes no argument and returns
/// exactly `void`.
///
/// @tparam N   number of invocations; must be at least 1
/// @param  fn  callable taking no arguments
template<std::size_t N, typename Fn>
requires (N >= 1) && requires( Fn fn ) { { fn() } -> std::same_as<void>; }
inline
void unroll( Fn fn )
{
    auto unroll_n = [&]<std::size_t ... Indices>( std::index_sequence<Indices ...> )
    {
        // The indices only drive the pack expansion. Discarding each one
        // explicitly avoids the "left operand of comma operator has no
        // effect" diagnostic.
        ((static_cast<void>( Indices ), fn()), ...);
    };
    unroll_n( std::make_index_sequence<N>() );
}
/// Calls `fn( i )` for `i = 0, 1, ..., N - 1` in ascending order and stops
/// at the first call whose result converts to `false`.
///
/// This overload is selected when the result of `fn( size_t )` is
/// convertible to `bool`.
///
/// @tparam N   maximum number of invocations; must be at least 1
/// @param  fn  callable that receives the current index
/// @return `true` if every call yielded a true result, `false` as soon as
///         one did not (later indices are then not invoked)
template<std::size_t N, typename Fn>
requires (N >= 1) && requires( Fn fn, std::size_t i ) { { fn( i ) } -> std::convertible_to<bool>; }
inline
bool unroll( Fn fn )
{
    auto unroll_n = [&]<std::size_t ... Indices>( std::index_sequence<Indices ...> ) -> bool
    {
        // Convert each result to bool before folding so the built-in,
        // short-circuiting && is always used, even if the result type
        // provides its own operator&&.
        return (static_cast<bool>( fn( Indices ) ) && ...);
    };
    return unroll_n( std::make_index_sequence<N>() );
}
/// Calls `fn()` up to N times in a row and stops at the first call whose
/// result converts to `false`.
///
/// This overload is selected when `fn()` takes no argument and its result
/// is convertible to `bool`.
///
/// @tparam N   maximum number of invocations; must be at least 1
/// @param  fn  callable taking no arguments
/// @return `true` if all N calls yielded a true result, `false` as soon as
///         one did not (the remaining calls are then skipped)
template<std::size_t N, typename Fn>
requires (N >= 1) && requires( Fn fn ) { { fn() } -> std::convertible_to<bool>; }
inline
bool unroll( Fn fn )
{
    auto unroll_n = [&]<std::size_t ... Indices>( std::index_sequence<Indices ...> ) -> bool
    {
        // The indices only drive the pack expansion, so each one is
        // discarded explicitly to avoid an unused-value diagnostic.
        // Converting every result to bool guarantees the built-in,
        // short-circuiting && is used.
        return ((static_cast<void>( Indices ), static_cast<bool>( fn() )) && ...);
    };
    return unroll_n( std::make_index_sequence<N>() );
}
/// Applies `fn` to every element of `[begin, end)` in order. While at least
/// N elements remain they are handled N at a time through `unroll<N>`; the
/// leftover tail is handled one element at a time.
///
/// @tparam N      unrolling factor; for N <= 1 only the element-wise loop runs
/// @param  begin  start of the range
/// @param  end    end of the range (one past the last element)
/// @param  fn     callable invoked with each element
/// @return the position after the last element visited, which equals `end`
///         whenever `begin <= end`
template<std::size_t N, typename RandomIt, typename UnaryFunction>
requires std::random_access_iterator<RandomIt>
&& requires( UnaryFunction fn, typename std::iterator_traits<RandomIt>::value_type elem ) { { fn( elem ) }; }
inline
RandomIt unroll_for_each( RandomIt begin, RandomIt end, UnaryFunction fn )
{
    using difference_type = typename std::iterator_traits<RandomIt>::difference_type;
    if constexpr( N > 1 )
    {
        // Test the remaining distance instead of forming `begin + N`: when
        // fewer than N elements are left, that sum would point beyond the
        // one-past-the-end position, which is undefined behaviour.
        for( ; end - begin >= static_cast<difference_type>( N ); begin += N )
            unroll<N>( [&]( std::size_t i ) { fn( begin[i] ); } );
    }
    // Tail: fewer than N elements remain (or the whole range when N <= 1).
    for( ; begin < end; ++begin )
        fn( *begin );
    return begin;
}
But be aware that the unrolling factor is crucial here. Unrolling is not always beneficial, and unrolling beyond the optimal CPU-specific factor can drop performance back to the level of the non-unrolled loop.