There is a somewhat more advanced technique for making such threads run even more simultaneously.
The problem with the naive approach is that the threads created at the beginning have too much time to run their functions before the last threads are even created. As a result, by the time the last threads are created, the first ones have already executed significant parts of their functions.
In order to avoid that, we can use a counter (protected by a mutex) and a condition variable. Each thread that has been created and is now ready to start running its internal function, will increment the counter and check if it has become equal to the total number of threads (i.e., if this thread was the last one to increment the counter). If it was, it will notify all the other threads (using the condition variable) that it's time to start. Otherwise, it will wait on the condition variable until some other thread sets the counter to their total number and notifies the remaining threads (including this one).
This way, all the threads will start (almost) simultaneously, only after each and every one of them has been created and is actually ready to execute its function.
Here is my implementation of a class ConcurrentRunner which does exactly that.
First, a C++11-compliant simplified version that will be easier to understand:
#include <mutex>
#include <condition_variable>
#include <vector>
#include <functional>
#include <thread>
// Object that runs multiple functions, each in its own thread, starting them as simultaneously as possible.
// Object that runs multiple functions, each in its own thread, starting them as simultaneously as possible.
//
// The constructor spawns one background thread per background function; every
// thread (including the one that later calls `Run()`) first passes through a
// shared start barrier, so no function begins until all participants are ready.
class ConcurrentRunner final
{
public:
    // `this_thread_function` will run on the thread that calls `Run()`;
    // each of `background_threads_functions` gets its own std::thread.
    template<typename... BackgroundThreadsFunctions>
    explicit ConcurrentRunner(const std::function<void()>& this_thread_function, const BackgroundThreadsFunctions&... background_threads_functions)
        : _this_thread_function{this_thread_function}
        , _num_threads_total{1 + sizeof...(BackgroundThreadsFunctions)}
    {
        this->PrepareBackgroundThreads({ background_threads_functions... });
    }

    ConcurrentRunner(const ConcurrentRunner&) = delete;
    ConcurrentRunner& operator=(const ConcurrentRunner&) = delete;

    // Takes this thread through the start barrier, runs its function, and then
    // joins every background thread. Intended to be called exactly once.
    void Run()
    {
        this->ThreadProc(_this_thread_function);
        for (auto& worker : _background_threads)
            worker.join();
    }

private:
    // Spawns one background thread per function; each thread immediately enters
    // `ThreadProc` and blocks on the barrier until all participants arrive.
    void PrepareBackgroundThreads(const std::vector<std::function<void()>>& functions)
    {
        _background_threads.reserve(functions.size());
        for (const auto& fn : functions)
            _background_threads.emplace_back([this, fn]() { this->ThreadProc(fn); });
    }

    // Barrier + dispatch, executed by every participating thread (the `Run()`
    // caller and all background threads): register this thread's arrival, wait
    // until everyone has arrived, then invoke `function`.
    void ThreadProc(const std::function<void()>& function)
    {
        {
            std::unique_lock<std::mutex> guard{_mutex};
            ++_num_threads_waiting_for_start_signal;
            if (_num_threads_waiting_for_start_signal == _num_threads_total)
            {
                // Last arrival: open the barrier. Unlock first so the woken
                // threads don't immediately block on the mutex we still hold.
                guard.unlock();
                _cv.notify_all();
            }
            else
            {
                // Not the last arrival: sleep until the counter reaches the total.
                // The predicate protects against spurious wakeups.
                _cv.wait(guard, [this]()
                {
                    return (_num_threads_waiting_for_start_signal == _num_threads_total);
                });
            }
        }
        // Barrier passed — run this thread's payload.
        function();
    }

private:
    std::function<void()> _this_thread_function;
    std::vector<std::thread> _background_threads;
    const unsigned int _num_threads_total;                 // barrier size: 1 (the `Run()` caller) + background threads
    unsigned int _num_threads_waiting_for_start_signal{0}; // arrivals so far; guarded by `_mutex`
    mutable std::mutex _mutex;                             // guards the arrival counter
    std::condition_variable _cv;                           // signalled once, by the last-arriving thread
};
//---------------------------------------------------------------------------------------------------------------------------------------------------
// Example of usage:
#include <atomic>
// Usage example: each lambda adds a different power of ten to an atomic
// counter, so a final value of 111 proves every function ran exactly once.
int main()
{
    std::atomic<int> accumulator{0};
    {
        ConcurrentRunner runner{
            [&accumulator]() { accumulator += 1; },
            [&accumulator]() { accumulator += 10; },
            [&accumulator]() { accumulator += 100; }};
        runner.Run();
    }
    const bool all_ran = (accumulator.load() == 111);
    return all_ran ? 0 : -1;
}
And now the same logic with more templates, less allocations, no unnecessary copies and type erasure, but somewhat harder to read (requires C++17):
//---------------------------------------------------------------------------------------------------------------------------------------------------
// Helper template `ForEachTupleElement` (meant to be in some other header file).
#include <tuple>
#include <type_traits>
#include <utility>
namespace Detail
{
    // Implementation: expands the index pack and invokes `function` on each
    // tuple element, in order of increasing index.
    template<typename Tuple, typename Function, std::size_t... I>
    constexpr void ForEachTupleElement(Tuple&& tuple, Function function, std::index_sequence<I...>)
    {
        // C++17 fold over the comma operator: the calls run left-to-right, and
        // an empty pack folds to a no-op. Each `std::get<I>` touches a distinct
        // element, so forwarding `tuple` once per index is safe.
        // (Replaces the pre-C++17 dummy-array expansion trick.)
        ((void)function(std::get<I>(std::forward<Tuple>(tuple))), ...);
    }
}
// Applies a given function (typically - with a template operator(), e.g., a generic lambda) to each element of a tuple, in order.
template<typename Tuple, typename Function>
constexpr void ForEachTupleElement(Tuple&& tuple, Function function)
{
    Detail::ForEachTupleElement(std::forward<Tuple>(tuple), function,
        std::make_index_sequence<std::tuple_size_v<std::remove_cv_t<std::remove_reference_t<Tuple>>>>{});
}
//---------------------------------------------------------------------------------------------------------------------------------------------------
#include <mutex>
#include <condition_variable>
#include <array>
#include <thread>
#include <tuple>
#include <type_traits>
#include <utility>
// Common non-template part of the `ConcurrentRunner` implementation.
// Non-template state shared by all `ConcurrentRunner` instantiations: the
// start-barrier counter plus the primitives that protect and signal it.
// Keeping it outside the class template avoids duplicating this code for
// every instantiation.
class ConcurrentRunnerBase
{
protected:
    ConcurrentRunnerBase() = default;
    ~ConcurrentRunnerBase() = default; // non-virtual: only ever used as a private base

protected:
    unsigned int _num_threads_waiting_for_start_signal{0}; // arrivals at the barrier; guarded by `_mutex`
    mutable std::mutex _mutex;                             // guards the counter above
    std::condition_variable _cv;                           // signalled by the last thread to arrive
};
// Object that runs multiple functions, each in its own thread, starting them as simultaneously as possible.
// Object that runs multiple functions, each in its own thread, starting them as simultaneously as possible.
//
// C++17 variant: the callables' types are template parameters (no std::function
// type erasure, no extra allocations for the callables) and the background
// threads live in a fixed-size std::array. The start barrier itself lives in
// the non-template base `ConcurrentRunnerBase`.
//
// NOTE(review): `Run()` moves from `_this_thread_function` and joins each thread,
// so it is meant to be called exactly once per instance.
template<typename ThisThreadFunction, std::size_t NumberOfBackgroundThreads>
class ConcurrentRunner final : private ConcurrentRunnerBase
{
public:
    // `this_thread_function` will run on the thread that later calls `Run()`;
    // every other argument gets a dedicated background thread, created here.
    // The background threads enter `ThreadProc` immediately but block on the
    // barrier until `Run()` brings the final participant.
    template<typename ThisThreadFunctionArg, typename... BackgroundThreadsFunctions>
    explicit ConcurrentRunner(ThisThreadFunctionArg&& this_thread_function, BackgroundThreadsFunctions&&... background_threads_functions)
        : _this_thread_function{std::forward<ThisThreadFunctionArg>(this_thread_function)}
    {
        // Always holds when the class template arguments are deduced via the
        // deduction guide; this guards explicit (mismatched) instantiations.
        static_assert(sizeof...(BackgroundThreadsFunctions) == NumberOfBackgroundThreads);
        this->Prepare(std::forward<BackgroundThreadsFunctions>(background_threads_functions)...);
    }
    ConcurrentRunner(const ConcurrentRunner&) = delete;
    ConcurrentRunner& operator=(const ConcurrentRunner&) = delete;
    // Executes `ThreadProc` for this thread's function and waits for all of the background threads to finish.
    void Run()
    {
        this->ThreadProc(std::move(_this_thread_function));
        for (auto& background_thread : _background_threads)
            background_thread.join();
    }
private:
    // Creates the background threads: each of them will execute `ThreadProc` with its respective function.
    template<typename... BackgroundThreadsFunctions>
    void Prepare(BackgroundThreadsFunctions&&... background_threads_functions)
    {
        // Copies of the argument functions (created by move constructors where possible), collected in a tuple.
        std::tuple<std::decay_t<BackgroundThreadsFunctions>...> background_threads_functions_tuple{
            std::forward<BackgroundThreadsFunctions>(background_threads_functions)...
        };
        // Iterate through the tuple of the background threads' functions and create a new thread with `ThreadProc` for each of them.
        unsigned int index_in_array = 0;
        ForEachTupleElement(std::move(background_threads_functions_tuple), [this, &index_in_array](auto&& function)
        {
            auto i = index_in_array++;
            // The tuple was passed as an rvalue, so `function` binds to an rvalue
            // reference here and each callable can be moved into its thread's closure.
            _background_threads[i] = std::thread{[this, function = std::move(function)]() mutable
            {
                this->ThreadProc(std::move(function));
            }};
        });
    }
    // Procedure that will be executed by each thread, including the "main" thread and all background ones.
    template<typename Function>
    void ThreadProc(Function&& function)
    {
        // Increment the `_num_threads_waiting_for_start_signal` while the mutex is locked, thus signalling that a new thread is ready to start.
        std::unique_lock lock{_mutex};
        ++_num_threads_waiting_for_start_signal;
        const bool ready_to_go = (_num_threads_waiting_for_start_signal == (1 + NumberOfBackgroundThreads));
        lock.unlock();
        if (ready_to_go)
        {
            // If this thread was the last one of the threads which must start simultaneously, notify all other threads that they are ready to start.
            _cv.notify_all();
        }
        else
        {
            // If this thread was not the last one of the threads which must start simultaneously, wait on `_cv` until all other threads are ready.
            // Re-acquiring the lock and re-checking the predicate inside `wait`
            // makes the unlock-then-notify window above race-free (no lost wakeup).
            lock.lock();
            _cv.wait(lock, [this]() noexcept -> bool
            {
                return (_num_threads_waiting_for_start_signal == (1 + NumberOfBackgroundThreads));
            });
            lock.unlock();
        }
        // Execute this thread's internal function.
        std::forward<Function>(function)();
    }
private:
    ThisThreadFunction _this_thread_function;                          // runs on the `Run()` caller's thread; moved from in `Run()`
    std::array<std::thread, NumberOfBackgroundThreads> _background_threads;
};
// Deduction guide: the first constructor argument (decayed) supplies
// `ThisThreadFunction`, and the count of the remaining arguments fixes
// `NumberOfBackgroundThreads`.
template<typename T, typename... U>
ConcurrentRunner(T&&, U&&...) -> ConcurrentRunner<std::decay_t<T>, sizeof...(U)>;
//---------------------------------------------------------------------------------------------------------------------------------------------------
// Example of usage:
#include <atomic>
// Usage example for the C++17 version — identical in spirit to the C++11 one:
// three lambdas each add a distinct power of ten, so 111 proves all three ran.
int main()
{
    std::atomic<int> sum{0};
    {
        ConcurrentRunner runner{
            [&sum]() { sum += 1; },
            [&sum]() { sum += 10; },
            [&sum]() { sum += 100; }};
        runner.Run();
    }
    return (sum.load() == 111) ? 0 : -1;
}