I have a problem with the behavior of std::launch::async.
I have two cases where I use std::launch::async:
Case 1:
// Launches one asynchronous task per entry pair in `thread_ranges` and merges
// the per-task result vectors into `results`, in launch order.
//
// NOTE(review): this is a simplified excerpt. `args`, `someArgs`, `num_groups`,
// `num_trees` and `options` are defined outside the visible code, and the
// return statement is part of the elided "other stuff".
// NOTE(review): the futures carry std::shared_ptr<AnotherClass> while `results`
// holds std::shared_ptr<Tree>, and the task is a ForestTrainer member invoked
// with an AnyClass `this` -- presumably artifacts of renaming; confirm against
// the real code.
const AnotherClass AnyClass::train(args) const{
  // first do some stuff
  std::vector<uint> thread_ranges;
  split_sequence(thread_ranges, 0, num_groups - 1, options.get_num_threads());

  std::vector<std::future<std::vector<std::shared_ptr<AnotherClass> > > > futures;
  futures.reserve(thread_ranges.size());

  std::vector<std::shared_ptr<Tree> > results;
  results.reserve(num_trees);

  // One task per consecutive pair of entries. Written as `i + 1 < size()`
  // rather than `i < size() - 1`: size() is unsigned, so the subtraction would
  // wrap around to a huge value if thread_ranges were ever empty.
  for (uint i = 0; i + 1 < thread_ranges.size(); ++i) {
    futures.push_back(std::async(std::launch::async,
                                 &ForestTrainer::AnotherClassFunction,
                                 this,
                                 someArgs));
  }

  // get() blocks until the corresponding task has finished, so results are
  // collected in the order the tasks were launched.
  for (auto& future : futures) {
    std::vector<std::shared_ptr<AnotherClass> > thread_results = future.get();
    results.insert(results.end(), thread_results.begin(), thread_results.end());
  }
  // then do some other stuff
}
// Worker routine returning a vector of shared Tree pointers; its parameter
// list and body are elided in this excerpt.
// NOTE(review): Case 1 launches `&ForestTrainer::AnotherClassFunction`;
// presumably that is this function under a placeholder name -- confirm.
std::vector<std::shared_ptr<Tree> > ForestTrainer::train_batch(someArgs)
{
// do a lot of stuff
}
And Case 2:
// Task body used by Case 2: returns `someThing` and `otherThing` bundled as a
// (T1, T2) tuple. The computation itself is elided in this excerpt.
std::tuple<T1, T2> innerFunction(manyArgs)
{
  // do a lot of stuff
  std::tuple<T1, T2> packed = std::make_tuple(someThing, otherThing);
  return packed;
}
// Processes `trees` in batches: each batch launches asynchronous innerFunction
// tasks, waits for all of them, and appends their output to results_1 or
// results_2 before the next batch starts.
//
// NOTE(review): this is a simplified excerpt. `args`, `otherArgs`, `trees`,
// `num_trees`, `num_threads` and `threads_to_run` are defined outside the
// visible code, and the return statement is part of the elided "some stuff".
Eigen::MatrixXd outerFunction(args){
  // do some stuff
  std::vector<T1> results_1;
  std::vector<T2> results_2;
  for(uint i =0; i<num_trees;i=i+num_threads){
    // Hold exactly one future per task launched in this batch. The original
    // pre-sized the vector with threads_to_run but filled it with a loop
    // bounded by num_threads; if the two differ, that either indexes past the
    // end of the vector or leaves default-constructed futures whose get() is
    // undefined behavior.
    // NOTE(review): presumably threads_to_run == min(num_threads, num_trees - i);
    // confirm, otherwise trees.at(i+j) throws std::out_of_range on the last batch.
    std::vector<std::future<std::tuple<T1,T2> > > futures;
    futures.reserve(threads_to_run);
    for(uint j=0;j<threads_to_run; j++){
      futures.push_back(std::async(std::launch::async,
                                   &innerFunction,
                                   trees.at(i+j),
                                   otherArgs));
    }
    // get() blocks until the task is done, so the whole batch is joined here.
    for (auto& future : futures) {
      T1 get_1;
      T2 get_2;
      std::tie(get_1, get_2)=future.get();
      // An empty second component selects the first result set.
      if(get_2.empty()){
        results_1.insert(results_1.end(), get_1.begin(), get_1.end());
      } else {
        results_2.insert(results_2.end(), get_2.begin(), get_2.end());
      }
    }
    // do some stuff
  }
  // do some stuff
}
When I debug and run the code in Eclipse C++, everything works fine and I get 100% CPU usage.
When I compile the code as part of an R package using Rcpp, I get full CPU usage only in Case 1; in Case 2, the threads run sequentially, with only 1/nThreads CPU usage.
I tried to simplify the code as much as possible here but still keep the important features. If something is missing that could be helpful, let me know.
I appreciate your help.