Found an algorithm that doesn't require additional memory and works for any binary search tree (red-black/AVL/etc):
- rearrange the incoming data array to represent a "flattened" binary tree (root at [0], the root's children at [1] and [2], the left node's children at [3] and [4], the right node's children at [5] and [6], and so on). The trick is to select the root of every subtree in such a way that the resulting binary tree has every level (except the last one) completely filled, and on the last level all nodes form an "uninterrupted" line. Like this:
      N11
     /   \
   N21   N22
   / \   /
 N31 N32 N33
See the code below for how to transform a sorted array into such a sequence. I believe that for any sequence there is only one possible way to arrange it in a binary search tree like that -- i.e. you get a sort of "stability" guarantee (for a given sequence length we know exactly where each element will end up in the tree). A small worked example follows this list.
- then you perform one pass over your data and fill the tree level by level. At each level we know exactly how many elements to pull (2^(lvl-1)) before switching to the next level (or running out of data). At the start of each level we reset our position to the leftmost element (std::set<T>::begin()) and, after inserting the left and right children of a leaf, we move to the next leaf on the current level (a double ++it from the result of the last insert() call).
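For illustration (my own example, not part of the original measurements): repacking the sorted sequence 1..6 gives 4 2 6 1 3 5 -- exactly the tree from the picture with N11 = 4, N21 = 2, N22 = 6, N31 = 1, N32 = 3, N33 = 5, i.e. the root first, then each level left to right. A minimal check, assuming the repack() helper from the code below (drop it into main() there):

    // sample data of my own; repack() is defined in the code below
    std::vector<int> sorted   = { 1, 2, 3, 4, 5, 6 };
    std::vector<int> expected = { 4, 2, 6, 1, 3, 5 };  // N11, N21, N22, N31, N32, N33
    assert(repack(sorted) == expected);                // level-by-level order, as in the picture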
Notes:
- with std::set<int> the performance benefit (compared to a hinted insert of the sorted sequence) is 5-10%
- unfortunately, the MS red-black tree implementation ends up performing a lot of unnecessary work here -- checking neighboring elements (to make sure the insert doesn't break the binary search tree invariant), repainting nodes (a newly inserted node is always red for some reason) and probably something else. Checking neighbors involves additional comparisons and memory accesses, as well as traversing the tree up multiple levels
- the benefit of this approach would be significantly higher if it were implemented internally (not through the std::set public interface) as a function that expects the data to conform to these requirements and declares undefined behavior if it doesn't...
- ...in that case an even better algorithm would populate the tree depth-first and would require the input data to be rearranged differently ([N11, N21, N31, N32, N22, N33] in the example above); we would also end up doing only one tree traversal (see the sketch after this list). Alas, this approach can't be implemented through the std::set public interface -- it enforces the red-black tree invariant at every step of construction, causing unnecessary rebalancing
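To make the depth-first idea concrete, here is a rough sketch over a hypothetical raw node structure (my illustration only -- std::set gives no such access, and node, build and left_count are made-up names). It consumes the preorder-rearranged input ([N11, N21, N31, N32, N22, N33], e.g. 4 2 1 3 6 5 for the 1..6 example) in a single pass:

    // Hypothetical node type -- NOT std::set's internals, purely for illustration.
    struct node { int value; node* left; node* right; };

    // Size of the left subtree for a subtree of 'count' nodes, mirroring pick_root():
    // every level except the last is filled, the last level is an uninterrupted line.
    size_t left_count(size_t count)
    {
        unsigned order = 0;
        while ((size_t(2) << order) <= count) ++order;  // index of the highest set bit of count
        size_t max_tail = size_t(1) << order;           // capacity of the last level
        size_t tail = count - (max_tail - 1);           // nodes actually on the last level
        return (tail >= max_tail/2) ? max_tail - 1 : count - max_tail/2;
    }

    // 'p' walks over the preorder-rearranged input; 'count' is the size of this subtree.
    node* build(int const*& p, size_t count)
    {
        if (count == 0) return nullptr;
        node* n = new node{ *p++, nullptr, nullptr };   // this subtree's root comes first...
        size_t lc = left_count(count);
        n->left  = build(p, lc);                        // ...then its entire left subtree...
        n->right = build(p, count - 1 - lc);            // ...then its entire right subtree
        return n;
    }

(For a red-black tree a valid coloring is also known up front -- e.g. paint the nodes of the partial last level red and everything above black -- so no rebalancing or repainting would be needed at all.)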
Code (MSVC 2015; pardon the potato quality -- it was hacked together in about an hour):
#include <set>
#include <cassert>
#include <vector>
#include <utility>
#include <chrono>
#include <cstdio>
#include <intrin.h>    // _BitScanReverse (MSVC-specific)

using namespace std;

unsigned hibit(size_t n)
{
    unsigned long l;
    // fine for sizes below 2^32; use _BitScanReverse64 for larger inputs
    auto r = _BitScanReverse(&l, static_cast<unsigned long>(n));
    assert(r);
    return l;
}
int const* pick_root(int const* begin, int const* end)
{
    assert(begin != end);
    size_t count = end - begin;
    unsigned tree_order = hibit(count);   // tree height minus 1
    size_t max_tail_sz = 1 << tree_order; // max number of nodes on last tree lvl
    size_t filled_sz = max_tail_sz - 1;   // number of nodes on all but last tree lvl
    size_t tail_sz = count - filled_sz;   // number of nodes on last lvl
    return (tail_sz >= max_tail_sz/2) ?   // left half of tree will be completely filled?
        begin + max_tail_sz - 1           // pick (2^tree_order)'s element from left
        :
        end - max_tail_sz/2;              // pick (2^(tree_order - 1))'s element from right
}
vector<int> repack(vector<int> const& v)
{
    vector<int> r; r.reserve(v.size());
    if (!v.empty())
    {
        unsigned tree_order = hibit(v.size()); // tree height minus 1
        vector<pair<int const*, int const*>> ranges(1, make_pair(&v[0], &v[0] + v.size()));
        for(size_t i = 0; i <= tree_order; ++i)
        {
            vector<pair<int const*, int const*>> ranges2; ranges2.reserve(ranges.size()*2);
            for(auto const& range: ranges)
            {
                auto root = pick_root(range.first, range.second);
                r.push_back(*root);
                if (root != range.first)
                {
                    ranges2.push_back(make_pair(range.first, root));
                    if (root + 1 != range.second)
                        ranges2.push_back(make_pair(root + 1, range.second));
                }
            }
            ranges.swap(ranges2);
        }
        assert(ranges.empty());
    }
    return r;
}
set<int> populate_simple(std::vector<int> const& vec)
{
    set<int> r;
    for(auto v: vec) r.insert(v);
    return r;
}

set<int> populate_hinted(std::vector<int> const& vec)
{
    set<int> r;
    for(auto v: vec) r.insert(r.end(), v);
    return r;
}
set<int> populate_optimized(std::vector<int> const& vec)
{
    set<int> r;
    if (vec.empty()) return r;
    int const* p = &vec[0];
    int const* pend = &vec[0] + vec.size();
    r.insert(*p++);                     // take care of root
    if (p == pend) return r;
    for(size_t count = 1; ; count *= 2) // max number of pairs on each tree lvl
    {
        auto pos = r.begin();
        for(size_t i = 1; ; ++i)
        {
            r.insert(pos, *p++);        // new left child of the leaf at pos
            if (p == pend) return r;
            //++pos; // MS implementation supports insertion after hint
            pos = r.insert(pos, *p++);  // new right child of the same leaf
            if (p == pend) return r;
            // pos points to rightmost leaf of left subtree of "local" tree
            ++pos;                      // pos points to root of "local" tree (or end())
            if (i == count) break;
            ++pos;                      // pos points to leftmost leaf of right subtree of "local" tree
        }
    }
}
struct stopwatch
{
    chrono::high_resolution_clock::time_point start_;

    stopwatch() : start_(std::chrono::high_resolution_clock::now()) {}

    auto click()
    {
        auto finish = std::chrono::high_resolution_clock::now();
        auto mks = std::chrono::duration_cast<std::chrono::microseconds>(finish - start_);
        return mks.count();
    }
};
int main()
{
    size_t N = 100000;
    vector<int> v(N, 0);
    for(unsigned i = 0; i < N; ++i) v[i] = i; // sorted array
    auto rv = repack(v);
    {
        stopwatch w;
        auto s = populate_simple(v);
        printf("simple : %I64d mks\n", w.click());
    }
    {
        stopwatch w;
        auto s = populate_hinted(v);
        printf("hinted : %I64d mks\n", w.click());
    }
    {
        stopwatch w;
        auto s = populate_optimized(rv);
        printf("optimized: %I64d mks\n", w.click());
    }
    return 0;
}
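If you want to try this outside MSVC (my note, not part of the original answer), only two spots are compiler-specific: hibit() and the %I64d format string. A possible portable variant, assuming GCC/Clang builtins (or C++20 <bit>):

    // portable hibit() for GCC/Clang; with C++20 you could use std::bit_width(n) - 1 instead
    unsigned hibit(size_t n)
    {
        assert(n != 0);
        return 63u - static_cast<unsigned>(__builtin_clzll(n)); // index of the highest set bit
    }

    // ...and print the timings with %lld:
    printf("simple : %lld mks\n", static_cast<long long>(w.click()));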
Typical results:
simple : 14904 mks
hinted : 7885 mks
optimized: 6809 mks
simple : 15288 mks
hinted : 7415 mks
optimized: 6947 mks
I'm pretty sure the measurements aren't entirely accurate, but the relationship always holds -- the optimized version is always faster. Also note that the algorithm used to rearrange the elements can probably be improved -- the aim was to optimize tree population, not input data preparation.