8

I'm observing odd behavior of std::unordered_map in MSVC14 (VS2015). Consider the following scenario. I create an unordered map and fill it with a dummy struct which consumes a considerable amount of memory — let's say 1 GB overall, with 100k elements inserted. Then I start to delete elements from the map. Let's say half of the elements have been deleted; then you would expect half of the memory to be freed. Right? Wrong! I see that memory is released only when the number of elements in the map passes some threshold — in my case it was 1443 elements.
One may say that this is a malloc optimization: allocating a large chunk from the OS using VirtualAllocEx or HeapAlloc and not actually freeing memory back to the system, since the optimization dictates the policy and may skip calling HeapFree so that already-allocated memory can be reused later.
To eliminate this possibility I've employed a custom allocator for allocate_shared; it didn't do the trick. So the main question is: why does this happen, and what can be done to "compact" the memory used by unordered_map?
The code

#include <windows.h>

#include <allocators>

#include <chrono>
#include <iostream>
#include <map>
#include <memory>
#include <new>
#include <random>
#include <thread>
#include <unordered_map>
#include <vector>

// Private Win32 heap used by SimpleAllocator; created once at startup so the
// allocator bypasses the CRT's malloc entirely (rules out CRT caching).
HANDLE heap = HeapCreate(0, 0, 0);

// Minimal C++11-style allocator that carves storage out of the private Win32
// heap above.
template <class Tp>
struct SimpleAllocator
{
    typedef Tp value_type;

    SimpleAllocator() noexcept
    {}

    // Converting constructor required by the Allocator concept.
    // NOTE(fix): was declared throw() (deprecated, removed in C++20) while the
    // default ctor used noexcept; use noexcept consistently. Also dropped the
    // stray ';' after the body.
    template <typename U>
    SimpleAllocator(const SimpleAllocator<U>&) noexcept
    {}

    // Allocate uninitialized storage for n objects of type Tp.
    // NOTE(fix): HeapAlloc returns nullptr on failure; the Allocator contract
    // requires reporting failure by throwing std::bad_alloc, not returning null.
    Tp* allocate(std::size_t n)
    {
        void* p = HeapAlloc(heap, 0, n * sizeof(Tp));
        if(p == nullptr)
        {
            throw std::bad_alloc();
        }
        return static_cast<Tp*>(p);
    }

    // Release storage; the count is unused because the Win32 heap tracks
    // block sizes internally.
    void deallocate(Tp* p, std::size_t /*n*/)
    {
        HeapFree(heap, 0, p);
    }
};
// All SimpleAllocator instances draw from the same global heap, so any two of
// them — regardless of value type — are interchangeable and compare equal.
template <class T, class U>
bool operator==(const SimpleAllocator<T>&, const SimpleAllocator<U>&)
{
    return true;
}

// Inequality is defined as the negation of equality (hence: always false).
template <class T, class U>
bool operator!=(const SimpleAllocator<T>& lhs, const SimpleAllocator<U>& rhs)
{
    return !(lhs == rhs);
}

// Dummy payload whose only job is to occupy a noticeable amount of heap
// memory (strings, a vector, and a map with randomized sizes).
struct Entity
{
    Entity()
    {
        // NOTE(fix): the original called std::string("a", dis(gen)), i.e. the
        // (const char*, count) constructor, which reads up to 256 bytes from a
        // two-byte string literal — undefined behavior. The (count, char)
        // constructor below is what was intended.
        _6 = std::string(dis(gen), 'a');
        _7 = std::string(dis(gen), 'b');
        // NOTE(fix): hoist the bound; the original re-rolled dis(gen) on every
        // iteration, so the loop bound changed mid-flight.
        const size_t count = dis(gen);
        for(size_t i = 0; i < count; ++i)
        {
            _9.emplace(i, std::string(dis(gen), 'c'));
        }
    }
    int _1 = 1;
    int _2 = 2;
    double _3 = 3;
    double _4 = 5;
    float _5 = 3.14f;
    std::string _6 = "hello world!";
    std::string _7 = "A quick brown fox jumps over the lazy dog.";
    std::vector<unsigned long long> _8 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0};
    std::map<long long, std::string> _9 = {{0, "a"},{1, "b"},{2, "c"},{3, "d"},{4, "e"},
    {5, "f"},{6, "g"},{7, "h"},{8, "e"},{9, "j"}};
    // NOTE(fix): braces would select the initializer_list constructor and make
    // a two-element vector {1000, 3.14}; parentheses make 1000 copies of 3.14
    // (~8 KB), matching the stated goal of a memory-heavy struct.
    std::vector<double> _10 = std::vector<double>(1000, 3.14);
    std::random_device rd;
    std::mt19937 gen = std::mt19937(rd());
    std::uniform_int_distribution<size_t> dis = std::uniform_int_distribution<size_t>(16, 256);
};

using Container = std::unordered_map<long long, std::shared_ptr<Entity>>;

// Print a timestamped snapshot of the map's size and hashing statistics.
// NOTE(fix): takes the shared_ptr by const reference — the function only
// observes the container, so copying the shared_ptr (an atomic refcount
// increment/decrement) was wasted work.
void printContainerInfo(const std::shared_ptr<Container>& container)
{
    std::cout << std::chrono::system_clock::to_time_t(std::chrono::system_clock::now())
        << ", Size: " << container->size() << ", Bucket count: " << container->bucket_count()
        << ", Load factor: " << container->load_factor() << ", Max load factor: " << container->max_load_factor()
        << std::endl;
}

int main()
{
    constexpr size_t maxEntites = 100'000;
    constexpr size_t ps = 10'000;
    stdext::allocators::allocator_chunklist<Entity> _allocator;
    std::shared_ptr<Container> test = std::make_shared<Container>();
    test->reserve(maxEntites);

    for(size_t i = 0; i < maxEntites; ++i)
    {
        test->emplace(i, std::make_shared<Entity>());
    }

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<size_t> dis(0, maxEntites);
    size_t cycles = 0;
    while(test->size() > 0)
    {
        size_t counter = 0;
        std::cout << "Press any key..." << std::endl;
        std::cin.get();
        while(test->size() > 1443)
        {
            test->erase(dis(gen));
        }
        printContainerInfo(test);
        std::cout << "Press any key..." << std::endl;
        std::cin.get();
        std::cout << std::endl;
    }
    return 0;
}

Things I tried so far: trying to rehash/resize when the load factor reaches some threshold — inside the erasing while loop, add something like this

// When occupancy drops below 20%, try to force the bucket vector to shrink by
// juggling max_load_factor and rehash/reserve.
if(test->load_factor() < 0.2)
{
    // Temporarily allow a very high load factor so rehash() may legally pick
    // a much smaller bucket count.
    test->max_load_factor(1 / test->load_factor());
    test->rehash(test->size());
    test->reserve(test->size());
    printContainerInfo(test);
    // Restore a sane load factor and rehash again at the final size.
    test->max_load_factor(1);
    test->rehash(test->size());
    test->reserve(test->size());
}

Then, when that didn't help, I tried something silly: creating a temporary container, copying/moving the remaining entries into it, clearing the original one, and copying/moving back from the temp to the original. Something like this

// Fallback attempt: rebuild the container from scratch so a freshly sized
// bucket vector is allocated, then copy the surviving elements back.
if(test->load_factor() < 0.2)
{
    Container tmp;
    // Copy the surviving entries out (values are shared_ptr, so this is cheap).
    std::copy(test->begin(), test->end(), std::inserter(tmp, tmp.begin()));
    test->clear();
    test.reset();
    // Recreate the container and move the entries back in.
    test = std::make_shared<Container>();
    std::copy(tmp.begin(), tmp.end(), std::inserter(*test, test->begin()));
}

Finally, replace the shared_ptr with allocate_shared and pass the SimpleAllocator instance to it.
In addition, I've modified the STL code here and there, like calling std::vector::shrink_to_fit on std::unordered_map's internal vector (the MSVC STL implementation of unordered_map is based on a list and a vector); it didn't work either.

EDIT001: For all non-believers. The following code does more or less the same as previous code but uses std::vector<Entity> instead of unordered_map. The memory is reclaimed by OS.

#include <memory>
#include <vector>
#include <map>
#include <random>
#include <thread>
#include <iostream>

// Dummy memory-heavy payload for the std::vector comparison experiment.
struct Entity
{
    Entity()
    {
        // NOTE(fix): std::string("a", dis(gen)) selects the (const char*,
        // count) constructor and reads up to 256 bytes from a two-byte
        // literal — undefined behavior. Use the intended (count, char) ctor.
        _6 = std::string(dis(gen), 'a');
        _7 = std::string(dis(gen), 'b');
        // NOTE(fix): hoist the bound; dis(gen) was re-rolled every iteration.
        const size_t count = dis(gen);
        for(size_t i = 0; i < count; ++i)
        {
            _9.emplace(i, std::string(dis(gen), 'c'));
        }
    }
    int _1 = 1;
    int _2 = 2;
    double _3 = 3;
    double _4 = 5;
    float _5 = 3.14f;
    std::string _6 = "hello world!";
    std::string _7 = "A quick brown fox jumps over the lazy dog.";
    std::vector<unsigned long long> _8 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0};
    std::map<long long, std::string> _9 = {{0, "a"}, {1, "b"}, {2, "c"}, {3, "d"}, {4, "e"},
                                           {5, "f"}, {6, "g"}, {7, "h"}, {8, "e"}, {9, "j"}};
    // NOTE(fix): parentheses, not braces — 1000 doubles of value 3.14, not the
    // two-element initializer list {1000, 3.14}.
    std::vector<double> _10 = std::vector<double>(1000, 3.14);
    std::random_device rd;
    std::mt19937 gen = std::mt19937(rd());
    std::uniform_int_distribution<size_t> dis = std::uniform_int_distribution<size_t>(16, 256);
};

using Container = std::vector<std::shared_ptr<Entity>>;

// Print a timestamped size/capacity snapshot of the vector.
// NOTE(fix): takes the shared_ptr by const reference — the function only reads
// the container, so copying the shared_ptr (an atomic refcount bump) was
// needless overhead.
void printContainerInfo(const std::shared_ptr<Container>& container)
{
    std::cout << std::chrono::system_clock::to_time_t(std::chrono::system_clock::now())
              << ", Size: " << container->size() << ", Capacity: " << container->capacity() << std::endl;
}

int main()
{
    constexpr size_t maxEntites = 100'000; // total elements to hold
    constexpr size_t ps = 10'000;          // elements popped per cycle

    std::shared_ptr<Container> test = std::make_shared<Container>();
    test->reserve(maxEntites);

    // Fill the vector with ~1 GB worth of dummy entities.
    for(size_t i = 0; i < maxEntites; ++i)
    {
        test->emplace_back(std::make_shared<Entity>());
    }

    // NOTE(fix): removed std::random_device/mt19937 and the
    // uniform_int_distribution that was re-constructed on every outer
    // iteration — none of them were ever used.
    size_t cycles = 0;
    while(test->size() > 0)
    {
        // Drop `ps` elements from the back.
        size_t counter = 0;
        while(test->size() > 0 && counter < ps)
        {
            test->pop_back();
            ++counter;
        }
        ++cycles;
        // Every seventh cycle, refill to the maximum so memory behavior can be
        // observed over repeated shrink/grow rounds.
        if(cycles % 7 == 0)
        {
            std::cout << "Inflating..." << std::endl;
            while(test->size() < maxEntites)
            {
                test->emplace_back(std::make_shared<Entity>());
            }
        }
        std::this_thread::sleep_for(std::chrono::seconds(1));
        printContainerInfo(test);
        std::cout << std::endl;
    }
    return 0;
}
kreuzerkrieg
  • 3,009
  • 3
  • 28
  • 59
  • How do you know memory is not freed? – n. m. could be an AI Sep 04 '16 at 05:53
  • looking at commit size in task manager or "total memory" in RAMMap by Sysinternals – kreuzerkrieg Sep 04 '16 at 05:55
  • 1
    @kreuzerkrieg Freed memory isn't actually returned to the OS from running process. You wont be able to see that in the task manager. Use a tool like valgrind or such to detect memory leaks. – πάντα ῥεῖ Sep 04 '16 at 05:58
  • 1
    a) there is no memory leak b) so, as you say, if I replace `unordered_map` with `vector` I will see the same behavior? wrong! memory will be released to OS immediately. Moreover, if your claim is true, `clean` would not release memory, but it does, in addition, when the number of items in `unordered_map` fails under ~1400 items, why the memory is reclaimed by OS? – kreuzerkrieg Sep 04 '16 at 10:37
  • @πάνταῥεῖ ῥεῖ, how come it is a duplicate for the question which asked for Linux and glibc, with all the arena stuff involved? – kreuzerkrieg Sep 04 '16 at 10:48
  • @kreuzerkrieg _"looking at commit size in task manager or "total memory" in RAMMap by Sysinternals"_ – πάντα ῥεῖ Sep 04 '16 at 10:50
  • @kreuzerkrieg Use some appropriate tool to watch memory allocation/deallocation. The task manager just fools you. – πάντα ῥεῖ Sep 04 '16 at 10:52
  • @kreuzerkrieg Do you expect Windows to behave any differently from Linux in this regard? – n. m. could be an AI Sep 04 '16 at 14:26
  • @n.m. do you expect CRTs from different OSes, implemented by different people to behave the same? Not a CRT specialist, but I dont remember MSVCRT implementing per thread arenas like glibc does – kreuzerkrieg Sep 04 '16 at 15:28
  • @kreuzerkrieg I absolutely would. (In)feasibility of returning memory back to the OS does not depend on how exactly `malloc` is built, but only on memory-related services each OS presents. Since they are nearly identical for all modern OSes, there should be no reason why CTRs built by any people for any OS should not follow the same strategy of never returning memory to the OS. – n. m. could be an AI Sep 04 '16 at 17:32
  • @n.m. Agree that overall strategy is the same for all modern OSes, but I expect that the "never" will be at different level of "neverness", since it is simply not correct, at least for Windows – kreuzerkrieg Sep 04 '16 at 19:32
  • Perhaps you should measure the actual memory usage of the unordered_map using your allocator instead. – Daniel Jour Sep 04 '16 at 20:56
  • I have never seen malloc and friends returning memory back to Windows, I would be very interested to know how and when this might happen. Total memory allocated by the process may shrink because of an explicit request to unmap memory, not because of freeing some malloc'd memory (IME). – n. m. could be an AI Sep 04 '16 at 21:55
  • @n.m. I've edited the question with an example based on previous code but with `vector` instead of `unordered_map`. You are welcome to run it on your machine and report back with your observations – kreuzerkrieg Sep 05 '16 at 04:37
  • You are right, MSVC releases memory to the OS. But it does so for your unordered_map example too. What is your question exactly? – n. m. could be an AI Sep 05 '16 at 05:58
  • It does not releases for me if `unordered_map` employed. Please change the `while(test->size() > 1443)` to something like `while(test->size() > 10000)` and lets see it it releases memory. in my case it didnt, and the 1443 is the threshold when it starts to release. BTW, what version of MSVC you are using? – kreuzerkrieg Sep 05 '16 at 06:32

3 Answers3

9

You are correct, but partially.

The way C++ unordered_map is implemented in VC++ is by using an internal std::vector, which is the bucket list, and a std::list which holds the nodes of the map.

In a diagram, it looks like this:

buckets : [][][*][][][][][][*][][][][][][*]
               |            |            |
               |            |            | 
             ---             ------      |
             |                    |      |
             V                    V      V
elements: [1,3]->[5,7]->[7,1]->[8,11]->[10,3]->[-1,2]

Now, as you erase nodes, they are actually removed from the list, but it says nothing about the buckets array. The buckets array is resized after some threshold is achieved (either by having too many elements per bucket, or having too many buckets for the number of elements).

To prove my point, here is an example compiled with the latest VC++:

// Insert 1000 entries of ~10 KB each, then erase 90% of them; as shown in the
// debugger output below, the internal bucket vector keeps its grown size even
// though only 100 elements remain.
std::unordered_map<int, std::vector<char>> map;
for (auto i = 0; i < 1000; i++) {
    map.emplace(i, std::vector<char>(10000));
}

// Erase the first 900 keys; the element list shrinks, the bucket array doesn't.
for (auto i = 0; i < 900; i++) {
    map.erase(i);
}

Looking at the raw view in the debugger, we see:

+       _List   { size=100 }    std::list<std::pair<int const ,std::vector<char,std::allocator<char> > >,std::allocator<std::pair<int const ,std::vector<char,std::allocator<char> > > > >
+       _Vec    { size=2048 }   std::vector<std::_List_unchecked_iterator<std::_List_val<std::_List_simple_types<std::pair<int const ,std::vector<char,std::allocator<char> > > > > >,std::_Wrap_alloc<std::allocator<std::_List_unchecked_iterator<std::_List_val<std::_List_simple_types<std::pair<int const ,std::vector<char,std::allocator<char> > > > > > > > >

meaning that although we only have 100 elements, the map retained 2048 buckets.

So, not all the memory is released when you delete elements. The map maintains another section of memory to book-keep the buckets themselves, and that memory is more stubborn than the elements memory.

EDIT:
Let's go even wilder!

// Same experiment scaled up 100x: 100k insertions, 90k erasures. The bucket
// vector grows to 262144 entries and stays there after the erasures.
std::unordered_map<int, std::vector<char>> map;
for (auto i = 0; i < 100'000; i++) {
    map.emplace(i, std::vector<char>(10000));
}

for (auto i = 0; i < 90'000; i++) {
    map.erase(i);
}

The results at the end of erasing loop:

+       _List   { size=10000 }  std::list<std::pair<int const ,std::vector<char,std::allocator<char> > >,std::allocator<std::pair<int const ,std::vector<char,std::allocator<char> > > > >
+       _Vec    { size=262144 } std::vector<std::_List_unchecked_iterator<std::_List_val<std::_List_simple_types<std::pair<int const ,std::vector<char,std::allocator<char> > > > > >,std::_Wrap_alloc<std::allocator<std::_List_unchecked_iterator<std::_List_val<std::_List_simple_types<std::pair<int const ,std::vector<char,std::allocator<char> > > > > > > > >

Now, on 64 bit, the size of std::_List_unchecked_iterator<...> is 8 bytes. We have 262144 of them so we hold 262144*8/(1024*1024) = 2MB of pretty much unused data. This is the high memory usage you see.

Calling map.rehash(1024*10) after all the excess nodes have been removed seems to help with the memory consumption:

+       _List   { size=10000 }  std::list<std::pair<int const ,std::vector<char,std::allocator<char> > >,std::allocator<std::pair<int const ,std::vector<char,std::allocator<char> > > > >
+       _Vec    { size=32768 }  std::vector<std::_List_unchecked_iterator<std::_List_val<std::_List_simple_types<std::pair<int const ,std::vector<char,std::allocator<char> > > > > >,std::_Wrap_alloc<std::allocator<std::_List_unchecked_iterator<std::_List_val<std::_List_simple_types<std::pair<int const ,std::vector<char,std::allocator<char> > > > > > > > >

This is the solution you were looking for.

(PS: I have been doing a lot of .NET lately, against my will. This question really shows the good parts of C++: we can step into the standard library code with our debugger, see exactly how and when things happen, and act upon them accordingly. Doing such a thing in .NET would have been a living hell, if it were even possible.)

Kevin
  • 16,549
  • 8
  • 60
  • 74
David Haim
  • 25,446
  • 3
  • 44
  • 78
  • David, as was stated in my question, I've modified the STL unordered_map implementation to shrink_to_fit the bucket version, it doesnt work. Except this, your post 100% correct in describing `unordered_map` implementation – kreuzerkrieg Sep 04 '16 at 10:39
  • @kreuzerkrieg `shrink_to_fit` has nothing to do with your question as the buckets vector actually uses the memory. `shrink_to_fit` removes excess capacity from the end of the vector. in our example, shrink to fit will not help in the case that the vector has 2048 buckets for only 100 elements. – David Haim Sep 05 '16 at 08:07
  • its all about `shrink_to_fit`, you got wrong impression that rehashing changes anything. it does in terms of size but not capacity. set a breakpoint in `reshash` function and then in `_Init(_Newsize);` you will see that it just resizes and reserves the vector, which both does not affect internal capacity of the `vector` unless new size is greater than previously allocated. so actually you are wasting all these pointer which fill the `capacity` – kreuzerkrieg Sep 05 '16 at 09:15
  • in addition, you have salvaged 2Mb... did you see the app memory footprint? it should be ~1Gb, right? how much memory it occupied after you rehashed? 1Gb - 2Mb? we are not amused :) – kreuzerkrieg Sep 05 '16 at 09:18
  • @kreuzerkrieg 10000*10000/(1024*1024) = 95MB. besides, if your elemetns are alive, you can't expect their memory to not be visible. plus, the buckets array multiply itself by 2 each time, I can only guess that if you put huge amount of elements (few millions) the memory consumption will be much noticable – David Haim Sep 05 '16 at 09:25
  • how come? you put 100k elements of size 10k, it should be 953Mb. in addition, my elements are not alive, it is `shared_ptr`, reference count increased to 1 when I emplace it in the container and decreases to zero when erased from container, then the `shared_ptr's` deleter should kick in – kreuzerkrieg Sep 05 '16 at 09:41
  • @kreuzerkrieg both your code and mine are erasing elements of the map. when you erase an element, you destruct it. the whole point of this answer is to tell you that event though the elements themselves are beign deleted and their memory is freed, the bucket array remains unchanged. at the end of your code, there are only 1443 alive objects. bu the bucket count is probably higher – David Haim Sep 05 '16 at 09:55
3

Lets say you have deleted half of elements, then, you expect half of memory being freed. Right?

Actually no. I would expect the memory allocator to be written in terms of efficiency of execution of my program. I would expect it to allocate more memory than it needs and release that memory back to the OS only when ordered to or when it's sure that the memory will never be needed again.

I would expect memory blocks to be re-used in user-space as often as possible, and that they were allocated in contiguous blocks.

For most applications, a pedantic memory allocator which allocated memory from the OS and returned it the moment the object was destroyed would result in horribly slow programs and a great deal of disk thrashing. It would also (in practice) mean that on all popular operating systems, even the tiniest 40-byte string would be allocated its own 4k page, since the intel chipset can only process-protect memory in pages this size (or maybe bigger on some systems?)

Richard Hodges
  • 68,278
  • 7
  • 90
  • 142
  • Correct, this is how it supposed to work, but still you expect from server application which runs 24/7 from time to time to release memory, since there is no growth is data working set, it cannot grow forever. AFAIR, in MSVC `malloc` does the magic for allocating chunks and managing sub-chunks of memory, which contradicts what I observe when I use custom allocator which uses direct OS calls for allocation/free memory, yet inefficient it was written to check if `malloc` is the one to blame – kreuzerkrieg Sep 04 '16 at 10:45
  • @kreuzerkrieg there's no real need to release memory back to the OS in a memory-managed system. What you're really allocating is virtual pages. They will be swapped out by the OS when not required. The only real resource you're consuming is disk space. – Richard Hodges Sep 04 '16 at 14:04
  • and then you will hit page fault and bang your latency is skyrocketing, this is not what you want in server application. OS stuff aside, why the memory is released back to OS if you use `vector` instead of `unordered_map` and start to pop elements from it? – kreuzerkrieg Sep 04 '16 at 15:34
2

OK, after opening a premium support ticket with Microsoft I got the following answer. Most of it we already knew, but there is one piece we didn't consider.

  1. In windows memory is allocated in the heap in form of Pages
  2. In the STL there is no caching whatsoever, we straightway call RtlHeapFree after you call erase
  3. What you see is how Windows manages the Heap
  4. Once you mark something for deletion it may not be returned to the OS where there is no memory pressure, it may decide that the cost of reallocating the memory in future is more than keeping it in the process
  5. This is how any Heap algorithms work
  6. Another thing to consider is that; if the values that you are deleting happens to spread out across pages; and unless all the values inside the page are empty it will be resident in the memory
  7. If you are extremely particular about immediate reduction of your private bytes you may have to write your own memory manager and not depend on Windows Heap Handle.

Emphasis is mine. I guess it answers the question, or the question is as simple as "this is the way Windows heap management works". In any case there is no (simple) solution for this problem, maybe it is better to use something like boost::intrusive containers which theoretically supposed to provide better locality so the Windows memory manager has a better chance to return memory to OS.

UPDATE001: A Boost intrusive container didn't do the trick either.

// Intrusive variant of Entity: derives from the unordered_set hook so it can
// be linked into a boost::intrusive::unordered_set without per-node
// allocations; `first` acts as the lookup key (see first_is_key).
struct Entity : public boost::intrusive::unordered_set_base_hook<>
{
    explicit Entity(size_t id)
    {
        first = id;
        // NOTE(fix): std::string("a", n) is the (const char*, count) ctor and
        // reads n bytes from a two-byte literal — undefined behavior. The
        // (count, char) constructor is what was intended.
        _6 = std::string(dis(gen), 'a');
        _7 = std::string(dis(gen), 'b');
        // NOTE(fix): hoist the bound; dis(gen) was re-rolled every iteration.
        const size_t count = dis(gen);
        for(size_t i = 0; i < count; ++i)
        {
            _9.emplace(i, std::string(dis(gen), 'c'));
        }
    }

    size_t first = 1;
    int _1 = 1;
    int _2 = 2;
    float _5 = 3.14f;
    double _3 = 3;
    double _4 = 5;
    std::string _6 = "hello world!";
    std::string _7 = "A quick brown fox jumps over the lazy dog.";
    std::vector<unsigned long long> _8 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0};
    std::map<long long, std::string> _9 = {{0, "a"}, {1, "b"}, {2, "c"}, {3, "d"}, {4, "e"},
                                           {5, "f"}, {6, "g"}, {7, "h"}, {8, "e"}, {9, "j"}};
    // NOTE(fix): parentheses, not braces — 1000 doubles, not the two-element
    // list {1000, 3.14}.
    std::vector<double> _10 = std::vector<double>(1000, 3.14);
    std::random_device rd;
    std::mt19937 gen = std::mt19937(rd());
    std::uniform_int_distribution<size_t> dis = std::uniform_int_distribution<size_t>(16, 256);
};

// Key extractor for boost::intrusive::key_of_value: lets the unordered_set
// hash and look elements up by Entity::first instead of whole-Entity equality.
struct first_is_key
{
    // Key type required by the KeyOfValue concept.
    typedef size_t type;

    // Must be const and return a stable reference into the stored element.
    const type& operator()(const Entity& v) const { return v.first; }
};

using Container = boost::intrusive::unordered_set<Entity, boost::intrusive::key_of_value<first_is_key>>;

// Print a timestamped size/bucket-count snapshot of the intrusive container.
void printContainerInfo(const Container& container)
{
    std::cout << std::chrono::system_clock::to_time_t(std::chrono::system_clock::now())
              << ", Size: " << container.size() << ", Bucket count: " << container.bucket_count() << std::endl;
}

int main()
{
    constexpr size_t maxEntites = 100'000;
    Container::bucket_type* base_buckets = new Container::bucket_type[maxEntites];
    Container test(Container::bucket_traits(base_buckets, maxEntites));

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<size_t> dis;

    while(test.size() < maxEntites)
    {
        auto data = new Entity(dis(gen));
        auto res = test.insert(*data);
        if(!res.second)
        {
            delete data;
        }
    }

    printContainerInfo(test);
    while(test.size() > 0)
    {
        while(test.size() > maxEntites * 2 / 3)
        {
            test.erase_and_dispose(test.begin(), [](Entity* entity)
                                   {
                                       delete entity;
                                   });
        }

        printContainerInfo(test);
        while(test.size() < maxEntites)
        {
            auto data = new Entity(dis(gen));
            auto res = test.insert(*data);
            if(!res.second)
            {
                delete data;
            }
        }
    }
    return 0;
}
kreuzerkrieg
  • 3,009
  • 3
  • 28
  • 59