MSVC has a built-in profiler. We can run it (press Alt-F2) to see that the majority of CPU time is spent inside the constructor and destructor, which invoke deque::resize()
and deque::_Tidy()
functions, respectively.

If we drill down further we see that deque::emplace_back()
is resulting in quite a lot of code
#define _PUSH_BACK_BEGIN \
if ((_Myoff() + _Mysize()) % _DEQUESIZ == 0 && _Mapsize() <= (_Mysize() + _DEQUESIZ) / _DEQUESIZ) { \
_Growmap(1); \
} \
_Myoff() &= _Mapsize() * _DEQUESIZ - 1; \
size_type _Newoff = _Myoff() + _Mysize(); \
size_type _Block = _Getblock(_Newoff); \
if (_Map()[_Block] == nullptr) { \
_Map()[_Block] = _Getal().allocate(_DEQUESIZ); \
}
#define _PUSH_BACK_END ++_Mysize()
template <class... _Valty>
decltype(auto) emplace_back(_Valty&&... _Val) {
_Orphan_all();
_PUSH_BACK_BEGIN;
_Alty_traits::construct(
_Getal(), _Unfancy(_Map()[_Block] + _Newoff % _DEQUESIZ), _STD forward<_Valty>(_Val)...);
_PUSH_BACK_END;
#if _HAS_CXX17
return back();
#endif // _HAS_CXX17
}
Disassembly view:
template <class... _Valty>
decltype(auto) emplace_back(_Valty&&... _Val) {
00007FF674A238E0 mov qword ptr [rsp+8],rcx
00007FF674A238E5 push rbp
00007FF674A238E6 push rdi
00007FF674A238E7 sub rsp,138h
00007FF674A238EE lea rbp,[rsp+20h]
00007FF674A238F3 mov rdi,rsp
00007FF674A238F6 mov ecx,4Eh
00007FF674A238FB mov eax,0CCCCCCCCh
00007FF674A23900 rep stos dword ptr [rdi]
00007FF674A23902 mov rcx,qword ptr [rsp+158h]
00007FF674A2390A lea rcx,[__0657B1E2_deque (07FF674A3E02Fh)]
00007FF674A23911 call __CheckForDebuggerJustMyCode (07FF674A21159h)
_Orphan_all();
00007FF674A23916 mov rcx,qword ptr [this]
00007FF674A2391D call std::deque<double,std::allocator<double> >::_Orphan_all (07FF674A217FDh)
_PUSH_BACK_BEGIN;
00007FF674A23922 mov rcx,qword ptr [this]
00007FF674A23929 call std::deque<double,std::allocator<double> >::_Myoff (07FF674A2139Dh)
00007FF674A2392E mov qword ptr [rbp+0F8h],rax
00007FF674A23935 mov rcx,qword ptr [this]
00007FF674A2393C call std::deque<double,std::allocator<double> >::_Mysize (07FF674A211B8h)
00007FF674A23941 mov rcx,qword ptr [rbp+0F8h]
00007FF674A23948 mov rcx,qword ptr [rcx]
00007FF674A2394B add rcx,qword ptr [rax]
00007FF674A2394E mov rax,rcx
00007FF674A23951 xor edx,edx
00007FF674A23953 mov ecx,2
00007FF674A23958 div rax,rcx
00007FF674A2395B mov rax,rdx
00007FF674A2395E test rax,rax
00007FF674A23961 jne std::deque<double,std::allocator<double> >::emplace_back<>+0D0h (07FF674A239B0h)
00007FF674A23963 mov rcx,qword ptr [this]
00007FF674A2396A call std::deque<double,std::allocator<double> >::_Mapsize (07FF674A214BFh)
00007FF674A2396F mov qword ptr [rbp+0F8h],rax
00007FF674A23976 mov rcx,qword ptr [this]
00007FF674A2397D call std::deque<double,std::allocator<double> >::_Mysize (07FF674A211B8h)
00007FF674A23982 mov rax,qword ptr [rax]
00007FF674A23985 add rax,2
00007FF674A23989 xor edx,edx
00007FF674A2398B mov ecx,2
00007FF674A23990 div rax,rcx
00007FF674A23993 mov rcx,qword ptr [rbp+0F8h]
00007FF674A2399A cmp qword ptr [rcx],rax
00007FF674A2399D ja std::deque<double,std::allocator<double> >::emplace_back<>+0D0h (07FF674A239B0h)
00007FF674A2399F mov edx,1
00007FF674A239A4 mov rcx,qword ptr [this]
00007FF674A239AB call std::deque<double,std::allocator<double> >::_Growmap (07FF674A21640h)
00007FF674A239B0 mov rcx,qword ptr [this]
00007FF674A239B7 call std::deque<double,std::allocator<double> >::_Mapsize (07FF674A214BFh)
00007FF674A239BC mov rax,qword ptr [rax]
00007FF674A239BF lea rax,[rax+rax-1]
00007FF674A239C4 mov qword ptr [rbp+0F8h],rax
00007FF674A239CB mov rcx,qword ptr [this]
00007FF674A239D2 call std::deque<double,std::allocator<double> >::_Myoff (07FF674A2139Dh)
00007FF674A239D7 mov qword ptr [rbp+100h],rax
00007FF674A239DE mov rax,qword ptr [rbp+100h]
00007FF674A239E5 mov rax,qword ptr [rax]
00007FF674A239E8 mov qword ptr [rbp+108h],rax
00007FF674A239EF mov rax,qword ptr [rbp+0F8h]
00007FF674A239F6 mov rcx,qword ptr [rbp+108h]
00007FF674A239FD and rcx,rax
00007FF674A23A00 mov rax,rcx
00007FF674A23A03 mov rcx,qword ptr [rbp+100h]
00007FF674A23A0A mov qword ptr [rcx],rax
00007FF674A23A0D mov rcx,qword ptr [this]
00007FF674A23A14 call std::deque<double,std::allocator<double> >::_Myoff (07FF674A2139Dh)
00007FF674A23A19 mov qword ptr [rbp+0F8h],rax
00007FF674A23A20 mov rcx,qword ptr [this]
00007FF674A23A27 call std::deque<double,std::allocator<double> >::_Mysize (07FF674A211B8h)
00007FF674A23A2C mov rcx,qword ptr [rbp+0F8h]
00007FF674A23A33 mov rcx,qword ptr [rcx]
00007FF674A23A36 add rcx,qword ptr [rax]
00007FF674A23A39 mov rax,rcx
00007FF674A23A3C mov qword ptr [_Newoff],rax
00007FF674A23A40 mov rdx,qword ptr [_Newoff]
00007FF674A23A44 mov rcx,qword ptr [this]
00007FF674A23A4B call std::deque<double,std::allocator<double> >::_Getblock (07FF674A21334h)
00007FF674A23A50 mov qword ptr [_Block],rax
00007FF674A23A54 mov rcx,qword ptr [this]
00007FF674A23A5B call std::deque<double,std::allocator<double> >::_Map (07FF674A21753h)
00007FF674A23A60 mov rax,qword ptr [rax]
00007FF674A23A63 mov rcx,qword ptr [_Block]
00007FF674A23A67 cmp qword ptr [rax+rcx*8],0
00007FF674A23A6C jne std::deque<double,std::allocator<double> >::emplace_back<>+1D7h (07FF674A23AB7h)
00007FF674A23A6E mov rcx,qword ptr [this]
00007FF674A23A75 call std::deque<double,std::allocator<double> >::_Getal (07FF674A216CCh)
00007FF674A23A7A mov qword ptr [rbp+0F8h],rax
00007FF674A23A81 mov edx,2
00007FF674A23A86 mov rcx,qword ptr [rbp+0F8h]
00007FF674A23A8D call std::allocator<double>::allocate (07FF674A216C7h)
00007FF674A23A92 mov qword ptr [rbp+100h],rax
00007FF674A23A99 mov rcx,qword ptr [this]
00007FF674A23AA0 call std::deque<double,std::allocator<double> >::_Map (07FF674A21753h)
00007FF674A23AA5 mov rax,qword ptr [rax]
00007FF674A23AA8 mov rcx,qword ptr [_Block]
00007FF674A23AAC mov rdx,qword ptr [rbp+100h]
00007FF674A23AB3 mov qword ptr [rax+rcx*8],rdx
_Alty_traits::construct(
00007FF674A23AB7 mov rcx,qword ptr [this]
00007FF674A23ABE call std::deque<double,std::allocator<double> >::_Map (07FF674A21753h)
00007FF674A23AC3 mov rax,qword ptr [rax]
00007FF674A23AC6 mov qword ptr [rbp+0F8h],rax
00007FF674A23ACD xor edx,edx
00007FF674A23ACF mov rax,qword ptr [_Newoff]
00007FF674A23AD3 mov ecx,2
00007FF674A23AD8 div rax,rcx
00007FF674A23ADB mov rax,rdx
00007FF674A23ADE mov rcx,qword ptr [_Block]
00007FF674A23AE2 mov rdx,qword ptr [rbp+0F8h]
00007FF674A23AE9 mov rcx,qword ptr [rdx+rcx*8]
00007FF674A23AED lea rax,[rcx+rax*8]
00007FF674A23AF1 mov rcx,rax
00007FF674A23AF4 call std::_Unfancy<double> (07FF674A214A6h)
00007FF674A23AF9 mov qword ptr [rbp+100h],rax
00007FF674A23B00 mov rcx,qword ptr [this]
00007FF674A23B07 call std::deque<double,std::allocator<double> >::_Getal (07FF674A216CCh)
00007FF674A23B0C mov qword ptr [rbp+108h],rax
00007FF674A23B13 mov rdx,qword ptr [rbp+100h]
00007FF674A23B1A mov rcx,qword ptr [rbp+108h]
00007FF674A23B21 call std::_Default_allocator_traits<std::allocator<double> >::construct<double> (07FF674A211E5h)
_Getal(), _Unfancy(_Map()[_Block] + _Newoff % _DEQUESIZ), _STD forward<_Valty>(_Val)...);
_PUSH_BACK_END;
00007FF674A23B26 mov rcx,qword ptr [this]
00007FF674A23B2D call std::deque<double,std::allocator<double> >::_Mysize (07FF674A211B8h)
00007FF674A23B32 mov qword ptr [rbp+0F8h],rax
00007FF674A23B39 mov rax,qword ptr [rbp+0F8h]
00007FF674A23B40 mov rax,qword ptr [rax]
00007FF674A23B43 inc rax
00007FF674A23B46 mov rcx,qword ptr [rbp+0F8h]
00007FF674A23B4D mov qword ptr [rcx],rax
#if _HAS_CXX17
return back();
00007FF674A23B50 mov rcx,qword ptr [this]
00007FF674A23B57 call std::deque<double,std::allocator<double> >::back (07FF674A2127Bh)
#endif // _HAS_CXX17
}
00007FF674A23B5C lea rsp,[rbp+118h]
00007FF674A23B63 pop rdi
00007FF674A23B64 pop rbp
00007FF674A23B65 ret
Apparently std::deque
doesn't pre-allocate elements, and instead uses a loop to add them one-by-one. So no wonder it is slow.
You can speed up the Debug build by enabling some optimizations (e.g. /Ob1
) and reducing runtime checks (e.g. remove /RTC1
).
But really, std::deque
is just a horrible structure from a performance point of view (a vector of tiny vectors - not cache-friendly at all).