I want to allocate about 40 GB of RAM. My first attempt was:
#include <iostream>
#include <ctime>
int main(int argc, char** argv)
{
unsigned long long ARRAYSIZE = 20ULL * 1024ULL * 1024ULL * 1024ULL;
unsigned __int16 *myBuff = new unsigned __int16[ARRAYSIZE]; // 3GB/s 40GB / 13.7 s
unsigned long long i = 0;
const clock_t begintime = clock();
for (i = 0; i < ARRAYSIZE; ++i){
myBuff[i] = 0;
}
std::cout << "finish: " << float(clock() - begintime) / CLOCKS_PER_SEC << std::endl;
std::cin.get();
delete [] myBuff;
return 0;
}
The memory write speed was about 3 GB/s, which is not satisfactory for my high-performance system.
So I tried Intel Cilk Plus as below:
/*
nworkers = 5; 8.5 s ==> 4.7 GB/s
nworkers = 8; 8.2 s ==> 4.8 GB/s
nworkers = 10; 9 s ==> 4.5 GB/s
nworkers = 32; 15 s ==> 2.6 GB/s
*/
#include "cilk\cilk.h"
#include "cilk\cilk_api.h"
#include <iostream>
#include <ctime>
int main(int argc, char** argv)
{
unsigned long long ARRAYSIZE = 20ULL * 1024ULL * 1024ULL * 1024ULL;
unsigned __int16 *myBuff = new unsigned __int16[ARRAYSIZE];
if (0 != __cilkrts_set_param("nworkers", "32")){
std::cout << "Error" << std::endl;
}
const clock_t begintime = clock();
cilk_for(long long j = 0; j < ARRAYSIZE; ++j){
myBuff[j] = 0;
}
std::cout << "finish: " << float(clock() - begintime) / CLOCKS_PER_SEC << std::endl;
std::cin.get();
delete [] myBuff;
return 0;
}
The results are given in the comment above the code. As can be seen, there is a speed-up for nworkers = 8, but the larger nworkers gets beyond that, the slower the allocation becomes. I thought this might be due to locking between threads, so I tried the scalable allocator provided by Intel TBB:
#include "tbb\task_scheduler_init.h"
#include "tbb\blocked_range.h"
#include "tbb\parallel_for.h"
#include "tbb\scalable_allocator.h"
#include "cilk\cilk.h"
#include "cilk\cilk_api.h"
#include <iostream>
#include <ctime>
// No retry loop, because we assume that scalable_malloc already does
// everything it can to allocate the memory, so calling it repeatedly
// will not improve the situation at all.
//
// No use of std::new_handler, because it cannot be done in a portable
// and thread-safe way (see sidebar).
//
// We throw std::bad_alloc() when scalable_malloc returns NULL
// (we return NULL if it is a no-throw implementation).
// Throwing replacement for the global operator new, backed by scalable_malloc.
//
// size: number of bytes requested; a request for 0 bytes is served as 1 byte.
// Returns the pointer produced by scalable_malloc.
// Throws std::bad_alloc when scalable_malloc returns a null pointer.
void* operator new (size_t size) throw (std::bad_alloc)
{
    const size_t bytes = (size == 0) ? 1 : size;
    void* const block = scalable_malloc(bytes);
    if (!block)
        throw std::bad_alloc();
    return block;
}
// Throwing array form of operator new; forwards the byte count unchanged to
// the scalar operator new, so it shares that function's failure behaviour.
void* operator new[](size_t size) throw (std::bad_alloc)
{
    void* const block = operator new (size);
    return block;
}
// Non-throwing replacement for the global operator new, backed by scalable_malloc.
//
// size: number of bytes requested; a request for 0 bytes is served as 1 byte.
// Returns whatever scalable_malloc returns, which is a null pointer on failure.
void* operator new (size_t size, const std::nothrow_t&) throw ()
{
    const size_t bytes = (size == 0) ? 1 : size;
    return scalable_malloc(bytes);
}
// Non-throwing array form of operator new; forwards to the non-throwing
// scalar operator new and returns its result (possibly a null pointer).
void* operator new[](size_t size, const std::nothrow_t&) throw ()
{
    void* const block = operator new (size, std::nothrow);
    return block;
}
// Replacement for the global operator delete: hands a non-null pointer to
// scalable_free and does nothing for a null pointer.
void operator delete (void* ptr) throw ()
{
    if (ptr == 0)
        return;
    scalable_free(ptr);
}
// Array form of operator delete; forwards the pointer to the scalar
// operator delete.
void operator delete[](void* ptr) throw ()
{
    ::operator delete (ptr);
}
// Nothrow-tagged counterpart of operator delete: hands a non-null pointer to
// scalable_free and does nothing for a null pointer.
void operator delete (void* ptr, const std::nothrow_t&) throw ()
{
    if (ptr == 0)
        return;
    scalable_free(ptr);
}
// Nothrow-tagged array form of operator delete; forwards the pointer to the
// nothrow-tagged scalar operator delete.
void operator delete[](void* ptr, const std::nothrow_t&) throw ()
{
    ::operator delete (ptr, std::nothrow);
}
int main(int argc, char** argv)
{
unsigned long long ARRAYSIZE = 20ULL * 1024ULL * 1024ULL * 1024ULL;
tbb::task_scheduler_init tbb_init;
unsigned __int16 *myBuff = new unsigned __int16[ARRAYSIZE];
if (0 != __cilkrts_set_param("nworkers", "10")){
std::cout << "Error" << std::endl;
}
const clock_t begintime = clock();
cilk_for(long long j = 0; j < ARRAYSIZE; ++j){
myBuff[j] = 0;
}
std::cout << "finish: " << float(clock() - begintime) / CLOCKS_PER_SEC << std::endl;
std::cin.get();
delete [] myBuff;
return 0;
}
(The above code is adapted from the Intel TBB book by James Reinders, O'Reilly.) But the results are almost identical to the previous attempt. I set the TBB_VERSION environment variable to check whether I am really using scalable_malloc, and the information I got is shown in this picture (nworkers = 32):
https://www.dropbox.com/s/y1vril3f19mkf66/TBB_Info.png?dl=0
I would like to know what is wrong with my code. I expect the memory write speed to be at least about 40 GB/s.
How should I use the scalable allocator correctly?
Can somebody please present a simple, verified example of using the scalable allocator from Intel TBB?
Environment: Intel Xeon CPU E5-2690 0 @ 2.90 GHz (2 processors), 224 GB RAM (2 * 7 * 16 GB) DDR3 1600 MHz, Windows Server 2008 R2 Datacenter, Microsoft Visual Studio 2013 and Intel C++ Compiler 2017.