
I'm running some simple tests of remote communication in HPX, built with the MPI parcelport, and I'm seeing unexpected bandwidth and latency figures.

The test is performed with this simple code:

#include <iostream>
#include <chrono>
#include <hpx/hpx_main.hpp>
#include <hpx/include/components.hpp>
#include <hpx/include/actions.hpp>
#include <hpx/include/iostreams.hpp>

class block
    : public hpx::components::component_base<block>
{
public:
    block(std::size_t size) : data_(size, 1) {}

    std::vector<double> get_data() { return data_; }
    int pingpong() { return 1; }

    HPX_DEFINE_COMPONENT_ACTION(block, get_data, get_data_action);
    HPX_DEFINE_COMPONENT_ACTION(block, pingpong, pingpong_action);

private:
    std::vector<double> data_;
};

typedef hpx::components::component<block> block_type;
typedef block::get_data_action            block__get_data_action;
typedef block::pingpong_action            block__pingpong_action;

HPX_REGISTER_COMPONENT(block_type, block);
HPX_REGISTER_ACTION(block::get_data_action, block__get_data_action);
HPX_REGISTER_ACTION(block::pingpong_action, block__pingpong_action);


////////////////////////////////////////////////////////////////////
int main(){
    std::vector<hpx::id_type> locs = hpx::find_all_localities();
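    // NOTE: this test assumes at least two localities (e.g. launched via MPI with
    // two or more ranks); locs[1] below refers to the remote locality.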

    std::size_t minsize=1e3;
    std::size_t maxsize=1e8;
    std::size_t ntries = 100;

    block__get_data_action     act_data;
    block__pingpong_action     act_pingpong;

    for(std::size_t size = minsize; size<=maxsize; size*=2){
        hpx::id_type remote_block = hpx::new_<block_type>(locs[1], size).get();
        double Mb_size=size*sizeof(double)/1.e6;

        hpx::cout << "Size = " << Mb_size << " MB.";  

        //---------------- Bandwidth ------------------

        double seconds_bandwidth=0;
        std::vector<double>  buffer(size);
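        // Each iteration times a complete remote get_data() call: the action
        // invocation, the transfer of the vector of doubles, and the copy into buffer.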
        for(int i=0; i<ntries; i++){
            auto t = std::chrono::high_resolution_clock::now();
            buffer = act_data(remote_block);
            auto elapsed = std::chrono::high_resolution_clock::now() - t;
            seconds_bandwidth+=std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count()/1.e6;
        }   
        seconds_bandwidth/=double(ntries);
        hpx::cout << "\t Bandwidth = " << Mb_size/seconds_bandwidth << " MB/s.";     

        //---------------- PingPong ------------------

        double microseconds_pingpong=0;
        int intbuffer=0;
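        // Each iteration times a remote action call that only returns an int,
        // i.e. the round-trip latency of a minimal action invocation.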
        for(int i=0; i<ntries; i++){
            auto t = std::chrono::high_resolution_clock::now();
            intbuffer=act_pingpong(remote_block);
            auto elapsed = std::chrono::high_resolution_clock::now() - t;
            microseconds_pingpong+=std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
        }
        microseconds_pingpong/=double(ntries);
        hpx::cout << "\t PingPong = " << microseconds_pingpong << " microseconds. " << hpx::endl;
    }

  return 0;
}

Running the test on an Intel Omni-Path cluster, I get the following values:

Size = 0.008 MB.         Bandwidth = 30.2058 MB/s.       PingPong = 157.67 microseconds. 
Size = 0.016 MB.         Bandwidth = 75.5929 MB/s.       PingPong = 143.98 microseconds. 
Size = 0.032 MB.         Bandwidth = 143.639 MB/s.       PingPong = 153.12 microseconds. 
Size = 0.064 MB.         Bandwidth = 256.966 MB/s.       PingPong = 142 microseconds. 
Size = 0.128 MB.         Bandwidth = 343.744 MB/s.       PingPong = 148.17 microseconds. 
Size = 0.256 MB.         Bandwidth = 389.371 MB/s.       PingPong = 143.38 microseconds. 
Size = 0.512 MB.         Bandwidth = 618.589 MB/s.       PingPong = 153.1 microseconds. 
Size = 1.024 MB.         Bandwidth = 821.764 MB/s.       PingPong = 148.94 microseconds. 
Size = 2.048 MB.         Bandwidth = 1003.29 MB/s.       PingPong = 146.17 microseconds. 
Size = 4.096 MB.         Bandwidth = 201.063 MB/s.       PingPong = 158.39 microseconds. 
Size = 8.192 MB.         Bandwidth = 91.1075 MB/s.       PingPong = 153.49 microseconds. 
Size = 16.384 MB.        Bandwidth = 1655.55 MB/s.       PingPong = 147.72 microseconds. 
Size = 32.768 MB.        Bandwidth = 407.986 MB/s.       PingPong = 151.03 microseconds. 
Size = 65.536 MB.        Bandwidth = 427.471 MB/s.       PingPong = 149.75 microseconds. 
Size = 131.072 MB.       Bandwidth = 295.531 MB/s.       PingPong = 147.37 microseconds. 
Size = 262.144 MB.       Bandwidth = 513.221 MB/s.       PingPong = 146.4 microseconds. 
Size = 524.288 MB.       Bandwidth = 708.265 MB/s.       PingPong = 147.14 microseconds. 

What seems strange to me is that:

  • Bandwidth is very unstable, oscillating from around 100 MB/s up to more than 1000 MB/s.
  • Bandwidth is very low compared to what this network should deliver. This is confirmed by a pure-MPI implementation of the same test, which achieves at least 10 times the bandwidth.
  • PingPong time is very high, about 20 times higher than a pure-MPI ping-pong. Here the PingPong time is measured as the time needed to invoke a remote action that returns an integer, whereas the pure-MPI implementation simply sends and receives an integer (see the sketch after this list).
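
For reference, the pure-MPI baseline I'm comparing against is essentially the following (a reconstruction of the idea rather than the exact code I ran; the sizes and try counts mirror the HPX test above):

#include <mpi.h>
#include <cstddef>
#include <cstdio>
#include <vector>

int main(int argc, char** argv){
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    const int ntries = 100;

    // Bandwidth: rank 1 sends a vector of doubles to rank 0.
    for(std::size_t size = 1000; size <= 100000000; size *= 2){
        std::vector<double> buffer(size, 1.0);
        MPI_Barrier(MPI_COMM_WORLD);
        double t0 = MPI_Wtime();
        for(int i = 0; i < ntries; i++){
            if(rank == 0)
                MPI_Recv(buffer.data(), static_cast<int>(size), MPI_DOUBLE, 1, 0,
                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            else if(rank == 1)
                MPI_Send(buffer.data(), static_cast<int>(size), MPI_DOUBLE, 0, 0,
                         MPI_COMM_WORLD);
        }
        double seconds = (MPI_Wtime() - t0) / ntries;
        if(rank == 0)
            std::printf("Size = %g MB. \t Bandwidth = %g MB/s.\n",
                        size * sizeof(double) / 1.e6,
                        size * sizeof(double) / 1.e6 / seconds);
    }

    // PingPong: an int goes rank 0 -> rank 1 -> rank 0; the full round trip is
    // timed, matching the HPX measurement above.
    int token = 1;
    MPI_Barrier(MPI_COMM_WORLD);
    double t0 = MPI_Wtime();
    for(int i = 0; i < ntries; i++){
        if(rank == 0){
            MPI_Send(&token, 1, MPI_INT, 1, 1, MPI_COMM_WORLD);
            MPI_Recv(&token, 1, MPI_INT, 1, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
        else if(rank == 1){
            MPI_Recv(&token, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            MPI_Send(&token, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
        }
    }
    if(rank == 0)
        std::printf("PingPong = %g microseconds.\n",
                    (MPI_Wtime() - t0) / ntries * 1.e6);

    MPI_Finalize();
    return 0;
}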

Thus, I have the following questions:

  • Is the low bandwidth normal behavior for the MPI parcelport in HPX? If so, why? If not, what could cause it?
  • Is this a fair way to measure PingPong time with HPX? Is it comparable with a pure-MPI implementation?
  • Is the high PingPong time normal?

If you need more info, for example about the HPX compilation and configuration, please don't hesitate to ask. I'm new to HPX, so I may have made some mistakes in my code or done something non-optimal (sorry for that).

Thanks a lot!
