
My company is looking into using ZeroMQ as the transport mechanism. First I benchmarked the performance just to get a feel for what I'm playing with.

So I created an application comparing a ZMQ dealer-to-dealer setup against Winsock. I measured the round-trip time of sending synchronous messages from a client to a server and then calculated the average.

Here is the server running Winsock:

DWORD RunServerWINSOCKTest(DWORD dwPort)
{
    WSADATA wsaData;
    int iRet = WSAStartup(MAKEWORD(2, 2), &wsaData);
    if (iRet != NO_ERROR)
    {
        printf("WSAStartup failed with error: %d\n", iRet);
        return iRet;
    }

    struct addrinfo hints;
    ZeroMemory(&hints, sizeof(hints));
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;
    hints.ai_protocol = IPPROTO_TCP;
    hints.ai_flags = AI_PASSIVE;

    struct addrinfo *result = NULL;
    iRet = getaddrinfo(NULL, std::to_string(dwPort).c_str(), &hints, &result);
    if (iRet != 0)
    {
        WSACleanup();
        return iRet;
    }

    SOCKET ListenSocket = socket(result->ai_family, result->ai_socktype, result->ai_protocol);
    if (ListenSocket == INVALID_SOCKET)
    {
        freeaddrinfo(result);
        WSACleanup();
        return WSAGetLastError();
    }

    iRet = bind(ListenSocket, result->ai_addr, (int)result->ai_addrlen);
    if (iRet == SOCKET_ERROR)
    {
        freeaddrinfo(result);
        closesocket(ListenSocket);
        WSACleanup();
        return WSAGetLastError();
    }

    freeaddrinfo(result);
    iRet = listen(ListenSocket, SOMAXCONN);
    if (iRet == SOCKET_ERROR)
    {
        closesocket(ListenSocket);
        WSACleanup();
        return WSAGetLastError();
    }

    while (true)
    {
        SOCKET ClientSocket = accept(ListenSocket, NULL, NULL);
        if (ClientSocket == INVALID_SOCKET)
        {
            closesocket(ListenSocket);
            WSACleanup();
            return WSAGetLastError();
        }
        char value = 1; // enable TCP_NODELAY, i.e. disable Nagle's algorithm
        setsockopt(ClientSocket, IPPROTO_TCP, TCP_NODELAY, &value, sizeof(value));

        char recvbuf[DEFAULT_BUFLEN];
        int recvbuflen = DEFAULT_BUFLEN;
        do {

            iRet = recv(ClientSocket, recvbuf, recvbuflen, 0);
            if (iRet > 0) {
                // Echo the buffer back to the sender
                int iSendResult = send(ClientSocket, recvbuf, iRet, 0);
                if (iSendResult == SOCKET_ERROR)
                {
                    closesocket(ClientSocket);
                    WSACleanup();
                    return WSAGetLastError();
                }
            }
            else if (iRet == 0)
                printf("Connection closing...\n");
            else  {
                closesocket(ClientSocket);
                WSACleanup();
                return 1;
            }

        } while (iRet > 0);

        iRet = shutdown(ClientSocket, SD_SEND);
        if (iRet == SOCKET_ERROR)
        {
            closesocket(ClientSocket);
            WSACleanup();
            return WSAGetLastError();
        }
        closesocket(ClientSocket);
    }
    closesocket(ListenSocket);

    return WSACleanup();
}

Here is the client running Winsock:

DWORD RunClientWINSOCKTest(std::string strAddress, DWORD dwPort, DWORD dwMessageSize)
{
    WSADATA wsaData;
    int iRet = WSAStartup(MAKEWORD(2, 2), &wsaData);
    if (iRet != NO_ERROR)
    {
        return iRet;
    }

    SOCKET ConnectSocket = INVALID_SOCKET;
    struct addrinfo *result = NULL,  *ptr = NULL, hints;


    ZeroMemory(&hints, sizeof(hints));
    hints.ai_family = AF_UNSPEC;
    hints.ai_socktype = SOCK_STREAM;
    hints.ai_protocol = IPPROTO_TCP;

    int iResult = getaddrinfo(strAddress.c_str(), std::to_string(dwPort).c_str(), &hints, &result);
    if (iResult != 0) {
        WSACleanup();
        return 1;
    }

    for (ptr = result; ptr != NULL; ptr = ptr->ai_next) {
        ConnectSocket = socket(ptr->ai_family, ptr->ai_socktype, ptr->ai_protocol);
        if (ConnectSocket == INVALID_SOCKET) {
            WSACleanup();
            return 1;
        }

        iResult = connect(ConnectSocket, ptr->ai_addr, (int)ptr->ai_addrlen);
        if (iResult == SOCKET_ERROR) {
            closesocket(ConnectSocket);
            ConnectSocket = INVALID_SOCKET;
            continue;
        }
        break;
    }

    freeaddrinfo(result);

    if (ConnectSocket == INVALID_SOCKET) {
        WSACleanup();
        return 1;
    }


    // Statistics
    UINT64 uint64BytesTransmitted = 0;
    UINT64 uint64StartTime = s_TimeStampGenerator.GetHighResolutionTimeStamp();
    UINT64 uint64WaitForResponse = 0;

    DWORD dwMessageCount = 1000000;

    CHAR cRecvMsg[DEFAULT_BUFLEN];
    SecureZeroMemory(&cRecvMsg, DEFAULT_BUFLEN);

    std::string strSendMsg(dwMessageSize, 'X');

    for (DWORD dwI = 0; dwI < dwMessageCount; dwI++)
    {
        int iRet = send(ConnectSocket, strSendMsg.data(), strSendMsg.size(), 0);
        if (iRet == SOCKET_ERROR) {
            closesocket(ConnectSocket);
            WSACleanup();
            return 1;
        }
        uint64BytesTransmitted += strSendMsg.size();

        UINT64 uint64BeforeRespone = s_TimeStampGenerator.GetHighResolutionTimeStamp();
        iRet = recv(ConnectSocket, cRecvMsg, DEFAULT_BUFLEN, 0);
        if (iRet < 1)
        {
            closesocket(ConnectSocket);
            WSACleanup();
            return 1;
        }
        std::string strMessage(cRecvMsg, iRet); // construct from the number of bytes actually received

        if (strMessage.compare(strSendMsg) == 0)
        {
            uint64WaitForResponse += (s_TimeStampGenerator.GetHighResolutionTimeStamp() - uint64BeforeRespone);
        }
        else
        {
            return NO_ERROR;
        }
}

    UINT64 uint64ElapsedTime = s_TimeStampGenerator.GetHighResolutionTimeStamp() - uint64StartTime;
    PrintResult(uint64ElapsedTime, uint64WaitForResponse, dwMessageCount, uint64BytesTransmitted, dwMessageSize);

    iResult = shutdown(ConnectSocket, SD_SEND);
    if (iResult == SOCKET_ERROR) {
        closesocket(ConnectSocket);
        WSACleanup();
        return 1;
    }
    closesocket(ConnectSocket);
    return WSACleanup();
}

Here is the server running ZMQ (dealer)

DWORD RunServerZMQTest(DWORD dwPort)
{
    try
    {
        zmq::context_t context(1);
        zmq::socket_t server(context, ZMQ_DEALER);

        // Set options here
        std::string strIdentity = s_set_id(server);
        printf("Created server connection with ID: %s\n", strIdentity.c_str());

        std::string strConnect = "tcp://*:" + std::to_string(dwPort);
        server.bind(strConnect.c_str());

        bool bRunning = true;
        while (bRunning)
        {
            std::string strMessage = s_recv(server);

            if (!s_send(server, strMessage))
            {
                return NO_ERROR;
            }
        }
    }
    catch (zmq::error_t& e)
    {
        return (DWORD)e.num();
    }

    return NO_ERROR;
}

Here is the client running ZMQ (dealer)

DWORD RunClientZMQTest(std::string strAddress, DWORD dwPort, DWORD dwMessageSize)
{
    try
    {
        zmq::context_t ctx(1);
        zmq::socket_t client(ctx, ZMQ_DEALER); // ZMQ_REQ

        // Set options here
        std::string strIdentity = s_set_id(client);

        std::string strConnect = "tcp://" + strAddress + ":" + std::to_string(dwPort);
        client.connect(strConnect.c_str());

        if(s_send(client, "INIT"))
        {
            std::string strMessage = s_recv(client);
            if (strMessage.compare("INIT") == 0)
            {
                printf("Client[%s] connected to: %s\n", strIdentity.c_str(), strConnect.c_str());
            }
            else
            {
                return NO_ERROR;
            }
        }
        else
        {
            return NO_ERROR;
        }


        // Statistics
        UINT64 uint64BytesTransmitted   = 0;
        UINT64 uint64StartTime          = s_TimeStampGenerator.GetHighResolutionTimeStamp();
        UINT64 uint64WaitForResponse    = 0;

        DWORD dwMessageCount = 1000000;


        std::string strSendMsg(dwMessageSize, 'X');
        for (DWORD dwI = 0; dwI < dwMessageCount; dwI++)
        {
            if (s_send(client, strSendMsg))
            {
                uint64BytesTransmitted += strSendMsg.size();

                UINT64 uint64BeforeRespone = s_TimeStampGenerator.GetHighResolutionTimeStamp();
                std::string strRecvMsg = s_recv(client);
                if (strRecvMsg.compare(strSendMsg) == 0)
                {
                    uint64WaitForResponse += (s_TimeStampGenerator.GetHighResolutionTimeStamp() - uint64BeforeRespone);
                }
                else
                {
                    return NO_ERROR;
                }
            }
            else
            {
                return NO_ERROR;
            }
        }
        UINT64 uint64ElapsedTime = s_TimeStampGenerator.GetHighResolutionTimeStamp() - uint64StartTime;
        PrintResult(uint64ElapsedTime, uint64WaitForResponse, dwMessageCount, uint64BytesTransmitted, dwMessageSize);
    }
    catch (zmq::error_t& e)
    {
        return (DWORD)e.num();
    }

    return NO_ERROR;
}

I'm running the benchmark locally with a message size of 5 bytes and I get the following results:

WINSOCK

Messages sent:                 1 000 000
Time elapsed (us):            48 019 415
Time elapsed (s):                     48.019 415
Message size (bytes):                  5
Msg/s:                            20 825
Bytes/s:                         104 125
MB/s:                                  0.099
Total   response time (us):   24 537 376
Average response time (us):           24.0

and

ZeroMQ

Messages sent:                 1 000 000
Time elapsed (us):           158 290 708
Time elapsed (s):                    158.290 708    
Message size (bytes):                  5
Msg/s:                             6 317
Bytes/s:                          31 587
MB/s:                                  0.030
Total   response time (us):  125 524 178    
Average response time (us):          125.0

Can anyone explain why the average response time is so much higher when using ZMQ?

The goal is to find a setup where I can send and receive messages asynchronously without the need to reply. If this can be achieved with a different setup than dealer-dealer, please let me know!

rhedin
  • I had a quick look at the 0MQ website and found mostly commercial support and bug-tracking, for which this question seems not suitable (especially since you're still investigating). It is a tricky question, because adding a stack on top of sockets is expected to add delay, but this is quite a lot. The main difference in the test is the use of std::string, which could be making a difference (but not that much). Also, make sure you have the compiler optimization flags on in release mode so that no extra checks are involved in the test. – stefaanv Oct 07 '14 at 12:41
  • I use the same compiler optimization flags for both implementations, so that shouldn't be an issue. The std::string is only created once, then the data is referenced multiple times, so it can't be that. Thanks for the input – rhedin Oct 07 '14 at 12:56
  • There are tests that compare dynamic arrays with vectors and show that vectors are much slower with the same optimization flags, but it turned out that extra checks were still being done on the vectors which are not there in release mode with optimization on. A string is received each time by the server and client and sent back by the server. Even though the string is on the stack, its actual data can be dynamic. – stefaanv Oct 07 '14 at 13:11
  • Mind you that I don't expect the std::string to make much difference compared to the stack handling and network traffic. The flags could. And by the way, did you already look at the traffic with a network monitor like wireshark? – stefaanv Oct 07 '14 at 13:13
  • Ok, so first off I changed the optimization flag to "Full Optimization", which gave me an improvement for both winsock (Average response time (us): 17.0) and zmq (Average response time (us): 111.0). I also use char arrays in the same places for both implementations, but that did not make a difference. I think the increased latency has to do with messages being queued before being sent. Any thoughts on that? – rhedin Oct 08 '14 at 08:24
  • You also mentioned using Wireshark, did that show anything? Queuing messages is possible. You need someone with thorough ZeroMQ knowledge to verify that. The bottom line is that, certainly for simple cases, adding stacks and libraries normally increases delays, even if what you tested seems more than expected. The main questions are: 1. is the extra delay acceptable? 2. is the added benefit of easier traffic handling worth the extra delay? 3. is the delay difference better in real-life situations? – stefaanv Oct 08 '14 at 12:06

3 Answers


You say you want to send and receive messages asynchronously without the need to reply. Yet the tests done so far are all completely synchronous, essentially request-reply, but on a dealer-dealer socket. Something doesn't compute there. Why not run tests that mimic more closely the design you are aiming for?

ZeroMQ gets a fair amount of its "faster than TCP" performance by aggregating queued messages into a single message. Obviously, that mechanism cannot be activated in a purely synchronous design with only one message in flight at a time.

As for why this particular test, of very small messages being sent and received purely synchronously, is relatively slow, I cannot say. Have you done profiling? What I will say, again, is that running this test and basing decisions on it doesn't make sense if it doesn't look anything like your final design.
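
For illustration only (a sketch, assuming the same s_send/s_recv helpers from zhelpers.hpp and the echoing DEALER server from the question), a pipelined client loop would keep a window of messages in flight so the batching layer actually has something to aggregate:

DWORD RunClientZMQPipelinedTest(std::string strAddress, DWORD dwPort, DWORD dwMessageSize)
{
    zmq::context_t ctx(1);
    zmq::socket_t client(ctx, ZMQ_DEALER);
    client.connect(("tcp://" + strAddress + ":" + std::to_string(dwPort)).c_str());

    const DWORD dwMessageCount = 1000000;
    const DWORD dwWindow = 1000;                 // messages kept in flight at once
    std::string strSendMsg(dwMessageSize, 'X');

    DWORD dwSent = 0, dwReceived = 0;
    while (dwReceived < dwMessageCount)
    {
        // Top up the pipeline so ZeroMQ has queued messages it can batch.
        while (dwSent < dwMessageCount && (dwSent - dwReceived) < dwWindow)
        {
            s_send(client, strSendMsg);
            ++dwSent;
        }
        // Drain one echoed reply, then go back to refilling the pipeline.
        s_recv(client);
        ++dwReceived;
    }
    return NO_ERROR;
}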

One thing that does look odd is the try/catch block in the ZeroMQ code. That doesn't look fair because the winsock test wasn't written that way. It is known that there is/was a fair amount of overhead in try/catch.

John Jefferies
  • The reason I perform this particular test is because I have to use a dealer-dealer setup for asynchronous messaging (as far as I know), but I want to figure out the overhead. We have a "working" solution where we send messages asynchronously but we get bad performance there as well, so that's why I'm doing low-level tests. I will try removing the try/catch; I'm currently working on profiling and looking at the traffic with Wireshark. – rhedin Oct 08 '14 at 06:21
  • Removed the try/catch, made no difference – rhedin Oct 08 '14 at 06:43

This is only sort of an answer to a little part of your question, but here goes -

Why do you need dealer/dealer? I assume it's because communication can initiate from either point? You're not tied to dealer/dealer, and in particular it limits you to only two endpoints: if you ever add another endpoint on either side of the communication, say a second client, then each client will only receive half the messages, because dealer is strictly round-robin.

What you need for asynchronous communication is some combination of dealer and/or router sockets. Neither requires a response; the main difference is in how they choose which connected peer to send a message to:

  • Dealer, as said, is strictly round-robin; it will send to each connected peer in series
  • Router is strictly addressed messaging; you have to know the "name" of the peer you want to send to in order to get the message there.

These two socket types work together because dealer sockets (and request sockets; dealer is a "request-type" socket) send their "name" as part of the message, which the router socket can use to send data back. This is a request/reply paradigm, and you'll see that sort of paradigm enforced in all of the examples in the guide, but you can bend that paradigm to what you're looking for; in particular, neither dealer nor router requires a reply.
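
As a minimal sketch of that mechanism (assuming the zhelpers.hpp helpers again; the port is only an example), a router-based echo loop reads the peer's identity frame first and sends it back as the first frame of the reply:

zmq::context_t context(1);
zmq::socket_t router(context, ZMQ_ROUTER);
router.bind("tcp://*:5555");                    // example port

while (true)
{
    std::string strIdentity = s_recv(router);   // first frame: which dealer sent this
    std::string strPayload  = s_recv(router);   // second frame: the message itself
    s_sendmore(router, strIdentity);            // address the reply to that peer
    s_send(router, strPayload);                 // payload frame; no further reply required
}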

Without knowing your full requirements I can't tell you what sort of ZMQ architecture I would choose, but in general I prefer the expandability of router sockets; it's easier to handle appropriate addressing than it is to shoehorn everything into a single peer... you'll see warnings against doing router/router, and I agree with them to the extent that you should understand what you're doing before you try it, but once you understand what you're doing, the implementation isn't that hard.


You also have the option, if it fits your requirements, to set up each end with a pub socket, and each with a sub socket, if there are literally no replies ever. If it's strictly a data feed from source to target, and neither peer needs any feedback on what it sends, then this is probably the best choice, even though it means you're dealing with two sockets per end rather than one.
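
A minimal sketch of that option (both sockets are shown in one process here purely for brevity, and the port and payload are arbitrary) might look like this:

zmq::context_t ctx(1);

zmq::socket_t pub(ctx, ZMQ_PUB);
pub.bind("tcp://*:5556");

zmq::socket_t sub(ctx, ZMQ_SUB);
sub.connect("tcp://localhost:5556");
sub.setsockopt(ZMQ_SUBSCRIBE, "", 0);   // subscribe to everything

Sleep(100);                             // give the SUB time to connect ("slow joiner"),
                                        // otherwise the first publish is silently dropped

s_send(pub, "tick");                    // fire and forget; no reply ever comes back
std::string strUpdate = s_recv(sub);    // the receiving side just consumes the feed

In a real deployment each end would run its own context and sockets in its own process; the point is simply that nothing ever flows back from the SUB side.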


None of this addresses performance directly, but the important thing to understand is that zmq sockets are optimized for particular use cases, and as pointed out in John Jefferies' answer, you're breaking that use case for your dealer socket by making the messaging in your test strictly synchronous. The first place to start is to finalize your ZMQ architecture, and then simulate an actual message flow, in particular not adding in arbitrary waits and synchronicity, which will necessarily change the way throughput looks as you're testing it, pretty much by definition.

Jason
  • Thanks for the comprehensive response! The design I'm working on has to be able to handle asynchronous messaging in both directions. I have a working solution, but the performance is approximately 50% worse than that of my winsock solution (IOCP). That motivated me to go lower into the implementation to find potential bottlenecks, which led me to this unit test. The reason I asked this question was to get some guidelines on what to do and not to do. I will take your advice and look into the router-router design; hopefully it will yield better performance! Thanks again – rhedin Oct 09 '14 at 06:40
  • I wouldn't necessarily assume that router/router would yield better performance, in particular if your communication is guaranteed to be one-to-one then dealer/dealer will be *much* simpler, and if it's guaranteed to be many-to-one, then dealer/router will also be *much* simpler. If it might be many-to-many, then router/router could make sense or a multi-socket design could make more sense, depending on the particulars of your situation. In either event, the important thing you need to remember about dealer and router is that they are optimized for async communication, try not to break that – Jason Oct 09 '14 at 12:28
  • I see your point; one more thing though. I also tried the same application but with REQ-REP (which is more suitable for synchronous transmission, right?) but it did not yield any improvement either. What are your thoughts on that, or what is the appropriate socket type to use for synchronous transmission? Thanks again for the input – rhedin Oct 10 '14 at 07:30
  • I'm not at all surprised that REQ-REP gives the same performance with that test. The REQ/REP socket types are based on DEALER/ROUTER, but with the essential difference that they enforce synchronous messaging. Since your test code already enforces synchronous messaging, it follows that performance will be the same. As a rule, applications will want to do asynchronous messaging for some feature or other; pure REQ/REP is for training/beginners :-). – John Jefferies Oct 10 '14 at 08:10
  • Yes, as @JohnJefferies said, the performance issues are stemming from the messaging pattern, not the socket types. You'll only see what the true limits of ZMQ performance are by using asynchronous messaging. The reason I'm talking about socket types is because certain socket types won't hamstring that. Synchronous messaging is, by definition, more concerned with the *structure* of the conversation (request-reply-request-reply) than the *speed* of it. With a less opinionated protocol (like winsock), you'll maintain a higher level of speed, but it still won't approach what's possible with async. – Jason Oct 10 '14 at 12:43

The OP's issue is a matter of throughput, not latency, and is likely a matter of the pattern being used in the provided examples. However, you are likely to always find that ZeroMQ has higher latency, which I'll explain, although it might not be useful to the OP in this situation.

ZeroMQ works by buffering messages. Imagine (just as a basic illustration) creating a std::string and appending many small strings to it (many thousands, each including a small header to record the size of these small segments) and then sending this larger string at intervals of 100us, 1000us, 10ms or whatever. On the receiving side, the large string is received and each smaller message is removed one at a time based on the size header that is sent along with it. This allows you to potentially send millions of messages in batches (although std::string is obviously a bad choice) without the overhead of sending these millions of very small messages one at a time. As a result you take full advantage of your network resources and increase throughput, and you also get basic FIFO behavior. However, you also create a delay to allow the buffer to fill, which means an increase in latency.
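
To make that concrete, here is a toy sketch of the framing idea (this is not ZeroMQ's actual wire format, just the principle described above): many small messages are packed into one buffer with a small size header each, sent as one block, and peeled off one at a time on the receiving side.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

int main()
{
    std::vector<std::string> small = { "a", "bb", "ccc" };

    // "Sender": append [4-byte length][payload] for each small message.
    std::string batch;
    for (const std::string& msg : small)
    {
        std::uint32_t len = static_cast<std::uint32_t>(msg.size());
        batch.append(reinterpret_cast<const char*>(&len), sizeof(len));
        batch.append(msg);
    }
    // A single send() of `batch` would now carry all three messages.

    // "Receiver": walk the buffer and peel the messages off using the headers.
    std::size_t pos = 0;
    while (pos < batch.size())
    {
        std::uint32_t len = 0;
        std::memcpy(&len, batch.data() + pos, sizeof(len));
        pos += sizeof(len);
        printf("message: %s\n", std::string(batch, pos, len).c_str());
        pos += len;
    }
    return 0;
}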

Imagine (again, just as a basic illustration): if you spend half a second (including string operations, etc.) buffering a million messages, this will result in a larger string of a few megabytes. Modern networks can easily send this larger string in the remaining half second. 1000000us (1 second) / 1000000 messages would be 1us per message, right? Wrong - all messages had a half-second delay to allow the queue to fill, resulting in an increase in latency of up to half a second for all messages. ZeroMQ sends batches much more often than every 500ms, but the increase in latency that this illustrates is still incurred in ZeroMQ, though it's usually along the lines of a few ms.

JSON