1

I've been banging my head on this one for some time now and hope one of you may be able to point me in the right direction.

The issue is that whenever requests are passed to the broker FAST, not all of them make it to the (single) worker.

If I introduce some delay between the requests (see sleep(1) in the client code), everything works just fine, but obviously that's not acceptable.

For reproduction of an issue I am experiencing, I created this simplified version of my code:

Client:

#include <stdio.h>
#include <stdlib.h>

#include "czmq.h"
#include "majordomo_library.h"


/* Release the heap block referenced by x and reset x to NULL.
 * free(NULL) is a no-op, so no NULL guard is needed. The do/while(0)
 * wrapper makes the expansion a single statement, so `SAFEFREE(p);` is
 * safe inside an unbraced if/else. Note: x is evaluated twice. */
#define SAFEFREE(x)                                                            \
  do {                                                                         \
    free(x);                                                                   \
    (x) = NULL;                                                                \
  } while (0)

/*
 * MDP client used to reproduce the problem: opens <loops> client sessions
 * against the broker, sends one request on each session without waiting,
 * then collects one reply per session.
 *
 * Returns 0 once every session has been polled for its reply; exits with
 * a non-zero status if a client session cannot be created.
 */
int main(void) {

  char service[] = "bb-test";
  char endpoint[] = "ipc:///tmp/bbtest.ipc";

  mdp_client_t **clients = NULL;
  zmsg_t *request = NULL;

  char request_str[128];
  char *cmd = NULL, *reply = NULL;
  int i = 0, loops = 10;

  /* Create array of ptr for <loops> clients */
  clients = calloc(loops, sizeof *clients);
  assert(clients != NULL);

  /* create <loops> client sessions and send a request on each */
  for (i = 0; i < loops; i++) {
    /* create a new MDP client session */
    clients[i] = mdp_client_new(endpoint);
    if (!clients[i]) {
      /* There is no session object here, so there is no reason string
       * that could be queried from it. */
      fprintf(stderr, "Error: could not create client session %d\r\n", i);
      exit(-1);
    }
    /* create new request message */
    request = zmsg_new();
    assert(request != NULL);
    /* snprintf always NUL-terminates, so no prior memset is needed */
    snprintf(request_str, sizeof request_str, "Request %d", i);
    zmsg_addstr(request, request_str);
    /* send the message as an MDP client request */
    if (mdp_client_request(clients[i], service, &request) == 0) {
      fprintf(stdout, "%s sent\r\n", request_str);
    } else {
      fprintf(stderr, "%s NOT SENT (%s)\r\n", request_str,
              mdp_client_reason(clients[i]));
    }

    /* releases the message only if the request call left it with us */
    zmsg_destroy(&request);

    /* If I add sleep time here, so the worker can process the
     * request and send the reply back, it works just fine.
     * As soon as I drop all requests on the broker at once, the worker
     * gets stuck at zsock_recv() after processing only one, or a
     * subset, of the requests.
     * */
    //sleep(1);
  }

  /*  collect the replies       */
  for (i = 0; i < loops; i++) {

    /* get the session's message pipe to read the reply from */
    zsock_t *client_sock = mdp_client_msgpipe(clients[i]);
    assert(client_sock);
    /* set receive timeout in milliseconds (10 s) */
    zsock_set_rcvtimeo(client_sock, 10000);
    /* get the message as "ss" (string and string) into cmd and reply */
    if (zsock_recv(client_sock, "ss", &cmd, &reply) == 0) {
      fprintf(stdout, "Received: %s: %s\r\n", cmd, reply);
    } else {
      fprintf(stderr, "Failed to receive reply %s\r\n",
              mdp_client_reason(clients[i]));
    }

    /* client_sock is a CZMQ zsock_t obtained from the session, not a raw
     * libzmq socket, so it must not be handed to zmq_close(); destroying
     * the session is the only teardown performed here. */
    mdp_client_destroy(&clients[i]);

    SAFEFREE(cmd);
    SAFEFREE(reply);
  }

  free(clients);

  return 0;
}

Here's how I start the default mdp_broker:

#include <stdio.h>
#include <stdlib.h>

#include "czmq.h"
#include "mdp_broker.h"

/*
 * Starts the stock mdp_broker actor, switches it to verbose mode, binds it
 * to the IPC endpoint used by the client and worker, and keeps it running
 * until a character is read from stdin.
 */
int main(void) {

  zactor_t *broker = zactor_new(mdp_broker, "test_MDP-broker");
  assert(broker != NULL);
  zstr_send(broker, "VERBOSE");
  zstr_sendx(broker, "BIND", "ipc:///tmp/bbtest.ipc", NULL);

  /* block here so the broker actor keeps running until the user hits a key */
  getchar();

  zactor_destroy(&broker);

  return 0;
}

and finally, here's the worker:

#include <stdio.h>
#include <stdlib.h>

#include "czmq.h"
#include "mdp_worker.h"

/* Release the heap block referenced by x and reset x to NULL.
 * free(NULL) is a no-op, so no NULL guard is needed. The expansion is a
 * plain brace block so that it is valid with or without a trailing
 * semicolon at the call site. Note: x is evaluated twice. */
#define SAFEFREE(x)                                                            \
  {                                                                            \
    free(x);                                                                   \
    (x) = NULL;                                                                \
  }

/*
 * MDP worker used to reproduce the problem: registers the "bb-test"
 * service with the broker, then answers every request it receives with
 * "<request> - reply" sent as a FINAL message.
 *
 * The loop ends when receiving from the worker's message pipe fails, so
 * the session is destroyed before the process returns.
 */
int main(void) {

  char service[] = "bb-test";
  char endpoint[] = "ipc:///tmp/bbtest.ipc";

  mdp_worker_t *worker_session = NULL;
  zsock_t *worker_sock = NULL;
  zframe_t *address = NULL;

  char *cmd = NULL;
  char *request = NULL;
  char *reply = NULL;
  int rc = 0;

  /* create new worker and register the service with the broker */
  worker_session = mdp_worker_new(endpoint, service);
  assert(worker_session != NULL);
  mdp_worker_set_verbose(worker_session);

  worker_sock = mdp_worker_msgpipe(worker_session);
  assert(worker_sock != NULL);

  while (1) {

    /* picture "sfs": command string, client address frame, request string */
    rc = zsock_recv(worker_sock, "sfs", &cmd, &address, &request);
    if (rc != 0) {
      fprintf(stderr, "Failed to receive message: %s\r\n",
              mdp_worker_reason(worker_session));
      /* Retrying here would spin forever once the pipe has been
       * interrupted, so drop whatever was received and shut down. */
      SAFEFREE(cmd);
      SAFEFREE(request);
      break;
    }

    fprintf(stdout, "Got message \"%s\"\r\n", request);

    /* room for the request text, the " - reply" suffix and the NUL */
    size_t reply_size = strlen(request) + sizeof " - reply";
    reply = calloc(reply_size, sizeof(char));
    assert(reply != NULL);
    snprintf(reply, reply_size, "%s - reply", request);

    /*  Create reply message */
    zmsg_t *msg_response = zmsg_new();
    assert(msg_response != NULL);

    rc = zmsg_addstr(msg_response, reply);
    assert(rc == 0);

    /* Send */
    rc = mdp_worker_send_final(worker_session, &address, &msg_response);
    fprintf(rc == 0 ? stdout : stderr, "Sending reply (\"%s\") was %s\r\n\r\n",
            reply, rc == 0 ? "successful" : "UNSUCCESSFUL");

    /* releases the message only if the send call left it with us */
    zmsg_destroy(&msg_response);
    SAFEFREE(cmd);
    SAFEFREE(request);
    SAFEFREE(reply);
  }

  mdp_worker_destroy(&worker_session);
  return 0;
}

The results with sleep(1)

Client:

D: 20-04-10 20:59:35 connected to ipc:///tmp/bbtest.ipc
Request 0 sent
D: 20-04-10 20:59:36 connected to ipc:///tmp/bbtest.ipc
Request 1 sent
D: 20-04-10 20:59:37 connected to ipc:///tmp/bbtest.ipc
Request 2 sent
D: 20-04-10 20:59:38 connected to ipc:///tmp/bbtest.ipc
Request 3 sent
D: 20-04-10 20:59:39 connected to ipc:///tmp/bbtest.ipc
Request 4 sent
D: 20-04-10 20:59:40 connected to ipc:///tmp/bbtest.ipc
Request 5 sent
D: 20-04-10 20:59:41 connected to ipc:///tmp/bbtest.ipc
Request 6 sent
D: 20-04-10 20:59:42 connected to ipc:///tmp/bbtest.ipc
Request 7 sent
D: 20-04-10 20:59:43 connected to ipc:///tmp/bbtest.ipc
Request 8 sent
D: 20-04-10 20:59:44 connected to ipc:///tmp/bbtest.ipc
Request 9 sent
Received: FINAL: Request 0 - reply
Received: FINAL: Request 1 - reply
Received: FINAL: Request 2 - reply
Received: FINAL: Request 3 - reply
Received: FINAL: Request 4 - reply
Received: FINAL: Request 5 - reply
Received: FINAL: Request 6 - reply
Received: FINAL: Request 7 - reply
Received: FINAL: Request 8 - reply
Received: FINAL: Request 9 - reply

Process finished with exit code 0

Worker:

D: 20-04-10 20:59:32 connected to ipc:///tmp/bbtest.ipc
Got message "Request 0"
Sending reply ("Request 0 - reply") was successful

Got message "Request 1"
Sending reply ("Request 1 - reply") was successful

Got message "Request 2"
Sending reply ("Request 2 - reply") was successful

Got message "Request 3"
Sending reply ("Request 3 - reply") was successful

Got message "Request 4"
Sending reply ("Request 4 - reply") was successful

Got message "Request 5"
Sending reply ("Request 5 - reply") was successful

Got message "Request 6"
Sending reply ("Request 6 - reply") was successful

Got message "Request 7"
Sending reply ("Request 7 - reply") was successful

Got message "Request 8"
Sending reply ("Request 8 - reply") was successful

Got message "Request 9"
Sending reply ("Request 9 - reply") was successful

and without delay

Client:

D: 20-04-10 21:03:45 connected to ipc:///tmp/bbtest.ipc
Request 0 sent
D: 20-04-10 21:03:45 connected to ipc:///tmp/bbtest.ipc
Request 1 sent
D: 20-04-10 21:03:45 connected to ipc:///tmp/bbtest.ipc
Request 2 sent
D: 20-04-10 21:03:45 connected to ipc:///tmp/bbtest.ipc
Request 3 sent
D: 20-04-10 21:03:45 connected to ipc:///tmp/bbtest.ipc
Request 4 sent
D: 20-04-10 21:03:45 connected to ipc:///tmp/bbtest.ipc
Request 5 sent
D: 20-04-10 21:03:45 connected to ipc:///tmp/bbtest.ipc
Request 6 sent
D: 20-04-10 21:03:45 connected to ipc:///tmp/bbtest.ipc
Request 7 sent
D: 20-04-10 21:03:45 connected to ipc:///tmp/bbtest.ipc
Request 8 sent
D: 20-04-10 21:03:45 connected to ipc:///tmp/bbtest.ipc
Request 9 sent
Received: FINAL: Request 0 - reply
Received: FINAL: Request 1 - reply
Received: FINAL: Request 2 - reply
Received: FINAL: Request 3 - reply

Worker:

D: 20-04-10 21:03:40 connected to ipc:///tmp/bbtest.ipc
Got message "Request 0"
Sending reply ("Request 0 - reply") was successful

Got message "Request 1"
Sending reply ("Request 1 - reply") was successful

Got message "Request 2"
Sending reply ("Request 2 - reply") was successful

Got message "Request 3"
Sending reply ("Request 3 - reply") was successful

the worker blocks on

rc = zsock_recv(worker_sock, "sfs", &cmd, &address, &request);

The broker verbose output tells me all requests make it to the broker, but (in this case) only 3 WORKER_FINAL messages exist. The number of requests successfully handled varies, actually, it's not always just 3 but with growing number of requests, it breaks AT SOME POINT.

Any ideas? anyone?? pretty-please???

1 Answers1

0

I identified the issue. It is related to the mdp_broker. As of commit 603a304fb674733bd00c0314761242da013a327f from Sat Feb 29 10:20:52 2020, the broker does not dispatch queued requests unless there is either a "worker_ready" or a "client_request" event. So, if there are requests added to the queue while no worker is available, the total number of requests received and requests dispatched will diverge and some requests will remain in the queue unprocessed until they time out.

MDP-Broker needs to also check/dispatch any requests as soon as/as long as there are requests in the queue and a worker waiting - regardless of an incoming handle_request event.

So I added a call to s_service_dispatch() at the end of the handle_worker_final() function in mdp_broker.c. It causes the broker to check for pending requests and dispatch them every time a worker is re-added to the list of waiting workers after it has processed a previous request.

handle_worker_final() in mdp_broker.c should therefore look like this:

//  Relay a worker's FINAL reply to the client it is addressed to, then put
//  the worker back on the waiting lists and run the dispatcher for its
//  service so that an already queued request is handed out right away.
static void
handle_worker_final (client_t *self)
{
    mdp_msg_t *worker_msg = self->message;

    //  Build the CLIENT FINAL message from the worker's message: routing id
    //  (taken from the address field), message id, service name and body.
    mdp_msg_t *reply = mdp_msg_new ();
    mdp_msg_set_routing_id (reply, mdp_msg_address (worker_msg));
    mdp_msg_set_id (reply, MDP_MSG_CLIENT_FINAL);
    mdp_msg_set_service (reply, self->service_name);
    zmsg_t *payload = mdp_msg_get_body (worker_msg);
    mdp_msg_set_body (reply, &payload);
    mdp_msg_send (reply, self->server->router);

    //  Workers are keyed by the hex form of their routing id.
    char *worker_key = zframe_strhex (mdp_msg_routing_id (worker_msg));
    worker_t *idle_worker =
        (worker_t *) zhash_lookup (self->server->workers, worker_key);
    assert (idle_worker);

    //  The worker is idle again: append it to the broker-wide waiting list
    //  and to the waiting list of the service it belongs to.
    zlist_append (self->server->waiting, idle_worker);
    service_t *owning_service = (service_t *) zhash_lookup (
        self->server->services, idle_worker->service->name);
    assert (owning_service);
    zlist_append (owning_service->waiting, idle_worker);

    zstr_free (&worker_key);
    mdp_msg_destroy (&reply);

    //  A worker just became available, so dispatch for this service now
    //  instead of waiting for the next client request or worker-ready event.
    s_service_dispatch (owning_service);
}

The fix has been submitted to the zmq/majordomo team. I will update this post again after it has been committed.

Joerg