0

For better or worse, this is how I have things set up now:

mongo client connects to:
  -> AWS Internal Network (TCP) Load balancer
    -> Traefik (reverse proxy)
      -> mongo (from docker image mongo:4.4.17)

(This setup was a quick hack to get mongodb in place. I've done a lot of reading debugging this issue and learned maybe my setup isn't the best. But I've found nothing to suggest it shouldn't work.)

Issue:

Connections will work until they don't. My 'test' is a simple node script that opens a db connection and execute db.find(), then logs that. It works a few times, then will hang. The mongodb logs suggest the client reaches its timeout and disconnects. Other than the client-side timeout, I see no errors to explain what mongo was doing for 30s.

If I make requests directly to Traefik or to mongo this issue goes away entirely. So it must be related to the request going through the NLB.

Some Logs:

mongodb logs (level 0): Test script (worked, got valid repsonse and exited cleanly):

{"t":{"$date":"2022-11-08T18:46:15.848+00:00"},"s":"I",  "c":"NETWORK",  "id":22943,   "ctx":"listener","msg":"Connection accepted","attr":{"remote":"10.20.21.43:57206","connectionId":13,"connectionCount":3}}
{"t":{"$date":"2022-11-08T18:46:15.853+00:00"},"s":"I",  "c":"NETWORK",  "id":51800,   "ctx":"conn13","msg":"client metadata","attr":{"remote":"10.20.21.43:57206","client":"conn13","doc":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"}}}
{"t":{"$date":"2022-11-08T18:46:15.862+00:00"},"s":"I",  "c":"NETWORK",  "id":22943,   "ctx":"listener","msg":"Connection accepted","attr":{"remote":"10.20.31.176:42852","connectionId":14,"connectionCount":4}}
{"t":{"$date":"2022-11-08T18:46:15.862+00:00"},"s":"I",  "c":"NETWORK",  "id":51800,   "ctx":"conn14","msg":"client metadata","attr":{"remote":"10.20.31.176:42852","client":"conn14","doc":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"}}}
{"t":{"$date":"2022-11-08T18:46:15.878+00:00"},"s":"I",  "c":"ACCESS",   "id":20250,   "ctx":"conn14","msg":"Authentication succeeded","attr":{"mechanism":"SCRAM-SHA-256","speculative":true,"principalName":"user","authenticationDatabase":"tracker","remote":"10.20.31.176:42852","extraInfo":{}}}
{"t":{"$date":"2022-11-08T18:46:15.892+00:00"},"s":"I",  "c":"NETWORK",  "id":22944,   "ctx":"conn13","msg":"Connection ended","attr":{"remote":"10.20.21.43:57206","connectionId":13,"connectionCount":3}}
{"t":{"$date":"2022-11-08T18:46:15.894+00:00"},"s":"I",  "c":"NETWORK",  "id":22944,   "ctx":"conn14","msg":"Connection ended","attr":{"remote":"10.20.31.176:42852","connectionId":14,"connectionCount":2}}

Exact same test script, run a few seconds later that timed out after 30 seconds:

{"t":{"$date":"2022-11-08T18:46:37.143+00:00"},"s":"I",  "c":"NETWORK",  "id":22943,   "ctx":"listener","msg":"Connection accepted","attr":{"remote":"10.20.21.43:49370","connectionId":15,"connectionCount":3}}
{"t":{"$date":"2022-11-08T18:46:37.147+00:00"},"s":"I",  "c":"NETWORK",  "id":51800,   "ctx":"conn15","msg":"client metadata","attr":{"remote":"10.20.21.43:49370","client":"conn15","doc":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"}}}
{"t":{"$date":"2022-11-08T18:46:43.980+00:00"},"s":"I",  "c":"STORAGE",  "id":22430,   "ctx":"WTCheckpointThread","msg":"WiredTiger message","attr":{"message":"[1667933203:980496][1:0x7fc8a881e700], WT_SESSION.checkpoint: [WT_VERB_CHECKPOINT_PROGRESS] saving checkpoint snapshot min: 8, snapshot max: 8 snapshot count: 0, oldest timestamp: (0, 0) , meta checkpoint timestamp: (0, 0) base write gen: 195129"}}
{"t":{"$date":"2022-11-08T18:46:47.658+00:00"},"s":"I",  "c":"NETWORK",  "id":22943,   "ctx":"listener","msg":"Connection accepted","attr":{"remote":"10.20.31.176:42300","connectionId":16,"connectionCount":4}}
{"t":{"$date":"2022-11-08T18:46:47.658+00:00"},"s":"I",  "c":"NETWORK",  "id":51800,   "ctx":"conn16","msg":"client metadata","attr":{"remote":"10.20.31.176:42300","client":"conn16","doc":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"}}}
{"t":{"$date":"2022-11-08T18:47:07.163+00:00"},"s":"I",  "c":"-",        "id":20883,   "ctx":"conn15","msg":"Interrupted operation as its client disconnected","attr":{"opId":2510}}
{"t":{"$date":"2022-11-08T18:47:07.163+00:00"},"s":"I",  "c":"NETWORK",  "id":22944,   "ctx":"conn15","msg":"Connection ended","attr":{"remote":"10.20.21.43:49370","connectionId":15,"connectionCount":3}}
{"t":{"$date":"2022-11-08T18:47:07.163+00:00"},"s":"I",  "c":"NETWORK",  "id":22944,   "ctx":"conn16","msg":"Connection ended","attr":{"remote":"10.20.31.176:42300","connectionId":16,"connectionCount":2}}

Same test script - Log Level 3: Unsuccessful (timeout at 30s)

{"t":{"$date":"2022-11-08T18:56:29.522+00:00"},"s":"I",  "c":"NETWORK",  "id":22943,   "ctx":"listener","msg":"Connection accepted","attr":{"remote":"10.20.31.176:57422","connectionId":2,"connectionCount":2}}
{"t":{"$date":"2022-11-08T18:56:29.523+00:00"},"s":"D3", "c":"EXECUTOR", "id":22983,   "ctx":"listener","msg":"Starting new executor thread in passthrough mode"}
{"t":{"$date":"2022-11-08T18:56:29.526+00:00"},"s":"D2", "c":"COMMAND",  "id":21965,   "ctx":"conn2","msg":"About to run the command","attr":{"db":"admin","commandArgs":{"ismaster":true,"client":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"},"compression":[],"helloOk":true,"$db":"admin"}}}
{"t":{"$date":"2022-11-08T18:56:29.526+00:00"},"s":"I",  "c":"NETWORK",  "id":51800,   "ctx":"conn2","msg":"client metadata","attr":{"remote":"10.20.31.176:57422","client":"conn2","doc":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"}}}
{"t":{"$date":"2022-11-08T18:56:29.526+00:00"},"s":"D3", "c":"NETWORK",  "id":22934,   "ctx":"conn2","msg":"Starting server-side compression negotiation"}
{"t":{"$date":"2022-11-08T18:56:29.527+00:00"},"s":"D3", "c":"NETWORK",  "id":22936,   "ctx":"conn2","msg":"No compressors provided"}
{"t":{"$date":"2022-11-08T18:56:29.527+00:00"},"s":"I",  "c":"COMMAND",  "id":51803,   "ctx":"conn2","msg":"Slow query","attr":{"type":"command","ns":"admin.$cmd","command":{"ismaster":true,"client":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"},"compression":[],"helloOk":true,"$db":"admin"},"numYields":0,"reslen":329,"locks":{},"protocol":"op_query","durationMillis":0}}
{"t":{"$date":"2022-11-08T18:56:29.527+00:00"},"s":"D1", "c":"QUERY",    "id":22790,   "ctx":"conn2","msg":"Received interrupt request for unknown op","attr":{"opId":317}}
{"t":{"$date":"2022-11-08T18:56:29.527+00:00"},"s":"D2", "c":"QUERY",    "id":22783,   "ctx":"conn2","msg":"Ops known during interrupt","attr":{"ops":[]}}
{"t":{"$date":"2022-11-08T18:56:30.034+00:00"},"s":"D2", "c":"COMMAND",  "id":21965,   "ctx":"conn2","msg":"About to run the command","attr":{"db":"admin","commandArgs":{"hello":true,"helloOk":true,"maxAwaitTimeMS":10000,"topologyVersion":{"processId":{"$oid":"636aa6433c5de5c55d30c876"},"counter":0},"$db":"admin"}}}
{"t":{"$date":"2022-11-08T18:56:30.034+00:00"},"s":"D3", "c":"FTDC",     "id":23904,   "ctx":"conn2","msg":"Using maxAwaitTimeMS for awaitable isMaster protocol."}
{"t":{"$date":"2022-11-08T18:56:36.026+00:00"},"s":"D2", "c":"WRITE",    "id":21976,   "ctx":"abortExpiredTransactions","msg":"Scanning sessions","attr":{"sessionCount":0}}
{"t":{"$date":"2022-11-08T18:56:40.037+00:00"},"s":"I",  "c":"NETWORK",  "id":22943,   "ctx":"listener","msg":"Connection accepted","attr":{"remote":"10.20.31.176:57284","connectionId":3,"connectionCount":3}}
{"t":{"$date":"2022-11-08T18:56:40.037+00:00"},"s":"D3", "c":"EXECUTOR", "id":22983,   "ctx":"listener","msg":"Starting new executor thread in passthrough mode"}
{"t":{"$date":"2022-11-08T18:56:40.038+00:00"},"s":"D2", "c":"COMMAND",  "id":21965,   "ctx":"conn3","msg":"About to run the command","attr":{"db":"admin","commandArgs":{"ismaster":true,"client":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"},"compression":[],"helloOk":true,"$db":"admin"}}}
{"t":{"$date":"2022-11-08T18:56:40.038+00:00"},"s":"I",  "c":"NETWORK",  "id":51800,   "ctx":"conn3","msg":"client metadata","attr":{"remote":"10.20.31.176:57284","client":"conn3","doc":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"}}}
{"t":{"$date":"2022-11-08T18:56:40.038+00:00"},"s":"D3", "c":"NETWORK",  "id":22934,   "ctx":"conn3","msg":"Starting server-side compression negotiation"}
{"t":{"$date":"2022-11-08T18:56:40.038+00:00"},"s":"D3", "c":"NETWORK",  "id":22936,   "ctx":"conn3","msg":"No compressors provided"}
{"t":{"$date":"2022-11-08T18:56:40.038+00:00"},"s":"I",  "c":"COMMAND",  "id":51803,   "ctx":"conn3","msg":"Slow query","attr":{"type":"command","ns":"admin.$cmd","command":{"ismaster":true,"client":{"driver":{"name":"nodejs","version":"3.7.3"},"os":{"type":"Linux","name":"linux","architecture":"x64","version":"5.10.147-133.644.amzn2.x86_64"},"platform":"'Node.js v19.0.0, LE (unified)"},"compression":[],"helloOk":true,"$db":"admin"},"numYields":0,"reslen":329,"locks":{},"protocol":"op_query","durationMillis":0}}
{"t":{"$date":"2022-11-08T18:56:40.038+00:00"},"s":"D1", "c":"QUERY",    "id":22790,   "ctx":"conn3","msg":"Received interrupt request for unknown op","attr":{"opId":452}}
{"t":{"$date":"2022-11-08T18:56:40.038+00:00"},"s":"D2", "c":"QUERY",    "id":22783,   "ctx":"conn3","msg":"Ops known during interrupt","attr":{"ops":[]}}
{"t":{"$date":"2022-11-08T18:56:40.044+00:00"},"s":"D3", "c":"NETWORK",  "id":22934,   "ctx":"conn2","msg":"Starting server-side compression negotiation"}
{"t":{"$date":"2022-11-08T18:56:40.044+00:00"},"s":"D3", "c":"NETWORK",  "id":22935,   "ctx":"conn2","msg":"Compression negotiation not requested by client"}
{"t":{"$date":"2022-11-08T18:56:40.044+00:00"},"s":"D3", "c":"FTDC",     "id":23905,   "ctx":"conn2","msg":"Using exhaust for isMaster or hello protocol"}
{"t":{"$date":"2022-11-08T18:56:40.044+00:00"},"s":"I",  "c":"COMMAND",  "id":51803,   "ctx":"conn2","msg":"Slow query","attr":{"type":"command","ns":"admin.$cmd","command":{"hello":true,"helloOk":true,"maxAwaitTimeMS":10000,"topologyVersion":{"processId":{"$oid":"636aa6433c5de5c55d30c876"},"counter":0},"$db":"admin"},"numYields":0,"reslen":323,"locks":{},"protocol":"op_msg","durationMillis":0}}
{"t":{"$date":"2022-11-08T18:56:40.044+00:00"},"s":"D1", "c":"QUERY",    "id":22790,   "ctx":"conn2","msg":"Received interrupt request for unknown op","attr":{"opId":326}}
{"t":{"$date":"2022-11-08T18:56:40.044+00:00"},"s":"D2", "c":"QUERY",    "id":22783,   "ctx":"conn2","msg":"Ops known during interrupt","attr":{"ops":[]}}
{"t":{"$date":"2022-11-08T18:56:40.045+00:00"},"s":"D2", "c":"COMMAND",  "id":21965,   "ctx":"conn2","msg":"About to run the command","attr":{"db":"admin","commandArgs":{"hello":true,"helloOk":true,"maxAwaitTimeMS":10000,"topologyVersion":{"processId":{"$oid":"636aa6433c5de5c55d30c876"},"counter":0},"$db":"admin"}}}
{"t":{"$date":"2022-11-08T18:56:40.045+00:00"},"s":"D3", "c":"FTDC",     "id":23904,   "ctx":"conn2","msg":"Using maxAwaitTimeMS for awaitable isMaster protocol."}
{"t":{"$date":"2022-11-08T18:56:50.043+00:00"},"s":"D2", "c":"COMMAND",  "id":21965,   "ctx":"conn3","msg":"About to run the command","attr":{"db":"admin","commandArgs":{"ismaster":1,"$db":"admin"}}}
{"t":{"$date":"2022-11-08T18:56:50.043+00:00"},"s":"D3", "c":"NETWORK",  "id":22934,   "ctx":"conn3","msg":"Starting server-side compression negotiation"}
{"t":{"$date":"2022-11-08T18:56:50.043+00:00"},"s":"D3", "c":"NETWORK",  "id":22935,   "ctx":"conn3","msg":"Compression negotiation not requested by client"}
{"t":{"$date":"2022-11-08T18:56:50.043+00:00"},"s":"I",  "c":"COMMAND",  "id":51803,   "ctx":"conn3","msg":"Slow query","attr":{"type":"command","ns":"admin.$cmd","command":{"ismaster":1,"$db":"admin"},"numYields":0,"reslen":304,"locks":{},"protocol":"op_msg","durationMillis":0}}
{"t":{"$date":"2022-11-08T18:56:50.043+00:00"},"s":"D1", "c":"QUERY",    "id":22790,   "ctx":"conn3","msg":"Received interrupt request for unknown op","attr":{"opId":579}}
{"t":{"$date":"2022-11-08T18:56:50.043+00:00"},"s":"D2", "c":"QUERY",    "id":22783,   "ctx":"conn3","msg":"Ops known during interrupt","attr":{"ops":[]}}
{"t":{"$date":"2022-11-08T18:56:50.055+00:00"},"s":"D3", "c":"NETWORK",  "id":22934,   "ctx":"conn2","msg":"Starting server-side compression negotiation"}
{"t":{"$date":"2022-11-08T18:56:50.055+00:00"},"s":"D3", "c":"NETWORK",  "id":22935,   "ctx":"conn2","msg":"Compression negotiation not requested by client"}
{"t":{"$date":"2022-11-08T18:56:50.055+00:00"},"s":"D3", "c":"FTDC",     "id":23905,   "ctx":"conn2","msg":"Using exhaust for isMaster or hello protocol"}
{"t":{"$date":"2022-11-08T18:56:50.055+00:00"},"s":"I",  "c":"COMMAND",  "id":51803,   "ctx":"conn2","msg":"Slow query","attr":{"type":"command","ns":"admin.$cmd","command":{"hello":true,"helloOk":true,"maxAwaitTimeMS":10000,"topologyVersion":{"processId":{"$oid":"636aa6433c5de5c55d30c876"},"counter":0},"$db":"admin"},"numYields":0,"reslen":323,"locks":{},"protocol":"op_msg","durationMillis":0}}
{"t":{"$date":"2022-11-08T18:56:50.055+00:00"},"s":"D1", "c":"QUERY",    "id":22790,   "ctx":"conn2","msg":"Received interrupt request for unknown op","attr":{"opId":453}}
{"t":{"$date":"2022-11-08T18:56:50.055+00:00"},"s":"D2", "c":"QUERY",    "id":22783,   "ctx":"conn2","msg":"Ops known during interrupt","attr":{"ops":[]}}
{"t":{"$date":"2022-11-08T18:56:50.055+00:00"},"s":"D2", "c":"COMMAND",  "id":21965,   "ctx":"conn2","msg":"About to run the command","attr":{"db":"admin","commandArgs":{"hello":true,"helloOk":true,"maxAwaitTimeMS":10000,"topologyVersion":{"processId":{"$oid":"636aa6433c5de5c55d30c876"},"counter":0},"$db":"admin"}}}
{"t":{"$date":"2022-11-08T18:56:50.055+00:00"},"s":"D3", "c":"FTDC",     "id":23904,   "ctx":"conn2","msg":"Using maxAwaitTimeMS for awaitable isMaster protocol."}
{"t":{"$date":"2022-11-08T18:56:54.149+00:00"},"s":"D1", "c":"EXECUTOR", "id":23106,   "ctx":"AuthorizationManager-0","msg":"Reaping this thread","attr":{"nextThreadRetirementDate":{"$date":"2022-11-08T18:57:24.149Z"}}}
{"t":{"$date":"2022-11-08T18:56:54.149+00:00"},"s":"D1", "c":"EXECUTOR", "id":23105,   "ctx":"AuthorizationManager-0","msg":"Shutting down thread","attr":{"threadName":"AuthorizationManager-0","poolName":"AuthorizationManager"}}
{"t":{"$date":"2022-11-08T18:56:59.543+00:00"},"s":"I",  "c":"-",        "id":20883,   "ctx":"conn2","msg":"Interrupted operation as its client disconnected","attr":{"opId":580}}
{"t":{"$date":"2022-11-08T18:56:59.543+00:00"},"s":"D3", "c":"-",        "id":4892201, "ctx":"conn2","msg":"Internal assertion","attr":{"error":{"code":279,"codeName":"ClientDisconnect","errmsg":"operation was interrupted"},"location":"{fileName:\"src/mongo/util/interruptible.h\", line:398, functionName:\"operator()\"}"}}
{"t":{"$date":"2022-11-08T18:56:59.543+00:00"},"s":"D2", "c":"NETWORK",  "id":22986,   "ctx":"conn3","msg":"Session from remote encountered a network error during SourceMessage","attr":{"remote":"10.20.31.176:57284","error":{"code":6,"codeName":"HostUnreachable","errmsg":"Connection closed by peer"}}}
{"t":{"$date":"2022-11-08T18:56:59.543+00:00"},"s":"I",  "c":"NETWORK",  "id":22944,   "ctx":"conn3","msg":"Connection ended","attr":{"remote":"10.20.31.176:57284","connectionId":3,"connectionCount":2}}
{"t":{"$date":"2022-11-08T18:56:59.543+00:00"},"s":"D1", "c":"COMMAND",  "id":21962,   "ctx":"conn2","msg":"Assertion while executing command","attr":{"command":"hello","db":"admin","commandArgs":{"hello":true,"helloOk":true,"maxAwaitTimeMS":10000,"topologyVersion":{"processId":{"$oid":"636aa6433c5de5c55d30c876"},"counter":0},"$db":"admin"},"error":"ClientDisconnect: operation was interrupted"}}
{"t":{"$date":"2022-11-08T18:56:59.543+00:00"},"s":"I",  "c":"COMMAND",  "id":51803,   "ctx":"conn2","msg":"Slow query","attr":{"type":"command","ns":"admin.$cmd","command":{"hello":true,"helloOk":true,"maxAwaitTimeMS":10000,"topologyVersion":{"processId":{"$oid":"636aa6433c5de5c55d30c876"},"counter":0},"$db":"admin"},"numYields":0,"ok":0,"errMsg":"operation was interrupted","errName":"ClientDisconnect","errCode":279,"reslen":117,"locks":{},"protocol":"op_msg","durationMillis":0}}

Test Script:

const mongodb = require("mongodb");

let url;
//url = 'mongodb://user:pass@10.20.11.28:3204/tracker';
//url = 'mongodb://user:pass@10.20.21.43:3204/tracker';
//url = 'mongodb://user:pass@10.20.31.176:3204/tracker';
url = 'mongodb:/user:pass@ninja-internal-network-1234567.elb.us-west-2.amazonaws.com:3204/tracker';

console.log(url);
dbclient = new mongodb.MongoClient(url, {auto_reconnect:true, ssl: false, poolSize: 16, keepAlive: true, retryWrites:false, useUnifiedTopology: true})
console.log(dbclient);

dbclient.connect(function(err) {
  if (err) {
    console.error("connection error:", err);
  }
  db = dbclient.db('tracker')
  db.collection('visitors', function(a,b){
    b.find({aid:7}).toArray(function(err, items) {
      console.log(items.length);
      console.log(items[0]);
      dbclient.close();
    });
  });
});
jsharpe
  • 2,546
  • 3
  • 26
  • 42
  • For the curious port `3204` is pretty arbitrary and it does map through to mongodb's 27017. – jsharpe Nov 08 '22 at 19:41
  • having the same issue, did you mange to get it working? – Kay May 22 '23 at 10:19
  • you know - I didn't. I just hardcoded our one ip/port (super small deployment right now) and today it bit me in the ass. guess it should get figured out – jsharpe May 23 '23 at 19:39

0 Answers0