I am experiencing an issue with our JGroups cluster: the members lose each other when a few cluster members restart. We have 13 nodes in the cluster, and all of them are in the same subnet. When 4 of the nodes are restarted, the entire cluster breaks down: all of the members stop recognizing each other, and even the existing members that were not restarted no longer find each other.
We start getting SUSPECT messages and "failed to collect all ACKs" warnings:
0;33mWARN [Incoming-1,broadcast,node-12] [GMS] node-12: failed to collect all ACKs (expected=11) for view [node-12|27] after 2000ms, missing 11 ACKs from node-12, node-4, node-6, node-13, node-11, node-2, node-7, node-8, node-9, node-0, node-3
0;33mWARN [INT-2,broadcast,node-12] [FD] node-12: I was suspected by node-5; ignoring the SUSPECT message and sending back a HEARTBEAT_ACK
Please find below the configuration we are using; please let me know if there is any issue with it. We are using JGroups version 3.4.1.Final.
<!-- TCP transport: socket buffers, message bundling, timer pool, and the
     regular / OOB thread pools. Bind address and port come from the
     ${jgroups.bind_addr} / ${jgroups.bind_port} placeholders.
     Every attribute value must be delimited by straight ASCII quotes;
     typographic quotes are not valid XML attribute delimiters. -->
<TCP loopback="true"
     recv_buf_size="${tcp.recv_buf_size:20M}"
     send_buf_size="${tcp.send_buf_size:640K}"
     discard_incompatible_packets="true"
     max_bundle_size="64K"
     max_bundle_timeout="5"
     enable_bundling="true"
     use_send_queues="true"
     sock_conn_timeout="300"
     timer_type="new"
     timer.min_threads="4"
     timer.max_threads="10"
     timer.keep_alive_time="3000"
     timer.queue_max_size="500"
     thread_pool.enabled="true"
     thread_pool.min_threads="1"
     thread_pool.max_threads="10"
     thread_pool.keep_alive_time="5000"
     thread_pool.queue_enabled="true"
     thread_pool.queue_max_size="100000"
     thread_pool.rejection_policy="discard"
     oob_thread_pool.enabled="true"
     oob_thread_pool.min_threads="1"
     oob_thread_pool.max_threads="8"
     oob_thread_pool.keep_alive_time="5000"
     oob_thread_pool.queue_enabled="false"
     oob_thread_pool.queue_max_size="100"
     oob_thread_pool.rejection_policy="discard"
     bind_addr="${jgroups.bind_addr}"
     bind_port="${jgroups.bind_port}" />
<!-- JDBC_PING: every value is injected through a ${...} placeholder.
     NOTE(review): both the direct connection_* settings and
     datasource_jndi_name are supplied; confirm which one takes
     precedence when both are present. -->
<JDBC_PING
    datasource_jndi_name="${datasource.jndi.name}"
    connection_url="${database.url}"
    connection_driver="${database.driver}"
    connection_username="${database.user}"
    connection_password="${database.password}"
    initialize_sql="${jgroups.schema}" />
<!-- Middle of the protocol stack, listed in the original order (element
     order is significant): merge, failure detection and suspicion
     verification, barrier, NAKACK / UNICAST2, stability, and group
     membership.
     NOTE(review): numeric timeouts are presumably milliseconds; confirm
     against the JGroups documentation for this version. -->
<MERGE2 max_interval="30000"
        min_interval="10000"/>
<FD_SOCK/>
<FD max_tries="3"
    timeout="3000"/>
<VERIFY_SUSPECT timeout="1500"/>
<BARRIER/>
<pbcast.NAKACK discard_delivered_msgs="true"
               exponential_backoff="500"
               use_mcast_xmit="false"/>
<UNICAST2/>
<pbcast.STABLE desired_avg_gossip="50000"
               max_bytes="4M"
               stability_delay="1000"/>
<pbcast.GMS join_timeout="3000"
            print_local_addr="true"
            view_bundling="true"/>
<!-- UFC and MFC, configured with identical credit settings.
     Each element must end with a plain self-closing "/>" so the
     fragment stays well-formed. -->
<UFC max_credits="20M" min_threshold="0.4" />
<MFC max_credits="20M" min_threshold="0.4" />
<!-- Last two protocols of the fragment: FRAG2 with a 60K frag_size,
     then pbcast.STATE_TRANSFER with default settings. -->
<FRAG2 frag_size="60K"/>
<pbcast.STATE_TRANSFER/>