We have production environment using jgroups 2.6.13. Cluster is created within a subnet with different machines. Jgroups view shows all members in cluster and works fine for few hours even sometimes couple of days. However then fails sending packets even when cluster shows all members in the cluster view.
What is best way to debug in production without stopping processes. Which utilities in jgroups 2.6.13 would help simulate this.
Below is the stack configuration
<config>
<UDP mcast_addr="${jgroups.udp.mcast_addr:224.0.0.37}" mcast_port="${jgroups.udp.mcast_port:45576}" tos="8" ucast_recv_buf_size="20000000" ucast_send_buf_size="640000" mcast_recv_buf_size="25000000" mcast_send_buf_size="640000" loopback="false" discard_incompatible_packets="true" max_bundle_size="64000" max_bundle_timeout="30" use_incoming_packet_handler="true" ip_ttl="${jgroups.udp.ip_ttl:2}" enable_bundling="true" enable_diagnostics="true" thread_naming_pattern="cl" use_concurrent_stack="true" thread_pool.enabled="true" thread_pool.min_threads="2" thread_pool.max_threads="8" thread_pool.keep_alive_time="5000" thread_pool.queue_enabled="true" thread_pool.queue_max_size="1000" thread_pool.rejection_policy="Run" oob_thread_pool.enabled="true" oob_thread_pool.min_threads="1" oob_thread_pool.max_threads="8" oob_thread_pool.keep_alive_time="5000" oob_thread_pool.queue_enabled="false" oob_thread_pool.queue_max_size="100" oob_thread_pool.rejection_policy="Run"/>
<SNPING timeout="2000" num_initial_members="3" bind_port="34343" subnets="10.30.21.0,10.30.25.0"/>
<MERGE2 max_interval="30000" min_interval="10000"/>
<FD_SOCK/>
<FD timeout="10000" max_tries="5" shun="true"/>
<VERIFY_SUSPECT timeout="1500"/>
<pbcast.NAKACK use_stats_for_retransmission="false" exponential_backoff="150" use_mcast_xmit="true" gc_lag="0" retransmit_timeout="50,300,600,1200" discard_delivered_msgs="true"/>
<UNICAST timeout="300,600,1200"/>
<pbcast.STABLE stability_delay="1000" desired_avg_gossip="50000" max_bytes="1000000"/>
<VIEW_SYNC avg_send_interval="60000"/>
<pbcast.GMS print_local_addr="true" join_timeout="3000" shun="false" view_bundling="true"/>
</config>