1

I have a 3-node Infinispan cluster with numOwners=2, and I'm running into issues with cluster views when one of the nodes gets disconnected from the network and then rejoins. The logs are as follows:

(Incoming-1,BrokerPE-0-28575) ISPN000094: Received new cluster view for channel ISPN: [BrokerPE-0-28575|2] (3) [BrokerPE-0-28575, SEM03VVM-201-59385, SEM03VVM-202-33714]

ISPN000094: Received new cluster view for channel ISPN: [BrokerPE-0-28575|3] (2) [BrokerPE-0-28575, SEM03VVM-202-33714] --> one node disconnected

ISPN000093: Received new, MERGED cluster view for channel ISPN: MergeView::[BrokerPE-0-28575|4] (2) [BrokerPE-0-28575, SEM03VVM-201-59385], 2 subgroups: [BrokerPE-0-28575|3] (2) [BrokerPE-0-28575, SEM03VVM-202-33714], [BrokerPE-0-28575|2] (3) [BrokerPE-0-28575, SEM03VVM-201-59385, SEM03VVM-202-33714] --> incorrect merge

This is my JGroups config:

<config xmlns="urn:org:jgroups"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="urn:org:jgroups http://www.jgroups.org/schema/jgroups-3.6.xsd">
   <!-- JGroups protocol stack (schema jgroups-3.6.xsd), quoted as the configuration in use
        when the incorrect merged cluster view was observed. Elements are listed from the
        transport (TCP) upwards. -->

   <!-- Transport: TCP. bind_addr and bind_port are read from the jgroups.tcp.address and
        jgroups.tcp.port properties; the value after ':' is presumably the fallback used when
        the property is unset (TODO confirm). The element also sizes the timer and three
        thread pools: thread_pool, oob_thread_pool and internal_thread_pool.
        NOTE(review): internal_thread_pool.min_threads is 1 here. The author's later update
        reports that the view problem went away once this property was set to more than 1. -->
   <TCP
          bind_addr="${jgroups.tcp.address:127.0.0.1}"
        bind_port="${jgroups.tcp.port:7800}"
        loopback="true"
        port_range="30"
        recv_buf_size="20m"
        send_buf_size="640k"
        max_bundle_size="31k"
        use_send_queues="true"
        enable_diagnostics="false"
        sock_conn_timeout="300"
        bundler_type="old"

        thread_naming_pattern="pl"

        timer_type="new3"
         timer.min_threads="4"
         timer.max_threads="10"
         timer.keep_alive_time="3000"
         timer.queue_max_size="500"


        thread_pool.enabled="true"
        thread_pool.min_threads="2"
        thread_pool.max_threads="30"
        thread_pool.keep_alive_time="60000"
        thread_pool.queue_enabled="true"
        thread_pool.queue_max_size="100"
        thread_pool.rejection_policy="Discard"

        oob_thread_pool.enabled="true"
        oob_thread_pool.min_threads="2"
        oob_thread_pool.max_threads="30"
        oob_thread_pool.keep_alive_time="60000"
        oob_thread_pool.queue_enabled="false"
        oob_thread_pool.queue_max_size="100"
        oob_thread_pool.rejection_policy="Discard"

        internal_thread_pool.enabled="true"
        internal_thread_pool.min_threads="1"
        internal_thread_pool.max_threads="10"
        internal_thread_pool.keep_alive_time="60000"
        internal_thread_pool.queue_enabled="true"
        internal_thread_pool.queue_max_size="100"
        internal_thread_pool.rejection_policy="Discard"
        />

   <!-- Discovery: TCPPING with a static host list read from jgroups.tcpping.initial_hosts.
        NOTE(review): the inline fallback list names only two hosts (HostA, HostB) although the
        cluster is described as three nodes; confirm the property is set on every member. -->
   <!-- Ergonomics, new in JGroups 2.11, are disabled by default in TCPPING until JGRP-1253 is resolved -->
   <TCPPING timeout="3000" initial_hosts="${jgroups.tcpping.initial_hosts:HostA[7800],HostB[7801]}"
            port_range="2"
            num_initial_members="3"
            ergonomics="false"
        /> 

   <!-- The next two comments are inactive, commented-out MPING (multicast discovery)
        alternatives with identical attributes; only TCPPING above is in effect. -->
   <!-- MPING bind_addr="${jgroups.bind_addr:127.0.0.1}" break_on_coord_rsp="true"
      mcast_addr="${jboss.default.multicast.address:228.2.4.6}"
      mcast_port="${jgroups.mping.mcast_port:43366}"
      ip_ttl="${jgroups.udp.ip_ttl:2}"
      num_initial_members="3"/-->
     <!--  <MPING bind_addr="${jgroups.bind_addr:127.0.0.1}" break_on_coord_rsp="true"
      mcast_addr="${jboss.default.multicast.address:228.2.4.6}"
      mcast_port="${jgroups.mping.mcast_port:43366}"
      ip_ttl="${jgroups.udp.ip_ttl:2}"
      num_initial_members="3"/> -->
   <!-- Merge protocol: MERGE3 with min_interval 10000 and max_interval 30000
        (presumably milliseconds; TODO confirm). -->
   <MERGE3 max_interval="30000" min_interval="10000"/>

   <!-- Failure detection: FD_SOCK and FD, followed by VERIFY_SUSPECT.
        NOTE(review): FD_SOCK reads jgroups.bind_addr with no inline fallback, whereas TCP reads
        jgroups.tcp.address; confirm both resolve to the same interface on every node. -->
   <FD_SOCK bind_addr="${jgroups.bind_addr}"/> 
    <FD timeout="3000" max_tries="3"/> 
   <VERIFY_SUSPECT timeout="3000"/>
  <!--  <BARRIER /> -->
   <!-- NAKACK2 is the active element here; the older NAKACK variant below is commented out. -->
    <!-- <pbcast.NAKACK use_mcast_xmit="false" retransmit_timeout="300,600,1200,2400,4800" discard_delivered_msgs="true"/> -->
   <pbcast.NAKACK2 use_mcast_xmit="false"
                   xmit_interval="1000"
                   xmit_table_num_rows="100"
                   xmit_table_msgs_per_row="10000"
                   xmit_table_max_compaction_time="10000"
                   max_msg_batch_size="100" discard_delivered_msgs="true"/>
   <UNICAST3 xmit_interval="500"
             xmit_table_num_rows="20"
             xmit_table_msgs_per_row="10000"
             xmit_table_max_compaction_time="10000"
             max_msg_batch_size="100"
             conn_expiry_timeout="0"/>

   <!-- STABLE, then group membership (GMS) with join_timeout 3000 and merge_timeout 6000. -->
   <pbcast.STABLE stability_delay="1000" desired_avg_gossip="50000" max_bytes="400000"/>
   <pbcast.GMS print_local_addr="true" join_timeout="3000" view_bundling="true" merge_timeout="6000"/>
   <tom.TOA/> <!-- the TOA is only needed for total order transactions-->

   <!-- UFC is active; MFC is commented out. FRAG2 and RSVP follow; STATE_TRANSFER is
        commented out. -->
    <UFC max_credits="2m" min_threshold="0.40"/> 
   <!-- <MFC max_credits="2m" min_threshold="0.40"/> -->
   <FRAG2 frag_size="30k"/>
    <RSVP timeout="60000" resend_interval="500" ack_on_delivery="false" /> 
   <!-- <pbcast.STATE_TRANSFER/> -->
</config>

I'm using Infinispan 7.0.2 and JGroups 3.6.1. I've tried many different configurations, but nothing has worked. Your help would be much appreciated.

[UPDATE] Things worked fine after setting the property "internal_thread_pool.min_threads" to a value greater than 1.

geekprogrammer
  • 1,108
  • 1
  • 13
  • 39

1 Answer

1

So to simplify this, we have

  • View broker|2={broker,201,202}
  • 201 leaves, the view is now broker|3={broker,202}
  • Then there is a merge between views broker|3 and broker|2, which leads to the incorrect view broker|4={broker,201}

I created [1] to investigate what's going on here. First off, the subviews of the merge view should have included 202 being a subgroup coordinator, but that wasn't the case.

Can you describe exactly what happened here? Can this be reproduced? It would be nice to have TRACE-level logs for FD, FD_ALL, MERGE3 and GMS...

[1] https://issues.jboss.org/browse/JGRP-2128

Bela Ban
  • 2,186
  • 13
  • 12
  • Yes, this is reproducible all the time in our environment when we manually disconnect one of our nodes from the network and connect it back. Thanks for creating the bug; I'll add the trace logs to it. – geekprogrammer Nov 10 '16 at 14:01