0

I am using Spark Streaming to read from ActiveMQ (AMQ), and I want the streaming job to stop once no data is left in the message queue. I created a custom receiver that connects to the AMQ topic and starts reading the data, but how can the worker tell the driver that there is no data left, so that the driver can stop the streaming?

/**
  * Receiver that reads text messages from an ActiveMQ topic on a background
  * thread and hands each message body to `store`.
  *
  * The loop ends when `isStopped` becomes true or when `ActiveMQReceiver.stop`
  * is raised. The flag is raised the first time `receive` returns `null`,
  * which this code treats as "no more data".
  *
  * NOTE(review): the constructor parameters, `isStopped`, `store`,
  * `activeMQStream` and `timeOutInMilliseconds` are declared outside this
  * snippet (presumably by a Receiver base class and elided fields) — confirm.
  */
class CustomReceiver(brokerURL, topic, ...){

    /** Starts the "AMQ Receiver" thread so that the blocking read loop does not run on the caller's thread. */
    def onStart(): Unit = {
      new Thread("AMQ Receiver") {
        override def run(): Unit = receive()
      }.start()
    }

    /** Intentionally empty: the read loop polls `isStopped` and ends on its own. */
    def onStop(): Unit = {}

    /**
      * Reads messages until stopped or until a read returns `null`.
      *
      * Text messages are stored. Other message types are skipped; the original
      * code treated them as end-of-data and raised the stop flag.
      */
    private def receive(): Unit = {
      // NOTE(review): `broker` is not the constructor parameter `brokerURL` — confirm which one is intended.
      activeMQStream = new ActiveMQStream(broker, topic, ...)
      val topicSubscriber = activeMQStream.getTopicSubscriber()

      while (!isStopped && !ActiveMQReceiver.stop) {
        // Option(...) maps a null result to None so the three cases can be matched explicitly.
        Option(topicSubscriber.receive(timeOutInMilliseconds)) match {
          case Some(textMessage: TextMessage) =>
            store(textMessage.getText())
            println("ActiveMQReceiver: there is data from AMQ ....")
          case Some(_) =>
            // A non-text message is not evidence that the topic is drained; ignore it and keep reading.
            ()
          case None =>
            ActiveMQReceiver.stop = true
            println("ActiveMQReceiver: No more data from AMQ .....")
        }
      }
    }

    /**
      * Returns the current value of `ActiveMQReceiver.stop` in the JVM that calls this method.
      *
      * NOTE(review): if `receive()` runs in a different JVM (e.g. a Spark executor),
      * its write to the flag is not visible to a caller on the driver — confirm.
      */
    def checkStatus(): Boolean = ActiveMQReceiver.stop

}

/** Holder for the process-wide stop flag used by the receiver loop. */
object ActiveMQReceiver {

  // Starts lowered; @volatile so a write from one thread is seen by reads on other threads in this JVM.
  @volatile
  var stop: Boolean = false

}

As you can see above, I am trying to set the stop flag to true when no data is left to read, but when I run the following driver code the flag is always false. After searching, I found that this is because workers don't share variables with the driver. I tried replacing the flag with an Accumulator, but that didn't work either.

// Driver script: builds the stream, gathers every batch on the driver, and stops
// the StreamingContext once the source looks drained.
import java.util.concurrent.atomic.AtomicLong

val ssc = new StreamingContext(spark.sparkContext, Seconds(1))
val customReceiver = new CustomReceiver(brokerURL, topic, ...)
val stream: DStream[String] = ssc.receiverStream(customReceiver)

// Records gathered on the driver across all batches.
var driverList = List[String]()

// Wall-clock time of the most recent non-empty batch. It is updated inside
// foreachRDD, whose body runs on the driver, so the polling loop below can read it.
val lastDataAtMillis = new AtomicLong(System.currentTimeMillis())

stream.foreachRDD { rdd =>
  // Collect once; calling count() and then collect() launches two jobs per batch.
  val fromWorker = rdd.collect().toList
  if (fromWorker.nonEmpty) {
    driverList = driverList ::: fromWorker
    lastDataAtMillis.set(System.currentTimeMillis())
  }
}

// Without start() no receiver is launched and the loop below would only ever time out.
ssc.start()

var stopFlag = false
var isStopped = false
val checkIntervalMillis = 10000
while (!isStopped) {
  isStopped = ssc.awaitTerminationOrTimeout(checkIntervalMillis)
  println("Check if stop flag was raised")
  // checkStatus() only reflects the flag in this (driver) JVM, so also treat
  // "no non-empty batch for a full check interval" as the drained signal.
  val idleMillis = System.currentTimeMillis() - lastDataAtMillis.get()
  stopFlag = customReceiver.checkStatus() || idleMillis >= checkIntervalMillis

  if (!isStopped && stopFlag) {
    println("Request to stop")
    // Keep the SparkContext alive; let already-received data finish processing.
    ssc.stop(stopSparkContext = false, stopGracefully = true)
    // Build the Dataset after the graceful stop so batches flushed during shutdown are included.
    import spark.implicits._
    val df = driverList.toDS()
  }
}
deltascience
  • 3,321
  • 5
  • 42
  • 71

1 Answers1

0

Relying on receive() returning null to signal that there is no data left is going to be dodgy in production. That approach removes any self-healing and failover support, and it introduces a timing/race condition where you could simply get 'unlucky'. As an alternative, look at using Message Groups and set a header on the last message in the stream, so that the end of the data is signalled by a well-defined message instead.

Message Groups

Matt Pavlovich
  • 4,087
  • 1
  • 9
  • 17