spark streaming serialization error

Question

I am running into serialization error in spark-streaming application. Below is my driver code:

package com.test
import org.apache.spark._
import org.apache.spark.streaming._
import org.json.JSONObject;
import java.io.Serializable


object SparkFiller extends Serializable{
 def main(args: Array[String]): Unit ={
 val sparkConf = new 
SparkConf().setAppName("SparkFiller").setMaster("local[*]")
// println("test")
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sparkConf.registerKryoClasses(Array(classOf[firehoseToDocumentDB]))
sparkConf.registerKryoClasses(Array(classOf[PushToDocumentDB]))
var TimeStamp_Start = 1493836050
val TimeStamp_Final = 1493836056
var timeStamp_temp = TimeStamp_Start - 5;
// val send_timestamps = new firehoseToDocumentDB(TimeStamp_Start,TimeStamp_Final);
// send_timestamps.onStart();
val ssc = new StreamingContext(sparkConf, Seconds(5))
val lines = ssc.receiverStream(
new firehoseToDocumentDB(TimeStamp_Start.toString(),TimeStamp_Final.toString()))
// val timestamp_stream = ssc.receiverStream(new firehoseToDocumentDB(TimeStamp_Start.toString(),TimeStamp_Final.toString()))
lines.foreachRDD(rdd => {
  rdd.foreachPartition(part => {
    val dbsender = new PushToDocumentDB();
    part.foreach(msg =>{
      var jsonobject = new JSONObject(part)
      var temp_pitr = jsonobject.getString("pitr")
      println(temp_pitr)
      if ( TimeStamp_Final >= temp_pitr.toLong) {
        ssc.stop()
      }
      dbsender.PushFirehoseMessagesToDocumentDb(msg)
    })
    // dbsender.close()
  })
})

println("line",line)))
println("ankush")
ssc.start()
ssc.awaitTermination()
 }

}

When I add the below lines to the code

      var jsonobject = new JSONObject(part)
      var temp_pitr = jsonobject.getString("pitr")
      println(temp_pitr)
      if ( TimeStamp_Final >= temp_pitr.toLong) {
        ssc.stop()
      }

I get an error:

  Exception in thread "main" org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:304)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:294)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:122)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2055)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:919)
at org.apache.spark.rdd.RDD$$anonfun$foreachPartition$1.apply(RDD.scala:918)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.foreachPartition(RDD.scala:918)
at com.boeing.SparkFiller$$anonfun$main$1.apply(SparkFiller.scala:26)
at com.boeing.SparkFiller$$anonfun$main$1.apply(SparkFiller.scala:25)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:661)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:426)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:224)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:224)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:223)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
 Caused by: java.io.NotSerializableException: org.apache.spark.streaming.StreamingContext
 Serialization stack:
- object not serializable (class: 
org.apache.spark.streaming.StreamingContext, value: 
org.apache.spark.streaming.StreamingContext@780e1bb5)
- field (class: com.boeing.SparkFiller$$anonfun$main$1, name: ssc$1, type: 
class org.apache.spark.streaming.StreamingContext)
- object (class com.boeing.SparkFiller$$anonfun$main$1, <function1>)
- field (class: com.boeing.SparkFiller$$anonfun$main$1$$anonfun$apply$1, 
name: $outer, type: class com.boeing.SparkFiller$$anonfun$main$1)
- object (class com.boeing.SparkFiller$$anonfun$main$1$$anonfun$apply$1, <function1>)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:47)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:101)
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:301)
... 30 more

Process finished with exit code 1

If I remove those lines of code it is working good.

The issue is because of using the ssc.stop() in the rdd. Is there any otherway that I can call a shutdown hook from the rdd if it satisfies the condition.

You cannot execute `ssc.stop()` while inside the `foreachRDD`, you should wait the completion. — freedev, May 03 '17 at 23:00
@freedev the reason why I kept is I want to stop the spark execution if there is it satisfies the if condition. — ankush reddy, May 03 '17 at 23:17
@freedev is there a way that I can stop the streaming if it is in the if condition. ssc.stop() is causing the issue. Thanks. — ankush reddy, May 03 '17 at 23:24

score 0 · Answer 1 · edited May 23 '17 at 12:10

Issue is because of using the ssc.stop() in the rdd.

You are right! Any of the Spark contexts are not serializable and cannot be used inside any of the tasks.

is there any otherway that I can call a shutdown hook from the rdd if it satisfies the condition.

In order to control the lifecycle of your streaming application, you should consider overriding a listener and stop the context based on your condition. I have done enough research and found out that this the only feasible solution.

Please refer to my answer to this post to understand how to stop the streaming application based on certain condition.

spark streaming serialization error

1 Answers1