Hi am having a spark streaming program which is reading the events from eventhub and pushing it topics. for processing each batch it is taking almost 10 times the batch time.
when am trying to implement multithreading am not able to see much difference in the processing time.
Is there any way by which I can increase the performance either by doing parallel processing. or start some 1000 threads at a time and just keep pushing the messages.
class ThreadExample(msg:String) extends Thread{
override def run {
var test = new PushToTopicDriver(msg)
test.push()
// println(msg)
}
}
object HiveEventsDirectStream {
def b2s(a: Array[Byte]): String = new String(a)
def main(args: Array[String]): Unit = {
val eventhubnamespace = "namespace"
val progressdir = "/Event/DirectStream/"
val eventhubname_d = "namespacestream"
val ehParams = Map[String, String](
"eventhubs.policyname" -> "PolicyKeyName",
"eventhubs.policykey" -> "key",
"eventhubs.namespace" -> "namespace",
"eventhubs.name" -> "namespacestream",
"eventhubs.partition.count" -> "30",
"eventhubs.consumergroup" -> "$default",
"eventhubs.checkpoint.dir" -> "/EventCheckpoint_0.1",
"eventhubs.checkpoint.interval" -> "2"
)
println("testing spark")
val conf = new SparkConf().set("spark.serializer", "org.apache.spark.serializer.KryoSerializer").setMaster("local[4]").setAppName("Eventhubs_Test")
conf.registerKryoClasses(Array(classOf[PublishToTopic]))
conf.set("spark.streaming.stopGracefullyOnShutdown", "true")
val sc= new SparkContext(conf)
val hiveContext = new HiveContext(sc)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val pool:ExecutorService=Executors.newFixedThreadPool(30)
val ssc = new StreamingContext(sc, Seconds(2))
var dataString :RDD[String] =sc.emptyRDD
val stream=EventHubsUtils.createDirectStreams(ssc,eventhubnamespace,progressdir,Map(eventhubname_d -> ehParams))
val kv1 = stream.map(receivedRecord => (new String(receivedRecord.getBody))).persist()
kv1.foreachRDD(rdd_1 => rdd_1.foreachPartition(line => line.foreach(msg => {var t1 = new ThreadExample(msg) t1.start()})))
ssc.start()
ssc.awaitTermination()
}
}
Thanks,
Ankush Reddy.