I have been working with Spark Streaming. My problem is that I want to use wordCountsDataFrame again outside of the foreachRDD block.
I want to conditionally join wordCountsDataFrame with another DataFrame that is created from a DStream. Is there a way to do that, or is there another approach?
Thanks.
My Scala code block is below.
// Command-line contract: exactly two arguments, the project id followed by the
// subscription name. Fail with a readable message instead of a bare MatchError
// from the destructuring below.
require(
  args.length == 2,
  s"Expected exactly 2 arguments (<projectId> <subscription>) but got ${args.length}"
)
val Seq(projectId, subscription) = args.toSeq

// Streaming context for the "PubsubWordCount" application with a 5000 ms batch interval.
val sparkConf = new SparkConf().setAppName("PubsubWordCount")
val ssc = new StreamingContext(sparkConf, Milliseconds(5000))

// NOTE(review): the builder is used without any extra configuration — presumably this
// resolves default credentials; confirm. The (misspelled) name is kept as-is because
// code outside this snippet may refer to it.
val credentail = SparkGCPCredentials.builder.build()

// Receiver-based input stream of Pub/Sub messages for the given project/subscription.
// NOTE(review): `None` is passed as the third argument — presumably the optional topic;
// verify against the PubsubUtils.createStream signature.
val pubsubStream: ReceiverInputDStream[SparkPubsubMessage] = PubsubUtils.createStream(
  ssc,
  projectId,
  None,
  subscription,
  credentail,
  StorageLevel.MEMORY_AND_DISK_SER_2
)

// Turn each message payload into a String. The charset is given explicitly so decoding
// does not depend on the platform default charset of whichever JVM runs the task.
// NOTE(review): assumes publishers send UTF-8 encoded text — confirm.
val stream1 = pubsubStream.map { message =>
  new String(message.getData(), java.nio.charset.StandardCharsets.UTF_8)
}
// For every micro-batch: expose the batch's strings as a single-column DataFrame,
// count the occurrences of each word with Spark SQL, and print the result.
stream1.foreachRDD { batch =>
  // Build (or reuse) a SparkSession from the configuration of the batch's SparkContext.
  val session = SparkSession.builder.config(batch.sparkContext.getConf).getOrCreate()
  import session.implicits._ // needed for the RDD-to-DataFrame conversion (toDF)

  // Convert RDD[String] to a DataFrame with one column named "word" and register it
  // as the temp view "words" so the query below can refer to it by name.
  batch.toDF("word").createOrReplaceTempView("words")

  val countQuery = "select word, count(*) from words group by word"
  val wordCountsDataFrame = session.sql(countQuery)
  wordCountsDataFrame.show()
}