I am upgrading from Spark 2.2 to 3.4.0. The following code publishes events to Kafka using Spark Structured Streaming; it works in both versions, and Kafka receives the events.
The problem is that the KafkaSourceProvider implementation, defined in a separate class,
org.apache.spark.sql.kafka010.ServiceImpactKafkaSourceProvider,
is no longer invoked on Spark 3.4.0, although it worked fine with Spark 2.2.
Any help is really appreciated.
/**
 * Converts every state-change event into a `(key, value)` row and starts a
 * streaming query that writes those rows to the configured Kafka topic.
 *
 * The key is the event's `nodeInstanceId`; the value is the string produced by
 * `objectMapper.writeValueAsString` for the event's DTO.
 *
 * NOTE(review): on Spark 3.x a provider that is also a DataSource V2
 * `TableProvider` may be routed through the V2 streaming write path rather than
 * the V1 `StreamSinkProvider.createSink` hook. If the custom provider's
 * overrides are not being called, check `spark.sql.streaming.disabledV2Writers`
 * -- TODO confirm against the provider's implementation.
 *
 * @param stateEvents streaming dataset of state-change events to publish
 * @return the handle of the started streaming query
 */
def execute(stateEvents: Dataset[StateChangeEventSto]): StreamingQuery = {
  import stateEvents.sparkSession.implicits._

  // Build the (key, value) rows partition by partition.
  val keyedRows = stateEvents.mapPartitions { partitionEvents =>
    logger.info("Start publishing state changes by partitions")
    partitionEvents.map { e =>
      try {
        (e.nodeInstanceId, objectMapper.writeValueAsString(Converter.toDto(e)))
      } catch {
        // Temporary diagnostic: log which node instance failed, then rethrow
        // unchanged so the failure still propagates.
        case failure: Exception =>
          logger.error(s"FATAL EXCEPTION on Node Instance id = ${e.nodeInstanceId} in PublishStateChangeToKafka", failure)
          throw failure
      }
    }
  }.toDF("key", "value")

  val sinkWriter = keyedRows.writeStream
    .format("org.apache.spark.sql.kafka010.ServiceImpactKafkaSourceProvider")
    .trigger(Trigger.ProcessingTime(settings.kafka.triggertime))
    .option("kafka.bootstrap.servers", settings.kafka.brokers)
    .option("kafka.max.request.size", settings.kafka.maxRequestSize)
    .option("kafka.request.timeout.ms", settings.kafka.requestTimeoutMs)
    .option("kafka.linger.ms", settings.kafka.lingerMs)
    .option("kafka.batch.size", settings.kafka.batchSize)
    // NOTE(review): buffer.memory reuses maxRequestSize -- confirm this is intentional.
    .option("kafka.buffer.memory", settings.kafka.maxRequestSize)
    .option("topic", settings.kafka.outputTopic)
    .option("checkpointLocation", settings.kafka.checkpointLocation)
    .outputMode(OutputMode.Append())
    .queryName("State change events sink")

  sinkWriter.start()
}
`