0

I am upgrading from Spark 2.2 to 3.4.0. The following code publishes events to Kafka using Spark Structured Streaming; it works fine in both versions, and Kafka receives the events.

The problem is that my custom KafkaSourceProvider implementation, which is defined in a separate class,

org.apache.spark.sql.kafka010.ServiceImpactKafkaSourceProvider

is no longer invoked after the upgrade, although it worked fine with Spark 2.2.

Any help is really appreciated.

/**
  * Publishes state change events through a streaming query.
  *
  * Every event is turned into a ("key", "value") row: the key is the event's
  * `nodeInstanceId` and the value is the string returned by
  * `objectMapper.writeValueAsString` for the event's DTO. The rows are written
  * in append mode using the `ServiceImpactKafkaSourceProvider` format, with the
  * Kafka options taken from `settings.kafka`.
  *
  * @param stateEvents dataset of state change events to publish
  * @return the started streaming query
  */
def execute(stateEvents: Dataset[StateChangeEventSto]): StreamingQuery = {
  import stateEvents.sparkSession.implicits._

  val keyedPayloads = stateEvents.mapPartitions { partition =>
    logger.info("Start publishing state changes by partitions")

    partition.map { e =>
      try {
        val key = e.nodeInstanceId
        val payload = objectMapper.writeValueAsString(Converter.toDto(e))
        (key, payload)
      } catch {
        // Temporary diagnostic: record which node instance failed, then rethrow
        // the same exception so the failure still propagates.
        case failure: Exception =>
          logger.error(s"FATAL EXCEPTION on Node Instance id = ${e.nodeInstanceId} in PublishStateChangeToKafka", failure)
          throw failure
      }
    }
  }

  val sinkWriter = keyedPayloads
    .toDF("key", "value")
    .writeStream
    .format("org.apache.spark.sql.kafka010.ServiceImpactKafkaSourceProvider")
    .trigger(Trigger.ProcessingTime(settings.kafka.triggertime))
    .option("kafka.bootstrap.servers", settings.kafka.brokers)
    .option("kafka.max.request.size", settings.kafka.maxRequestSize)
    .option("kafka.request.timeout.ms", settings.kafka.requestTimeoutMs)
    .option("kafka.linger.ms", settings.kafka.lingerMs)
    .option("kafka.batch.size", settings.kafka.batchSize)
    // NOTE(review): buffer.memory reuses maxRequestSize — confirm this is intentional.
    .option("kafka.buffer.memory", settings.kafka.maxRequestSize)
    .option("topic", settings.kafka.outputTopic)
    .option("checkpointLocation", settings.kafka.checkpointLocation)
    .outputMode(OutputMode.Append())
    .queryName("State change events sink")

  sinkWriter.start()
}
`
Chandan Gawri
  • 364
  • 1
  • 4
  • 15

0 Answers0