I'm trying to read Avro data from Kafka with Spark Structured Streaming. The code I am using is the following:
package com.test.spark

import com.test.spark.ConfigKafka.getAvroSchema
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory
import org.apache.spark.SparkContext
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{DataFrame, Dataset, Encoder, SparkSession}

object AvroConsumer extends App {

  val sparkSession: SparkSession = SparkSession
    .builder()
    .getOrCreate()

  val sparkContext: SparkContext = sparkSession.sparkContext
  sparkContext.setLogLevel("WARN")

  private val topic: String = "topic-test"
  private val queryName: String = "TEST_QUERY"
  private val autoOffsetReset: String = "earliest"

  val avroReader: GenericDatumReader[GenericRecord] =
    new GenericDatumReader[GenericRecord](getAvroSchema(topic))
  val avroDecoderFactory: DecoderFactory = DecoderFactory.get()

  implicit val encoder: Encoder[GenericRecord] = org.apache.spark.sql.Encoders.kryo[GenericRecord]

  import sparkSession.implicits._

  val kafkaDataFrame: DataFrame = sparkSession
    .readStream
    .format("kafka")
    .option("subscribe", topic)
    .option("group.id", queryName)
    .option("startingOffsets", autoOffsetReset)
    .options(ConfigKafka.getSparkConsumerProperties())
    .load()

  val data: Dataset[String] = kafkaDataFrame
    .select(col("value").as[Array[Byte]])
    .map { d =>
      val rec: GenericRecord = avroReader.read(null, avroDecoderFactory.binaryDecoder(d, null))
      val idContract: String = rec.get("idContract").asInstanceOf[org.apache.avro.util.Utf8].toString
      println(s"idContract = $idContract")
      idContract
    }

  data
    .writeStream
    .format("console")
    .option("truncate", false)
    .start()
    .awaitTermination()
}
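For reference, this is the plain Avro decode pattern the map above relies on. The following standalone sketch (schema inlined instead of fetched from the registry, only the idContract field kept, and "C-123" a made-up sample value) shows what I expect avroReader.read to do with the value bytes:

import java.io.ByteArrayOutputStream
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}

object DecodeSketch extends App {
  // Schema inlined for the sketch; the real job fetches it from the Schema Registry.
  val schema: Schema = new Schema.Parser().parse(
    """{"type": "record", "name": "Contract", "namespace": "com.test.data.contract",
      | "fields": [{"name": "idContract", "type": ["null", "string"], "default": null}]}""".stripMargin)

  // Round-trip one record to obtain binary-encoded bytes.
  val record = new GenericData.Record(schema)
  record.put("idContract", "C-123")
  val out = new ByteArrayOutputStream()
  val encoder = EncoderFactory.get().binaryEncoder(out, null)
  new GenericDatumWriter[GenericRecord](schema).write(record, encoder)
  encoder.flush()
  val bytes: Array[Byte] = out.toByteArray

  // Same decode call as in the Spark map above.
  val reader = new GenericDatumReader[GenericRecord](schema)
  val rec: GenericRecord = reader.read(null, DecoderFactory.get().binaryDecoder(bytes, null))
  println(rec.get("idContract")) // prints C-123
}

(This sketch produces and consumes bare Avro binary; it does not reproduce any extra framing a registry-aware producer might put in front of the payload.)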
In the ConfigKafka object, I manage the connection to the Schema Registry and the retrieval of the Avro schema:
object ConfigKafka {

  def getConfMap(): Map[String, String] = {
    Map(
      AbrisConfig.SCHEMA_REGISTRY_URL -> schemaRegistryUrl,
      SchemaRegistryClientConfig.USER_INFO_CONFIG -> s"${userAccount}:${Encoder.decode(userPassword)}",
      SchemaRegistryClientConfig.BASIC_AUTH_CREDENTIALS_SOURCE -> "USER_INFO"
    )
  }

  def getAvroSchema(inputTopic: String): Schema = {
    val subjectName: String = s"$inputTopic-value"
    import collection.JavaConverters._
    val confMap: Map[String, String] = getConfMap()
    val schemaRegistryClient: CachedSchemaRegistryClient =
      new CachedSchemaRegistryClient(schemaRegistryUrl, 128, confMap.asJava)
    val stringSchema: String = schemaRegistryClient.getLatestSchemaMetadata(subjectName).getSchema
    new Schema.Parser().parse(stringSchema)
  }
}
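For completeness, the lookup follows the default subject naming, so topic-test resolves to the subject topic-test-value, and it can be exercised on its own (a quick sketch, same credentials as above):

// Quick standalone check of the registry lookup.
val contractSchema: org.apache.avro.Schema = ConfigKafka.getAvroSchema("topic-test")
println(contractSchema.toString(true)) // pretty-prints the schema shown below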
The Avro schema of the messages I'm trying to read is:
{
  "type": "record",
  "name": "Contract",
  "namespace": "com.test.data.contract",
  "fields": [
    {
      "default": null,
      "name": "entity",
      "type": [
        "null",
        "string"
      ]
    },
    {
      "default": null,
      "name": "idContract",
      "type": [
        "null",
        "string"
      ]
    },
    {
      "default": null,
      "name": "codeContract",
      "type": [
        "null",
        "string"
      ]
    }
  ]
}
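Since idContract is declared as a ["null", "string"] union, rec.get("idContract") can legitimately return null (it returns a Utf8 only when the field is set), so I know the cast in my map will eventually need to be defensive. A small sketch of that, given an already-decoded rec: GenericRecord:

// Defensive extraction of a nullable union field (sketch).
val idContract: Option[String] = Option(rec.get("idContract")).map(_.toString)
idContract match {
  case Some(id) => println(s"idContract = $id")
  case None     => println("idContract is null for this record")
}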
At first, my goal is simply to read the idContract field, but I get a NullPointerException:
23/05/19 11:16:46 ERROR TaskSetManager: Task 0 in stage 0.0 failed 4 times; aborting job
23/05/19 11:16:46 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@5f3442ad is aborting.
23/05/19 11:16:46 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@5f3442ad aborted.
23/05/19 11:16:46 ERROR MicroBatchExecution: Query [id = c3424864-6f80-4c29-a316-50949782af69, runId = b4964f51-400b-44a4-8cbc-dc6a16a3922e] terminated with error
org.apache.spark.SparkException: Writing job aborted
...
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times,
most recent failure: Lost task 0.3 in stage 0.0 (TID 3) (host.fr executor 1): java.lang.NullPointerException
at com.test.spark.AvroConsumer$.$anonfun$data$1(AvroConsumer.scala:41)
The error points to the following line of my code:
val rec: GenericRecord = avroReader.read(null, avroDecoderFactory.binaryDecoder(d, null))
How can I deserialize the value properly?