I'm new to Azure Databricks and I'm trying to implement an Azure Databricks Delta Live Tables (DLT) pipeline that ingests from a Kafka topic containing messages where the values are Schema Registry-encoded Avro.
Work done so far...
Exercise to Consume and Write to a Delta Table
Following the Confluent example, I've read the "raw" messages via:
import pyspark.sql.functions as fn
from pyspark.sql.types import StringType

# Convert the 4-byte big-endian schema ID into its string representation
binary_to_string = fn.udf(lambda x: str(int.from_bytes(x, byteorder='big')), StringType())

rawAvroDf = (
    spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", confluentBootstrapServers)
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.jaas.config", "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username='{}' password='{}';".format(confluentApiKey, confluentSecret))
    .option("kafka.ssl.endpoint.identification.algorithm", "https")
    .option("kafka.sasl.mechanism", "PLAIN")
    .option("subscribe", confluentTopicName)
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", "false")
    .load()
    .withColumn('key', fn.col("key").cast(StringType()))
    # The Avro payload starts at byte 6; bytes 2-5 hold the schema ID
    .withColumn('fixedValue', fn.expr("substring(value, 6, length(value)-5)"))
    .withColumn('valueSchemaId', binary_to_string(fn.expr("substring(value, 2, 4)")))
    .select('topic', 'partition', 'offset', 'timestamp', 'timestampType', 'key', 'valueSchemaId', 'fixedValue')
)
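For context, the Confluent wire format is one magic byte (0x0), then a 4-byte big-endian schema ID, then the Avro payload; SQL substring() is 1-indexed, hence the offsets above. A plain-Python equivalent of that split, purely as an illustration:

import struct

def split_confluent_message(raw: bytes):
    # Byte 1: magic byte (0); bytes 2-5: big-endian schema ID; rest: Avro payload
    magic, schema_id = struct.unpack(">bI", raw[:5])
    assert magic == 0, "not Confluent wire format"
    return schema_id, raw[5:]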
Created a SchemaRegistryClient:
from confluent_kafka.schema_registry import SchemaRegistryClient

schema_registry_conf = {
    'url': schemaRegistryUrl,
    'basic.auth.user.info': '{}:{}'.format(confluentRegistryApiKey, confluentRegistrySecret)
}
schema_registry_client = SchemaRegistryClient(schema_registry_conf)
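As a quick sanity check, the client can be queried directly for a schema (the ID here is a placeholder, not a real one from my registry):

# Hypothetical schema ID, used only to confirm registry connectivity
example_schema_id = 100001
print(schema_registry_client.get_schema(example_schema_id).schema_str)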
Defined a deserialization function that looks up each schema by the ID embedded at the start of the binary message. Since from_avro requires the schema as a literal string, the micro-batch is split by distinct schema ID and each subset is parsed with its own schema:
import pyspark.sql.functions as fn
from pyspark.sql.avro.functions import from_avro

def parseAvroDataWithSchemaId(df, epoch_id):
    cachedDf = df.cache()
    fromAvroOptions = {"mode": "FAILFAST"}

    def getSchema(id):
        return str(schema_registry_client.get_schema(id).schema_str)

    # One from_avro pass per distinct schema ID in the micro-batch
    distinctValueSchemaIdDF = cachedDf.select(fn.col('valueSchemaId').cast('integer')).distinct()
    for valueRow in distinctValueSchemaIdDF.collect():
        currentValueSchemaId = sc.broadcast(valueRow.valueSchemaId)
        currentValueSchema = sc.broadcast(getSchema(currentValueSchemaId.value))
        filterValueDF = cachedDf.filter(fn.col('valueSchemaId') == currentValueSchemaId.value)
        filterValueDF \
            .select('topic', 'partition', 'offset', 'timestamp', 'timestampType', 'key',
                    from_avro('fixedValue', currentValueSchema.value, fromAvroOptions).alias('parsedValue')) \
            .write \
            .format("delta") \
            .mode("append") \
            .option("mergeSchema", "true") \
            .save(deltaTablePath)
Finally, wrote the stream to a Delta table:
rawAvroDf.writeStream \
    .option("checkpointLocation", checkpointPath) \
    .foreachBatch(parseAvroDataWithSchemaId) \
    .queryName("clickStreamTestFromConfluent") \
    .start()
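To confirm the foreachBatch writes landed, I read the table back (a quick check, reusing the same deltaTablePath as above):

# Spot-check the parsed output written by foreachBatch
parsedDf = spark.read.format("delta").load(deltaTablePath)
display(parsedDf.select("key", "parsedValue").limit(10))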
Created a (Bronze/Landing) Delta Live Table
import dlt
import pyspark.sql.functions as fn
from pyspark.sql.types import StringType

@dlt.table(
    name = "<<landingTable>>",
    path = "<<storage path>>",
    comment = "<< descriptive comment >>"
)
def landingTable():
    jaasConfig = "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username='{}' password='{}';".format(confluentApiKey, confluentSecret)
    # Convert the 4-byte big-endian schema ID into its string representation
    binary_to_string = fn.udf(lambda x: str(int.from_bytes(x, byteorder='big')), StringType())
    kafkaOptions = {
        "kafka.bootstrap.servers": confluentBootstrapServers,
        "kafka.security.protocol": "SASL_SSL",
        "kafka.sasl.jaas.config": jaasConfig,
        "kafka.ssl.endpoint.identification.algorithm": "https",
        "kafka.sasl.mechanism": "PLAIN",
        "subscribe": confluentTopicName,
        "startingOffsets": "earliest",
        "failOnDataLoss": "false"
    }
    return (
        spark
        .readStream
        .format("kafka")
        .options(**kafkaOptions)
        .load()
        .withColumn('key', fn.col("key").cast(StringType()))
        .withColumn('valueSchemaId', binary_to_string(fn.expr("substring(value, 2, 4)")))
        .withColumn('avroValue', fn.expr("substring(value, 6, length(value)-5)"))
        .select(
            'topic',
            'partition',
            'offset',
            'timestamp',
            'timestampType',
            'key',
            'valueSchemaId',
            'avroValue'
        )
    )
Help required on:
- Ensuring that the landing table is a STREAMING LIVE TABLE
- Deserializing the Avro-encoded message value (a STREAMING LIVE VIEW calling a Python UDF?); the sketch below shows roughly what I have in mind
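For the second point, this is what I'm imagining, though I don't know if it's viable: a downstream DLT view that reads the landing table and parses the payload with from_avro. The view name and schema ID are placeholders of mine, and because from_avro needs the schema as a literal string, this sketch hard-codes a single schema fetched once at pipeline start rather than handling multiple schema IDs per batch:

import dlt
import pyspark.sql.functions as fn
from pyspark.sql.avro.functions import from_avro

# Hypothetical: assumes one known schema ID, fetched outside the streaming plan
# (reuses the schema_registry_client created earlier)
knownSchemaId = 100001
valueSchema = schema_registry_client.get_schema(knownSchemaId).schema_str

@dlt.view(
    name = "<<parsedView>>",
    comment = "Deserialized view over the landing table"
)
def parsedView():
    return (
        dlt.read_stream("<<landingTable>>")
        .select(
            'key',
            from_avro(fn.col('avroValue'), valueSchema, {"mode": "FAILFAST"}).alias('parsedValue')
        )
    )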