When I perform a Kafka write stream to a table in Databricks, the incoming data doesn't increase the table size significantly, but it results in a much larger increase in the data size on Blob storage.
val kafkaBrokers="<BROKERS>"
val kafkaTopic="<TOPIC>"
val kafkaStartingOffset="earliest"
val groupIdPrefix="<PREFIX>"
val format="delta"
val outputMode="update"
import org.apache.spark.sql._
import io.delta.tables._
val deltaTable = DeltaTable.forName("table_name")
def upsertToDelta(microBatchOutputDF: DataFrame, batchId: Long) {
deltaTable.as("dt")
.merge(
microBatchOutputDF.as("stream"),
"stream.system_device_id = dt.system_device_id and stream.uid_name=dt.uid_name")
.whenMatched("stream.is_deleted=true").updateExpr(Map("is_deleted" -> "stream.is_deleted",
"data_date_time" -> "stream.data_date_time",
"process_time"->"stream.process_time",
"uninstall_date"->"stream.uninstall_date",
"application_id"->"stream.application_id"))
.whenMatched().updateAll()
.whenNotMatched().insertAll()
.execute()
}
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.functions._
val schema = deltaTable.toDF.schema
val confCheckpointLocation="dbfs:/mnt/user/location"
spark.readStream
.format("kafka")
.option("kafka.bootstrap.servers",kafkaBrokers)
.option("subscribe", kafkaTopic)
.option("startingOffsets", kafkaStartingOffset)
.option("groupIdPrefix",groupIdPrefix)
.load()
.select($"offset".alias("offset"), from_json($"value".cast("string"),schema).alias("value"))
.select($"value.system_device_id".alias("system_device_id"),
$"value.uid_name".alias("uid_name"),
struct(col("offset"),col("value")).alias("record")
).groupBy("system_device_id","uid_name").agg(max("record").as("record"))
.select("record.value.*")
.writeStream
.format(format)
.foreachBatch(upsertToDelta _)
.outputMode(outputMode)
.option("checkpointLocation",confCheckpointLocation)
.trigger(Trigger.AvailableNow)
.start()
Thanks in advance if anyone has a solution.
I expected the size of the data to be the same as the table or slightly larger, but there are differences ranging from 3 to 5 times larger with each run.