I am using Spark Structured Streaming on Azure Databricks to write to a Delta table named "raw". I read from Azure files, where the data arrives out of order, and the two columns relevant here are "smtUidNr" and "msgTs". I am trying to handle duplicates with an upsert (MERGE) in my code, but when I query the Delta table "raw" I still see duplicate records such as the following:
smtUidNr msgTs
57A94ADA218547DC8AE2F3E7FB14339D 2019-08-26T08:58:46.000+0000
57A94ADA218547DC8AE2F3E7FB14339D 2019-08-26T08:58:46.000+0000
57A94ADA218547DC8AE2F3E7FB14339D 2019-08-26T08:58:46.000+0000
Following is my code:
import org.apache.spark._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
/**
 * Upserts one streaming micro-batch into the Delta table `raw`, keeping at most one row
 * per `smtUidNr` and only ever replacing a stored row with a strictly newer `msgTs`.
 *
 * The join condition is the business key alone. If the timestamp comparison were part of
 * the ON clause, a late or replayed record (same key, equal or older `msgTs`) would count
 * as "not matched" and be inserted again, producing duplicate rows. The comparison lives in
 * the WHEN MATCHED clause instead, so such records match the existing row and are ignored.
 *
 * Note: rows that are already duplicated in `raw` are not collapsed by this merge; they
 * have to be cleaned up once, separately.
 *
 * @param microBatchOutputDF rows of the current micro-batch; must contain `smtUidNr` and
 *                           `msgTs` and otherwise line up with the columns of `raw`
 * @param batchId            micro-batch id passed in by `foreachBatch` (not used)
 */
def upsertToDelta(microBatchOutputDF: DataFrame, batchId: Long): Unit = {
  // A micro-batch is a static DataFrame, so window functions are allowed here.
  // Keep only the newest row per key: MERGE must not see several source rows for one key,
  // otherwise it can fail on an ambiguous update or insert the same new key more than once.
  val newestFirst = org.apache.spark.sql.expressions.Window
    .partitionBy(col("smtUidNr"))
    .orderBy(col("msgTs").desc)
  val latestPerKey = microBatchOutputDF
    .withColumn("_rn", row_number().over(newestFirst))
    .filter(col("_rn") === 1)
    .drop("_rn")

  latestPerKey.createOrReplaceTempView("updates")

  // Run the MERGE on the micro-batch's own session so the "updates" temp view is visible.
  latestPerKey.sparkSession.sql("""
    MERGE INTO raw t
    USING updates s
    ON s.smtUidNr = t.smtUidNr
    WHEN MATCHED AND s.msgTs > t.msgTs THEN UPDATE SET *
    WHEN NOT MATCHED THEN INSERT *
  """)
}
// Streaming pipeline: read the "entrypacket" Delta path as a stream, flatten the nested
// collections, reduce to one row per smtUidNr, and hand each micro-batch to upsertToDelta.

// Open the Delta location as a streaming source and expose it to SQL as "table1".
val df=spark.readStream.format("delta").load("abfss://abc@hjklinfo.dfs.core.windows.net/entrypacket/")
df.createOrReplaceTempView("table1")
// Explode dcl, inv, evt and smt so each combination of their elements becomes its own row,
// pull msgTs / msgInfSrcCd out of msgHdr, then drop rows that are identical in every column.
// NOTE(review): four chained LATERAL VIEW explodes yield the cross product of the four
// collections per input record — confirm that this row multiplication is intended.
val entrypacket_DF=spark.sql("""SELECT details as dcl,invdetails as inv,eventdetails as evt,smtdetails as smt,msgHdr.msgTs,msgHdr.msgInfSrcCd FROM table1 LATERAL VIEW explode(dcl) dcl AS details LATERAL VIEW explode(inv) inv AS invdetails LATERAL VIEW explode(evt) evt as eventdetails LATERAL VIEW explode(smt) smt as smtdetails""").dropDuplicates()
entrypacket_DF.createOrReplaceTempView("ucdx")
// Add a helper column date_timestamp (the calendar date of msgTs), use it together with
// smtUidNr to drop duplicates within the same day, then remove the helper column again so
// msgTs itself is left untouched.
// NOTE(review): dropDuplicates is not ordered by msgTs, so the surviving row per
// (smtUidNr, day) is presumably whichever arrives first, not the newest — confirm.
// NOTE(review): no watermark is set on this stream, so the deduplication state presumably
// grows without bound — confirm.
val resultDF=spark.sql("select dcl.smtUidNr,dcl,inv,evt,smt,cast(msgTs as timestamp)msgTs,msgInfSrcCd from ucdx").withColumn("date_timestamp",to_date(col("msgTs"))).dropDuplicates(Seq("smtUidNr","date_timestamp")).drop("date_timestamp")
resultDF.createOrReplaceTempView("final_tab")
// Collapse to one row per smtUidNr by taking max() of every other column.
// Each max() is evaluated independently, so the resulting row can combine values that
// came from different input rows; "distinct" adds nothing on top of the GROUP BY.
val finalDF=spark.sql("select distinct smtUidNr,max(dcl) as dcl,max(inv) as inv,max(evt) as evt,max(smt) as smt,max(msgTs) as msgTs,max(msgInfSrcCd) as msgInfSrcCd from final_tab group by smtUidNr")
// Start the query in "update" output mode; every micro-batch is passed to upsertToDelta.
finalDF.writeStream.format("delta").foreachBatch(upsertToDelta _).outputMode("update").start()
Is it true that Structured Streaming does not support aggregations, window functions, and ORDER BY clauses? How can I modify my code so that the Delta table contains only one record per smtUidNr?