The following code fails with a schema mismatch, specifically a nullability mismatch, on Spark 3.2.2 with Delta 1.2.1. Is this expected behavior on these versions? And can the code be updated so that the declared schema is enforced?
import com.github.mrpowers.spark.fast.tests.DatasetComparer
import io.delta.tables.DeltaTable
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types._

import java.io.File
import java.nio.file.{Files, Paths}
import java.sql.Timestamp
import scala.collection.mutable
object DeltaMergeSparkApp extends App with DatasetComparer {
  val spark = getSession
  println("Spark version: " + spark.version)
  println("Delta version: " + io.delta.VERSION)

  // Declared target schema: id and name are non-nullable
  val TargetSchema = StructType(Seq(
    StructField("id", IntegerType, nullable = false),
    StructField("name", StringType, nullable = false),
    StructField("upd_ts", TimestampType, nullable = true),
    StructField("upd_usr", StringType, nullable = true)
  ))
  val sourceDf = createSourceDf()
  println("Source dataframe (updates): ")
  sourceDf.show()
  sourceDf.printSchema()

  val targetDf = createTargetDf()
  println("Target table: ")
  val tempDir = Paths.get("target/upsertHappyPath")
  FileUtils.deleteDirectory(new File(tempDir.toString))
  println(s"Create temp dir: $tempDir")
  Files.createDirectory(tempDir)

  println("Schema before save: ")
  targetDf.printSchema()
  println("Create Target table at " + tempDir)
  targetDf.write.format("delta").save(tempDir.toString)
  println("Schema after save: ")
  spark.read.format("delta").load(tempDir.toString).printSchema()
  // Custom SQL update expressions; current_timestamp() is passed as a SQL
  // expression string rather than by interpolating the Column object
  val updateMap: Map[String, String] = Map(
    "target.upd_ts" -> "current_timestamp()",
    "target.upd_usr" -> "'testUser'"
  )
println("Call upsert ...")
upsert(
tempDir.toString,
sourceDf,
sourceExcludeCols = List("crt_ts", "crt_usr"),
mergeOnCols = List("id"),
partitionPruneCols = List(),
customUpdateExpr = updateMap)
println("Complete upsert")
val expectedDf = createDataframeFromCollection(TargetSchema, Seq(
Row(1, "Alice-update", Timestamp.valueOf("2023-01-02 00:00:00"), "user1"), //update
Row(2, "Bob", Timestamp.valueOf("2023-01-01 00:00:00"), "user2"), //unchanged
Row(3, "Charlie-insert", Timestamp.valueOf("2023-01-02 00:00:00"), "user2"), //new insert
))
println("Show target table after updates")
val actualDf = spark.read.format("delta").load(tempDir.toString)
actualDf.show()
actualDf.printSchema()
assertSmallDatasetEquality(actualDf, expectedDf)
  def createTargetDf(): DataFrame = {
    createDataframeFromCollection(TargetSchema, Seq(
      Row(1, "Alice", Timestamp.valueOf("2023-01-01 00:00:00"), "user1"),
      Row(2, "Bob", Timestamp.valueOf("2023-01-01 00:00:00"), "user2")
    ))
  }

  def createSourceDf(): DataFrame = {
    val schema = StructType(Seq(
      StructField("id", IntegerType, nullable = false),
      StructField("name", StringType, nullable = false),
      StructField("crt_ts", TimestampType, nullable = true),
      StructField("crt_usr", StringType, nullable = true)
    ))
    val data = Seq(
      Row(1, "Alice-update", Timestamp.valueOf("2023-01-02 00:00:00"), "user1"), // update
      Row(3, "Charlie-insert", Timestamp.valueOf("2023-01-02 00:00:00"), "user2") // insert
    )
    createDataframeFromCollection(schema, data)
  }

  private def createDataframeFromCollection(schema: StructType, data: Seq[Row]) = {
    getSession.createDataFrame(getSession.sparkContext.parallelize(data), schema)
  }

  private def getSession = {
    SparkSession.builder()
      .appName("DeltaMergeSparkApp")
      .config("spark.driver.bindAddress", "127.0.0.1")
      .master("local[2]")
      .getOrCreate()
  }
  def upsert(targetPath: String, sourceDf: DataFrame, sourceExcludeCols: List[String] = List(),
             mergeOnCols: List[String], partitionPruneCols: List[String] = List(),
             customUpdateExpr: Map[String, String] = Map()): Unit = {
    println(s"targetPath: $targetPath, sourceExcludeCols: $sourceExcludeCols, mergeOnCols: $mergeOnCols")
    val targetTable = DeltaTable.forPath(spark, targetPath)
    // Merge every source column except the excluded audit columns;
    // partitionPruneCols is accepted but unused in this minimal repro
    val mergeCols = sourceDf.columns.filter(col => !sourceExcludeCols.contains(col))
    val updateExpr = mergeCols.map(colName => (s"target.$colName", s"source.$colName")).toMap
    val updateMapExpr = updateExpr ++ customUpdateExpr
    val insertExpr: mutable.Map[String, String] = mutable.Map[String, String]() ++= updateMapExpr
    println(s"updateExpr: $updateMapExpr")
    println(s"insertExpr: $insertExpr")
    val mergeCondition = mergeOnCols.map(col => s"target.$col = source.$col").mkString(" AND ")
    println(s"mergeCondition: $mergeCondition")

    targetTable.as("target")
      .merge(sourceDf.as("source"), mergeCondition)
      .whenMatched
      .updateExpr(updateMapExpr)
      .whenNotMatched
      .insertExpr(insertExpr.toMap) // insertExpr expects an immutable Map
      .execute()
  }
}
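The final assertSmallDatasetEquality call is what fails. I know the comparison itself can be relaxed via the ignoreNullable flag that spark-fast-tests exposes, but that only hides the symptom rather than enforcing the schema on the table:

  // Makes the test pass by skipping nullability in the schema comparison;
  // the Delta table itself is still all-nullable
  assertSmallDatasetEquality(actualDf, expectedDf, ignoreNullable = true)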
Error: the mismatch is in the nullability of the first two columns.
Exception in thread "main" com.github.mrpowers.spark.fast.tests.DatasetSchemaMismatch:
Actual Schema Field | Expected Schema Field
StructField(id,IntegerType,true) | StructField(id,IntegerType,false)
StructField(name,StringType,true) | StructField(name,StringType,false)
StructField(upd_ts,TimestampType,true) | StructField(upd_ts,TimestampType,true)
StructField(upd_usr,StringType,true) | StructField(upd_usr,StringType,true)
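For what it's worth, the full log below shows the nullability is already gone immediately after the initial targetDf.write save, before the merge even runs ("Schema after save" reports nullable = true for every column). One workaround I can see is to rebuild the DataFrame with the declared schema after reading it back, but that feels like patching the test rather than fixing the table; a minimal sketch:

  // Workaround sketch: spark.createDataFrame(rdd, schema) trusts the supplied
  // schema verbatim, including nullability, so the comparison passes again.
  // It rebuilds the plan from an RDD, though, and changes nothing on the table.
  val reRead = spark.read.format("delta").load(tempDir.toString)
  val withDeclaredSchema = spark.createDataFrame(reRead.rdd, TargetSchema)
  withDeclaredSchema.printSchema() // id and name report nullable = false again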
Full log:
Spark version: 3.2.2
Delta version: 1.2.1
Source dataframe (updates):
+---+--------------+-------------------+-------+
| id| name| crt_ts|crt_usr|
+---+--------------+-------------------+-------+
| 1| Alice-update|2023-01-02 00:00:00| user1|
| 3|Charlie-insert|2023-01-02 00:00:00| user2|
+---+--------------+-------------------+-------+
root
|-- id: integer (nullable = false)
|-- name: string (nullable = false)
|-- crt_ts: timestamp (nullable = true)
|-- crt_usr: string (nullable = true)
Target table:
Create temp dir: target/upsertHappyPath
Schema before save:
root
|-- id: integer (nullable = false)
|-- name: string (nullable = false)
|-- upd_ts: timestamp (nullable = true)
|-- upd_usr: string (nullable = true)
Create Target table at target/upsertHappyPath
Schema after save:
root
|-- id: integer (nullable = true)
|-- name: string (nullable = true)
|-- upd_ts: timestamp (nullable = true)
|-- upd_usr: string (nullable = true)
Call upsert ...
targetPath: target/upsertHappyPath, sourceExcludeCols: List(crt_ts, crt_usr), mergeOnCols: List(id)
updateExpr: Map(target.id -> source.id, target.name -> source.name, target.upd_ts -> current_timestamp(), target.upd_usr -> 'testUser')
insertExpr: Map(target.name -> source.name, target.upd_ts -> current_timestamp(), target.id -> source.id, target.upd_usr -> 'testUser')
mergeCondition: target.id = source.id
Complete upsert
Show target table after updates
+---+--------------+--------------------+--------+
| id| name| upd_ts| upd_usr|
+---+--------------+--------------------+--------+
| 1| Alice-update|2023-06-20 12:14:...|testUser|
| 3|Charlie-insert|2023-06-20 12:14:...|testUser|
| 2| Bob| 2023-01-01 00:00:00| user2|
+---+--------------+--------------------+--------+
root
|-- id: integer (nullable = true)
|-- name: string (nullable = true)
|-- upd_ts: timestamp (nullable = true)
|-- upd_usr: string (nullable = true)
Exception in thread "main" com.github.mrpowers.spark.fast.tests.DatasetSchemaMismatch:
Actual Schema Field | Expected Schema Field
StructField(id,IntegerType,true) | StructField(id,IntegerType,false)
StructField(name,StringType,true) | StructField(name,StringType,false)
StructField(upd_ts,TimestampType,true) | StructField(upd_ts,TimestampType,true)
StructField(upd_usr,StringType,true) | StructField(upd_usr,StringType,true)
at com.github.mrpowers.spark.fast.tests.DatasetComparer.assertSmallDatasetEquality(DatasetComparer.scala:87)
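The other direction I have considered is creating the Delta table with explicit DDL so that NOT NULL is recorded as a table constraint, instead of letting df.write.save derive the schema. Something like the sketch below (untested in this setup, and I am assuming Delta 1.2 supports NOT NULL column constraints on a path-based table):

  // Hypothetical alternative: declare the constraints in DDL up front
  spark.sql(
    s"""CREATE TABLE delta.`$tempDir` (
       |  id INT NOT NULL,
       |  name STRING NOT NULL,
       |  upd_ts TIMESTAMP,
       |  upd_usr STRING
       |) USING DELTA""".stripMargin)

Would that be the recommended way to enforce the schema, or is the nullable-after-save behavior simply expected with the DataFrame writer on these versions?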