The following code fails with a schema mismatch, specifically a nullability mismatch, on Spark 3.2.2 with Delta 1.2.1. Is this expected behavior on these versions? And can the code be updated so that the declared schema is enforced?
import com.github.mrpowers.spark.fast.tests.DatasetComparer
import io.delta.tables.DeltaTable
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types._

import java.io.File
import java.nio.file.{Files, Paths}
import java.sql.Timestamp
import scala.collection.mutable
object DeltaMergeSparkApp extends App with DatasetComparer {
  val spark = getSession
  println("Spark version: " + spark.version)
  println("Delta version: " + io.delta.VERSION)

  // Declared target schema: id and name are non-nullable
  val TargetSchema = StructType(Seq(
    StructField("id", IntegerType, nullable = false),
    StructField("name", StringType, nullable = false),
    StructField("upd_ts", TimestampType, nullable = true),
    StructField("upd_usr", StringType, nullable = true)
  ))
  val sourceDf = createSourceDf()
  println("Source dataframe (updates): ")
  sourceDf.show()
  sourceDf.printSchema()

  val targetDf = createTargetDf()
  println("Target table: ")
  val tempDir = Paths.get("target/upsertHappyPath")
  FileUtils.deleteDirectory(new File(tempDir.toString))
  println(s"Create temp dir: $tempDir")
  Files.createDirectory(tempDir)

  println("Schema before save: ")
  targetDf.printSchema()
  println("Create Target table at " + tempDir)
  targetDf.write.format("delta").save(tempDir.toString)
  println("Schema after save: ")
  spark.read.format("delta").load(tempDir.toString).printSchema()
  // Custom SQL update expressions; current_timestamp() is passed as a SQL
  // expression string rather than by interpolating the Column object
  val updateMap: Map[String, String] = Map(
    "target.upd_ts" -> "current_timestamp()",
    "target.upd_usr" -> "'testUser'"
  )
println("Call upsert ...")
upsert(
tempDir.toString,
sourceDf,
sourceExcludeCols = List("crt_ts", "crt_usr"),
mergeOnCols = List("id"),
partitionPruneCols = List(),
customUpdateExpr = updateMap)
println("Complete upsert")
val expectedDf = createDataframeFromCollection(TargetSchema, Seq(
Row(1, "Alice-update", Timestamp.valueOf("2023-01-02 00:00:00"), "user1"), //update
Row(2, "Bob", Timestamp.valueOf("2023-01-01 00:00:00"), "user2"), //unchanged
Row(3, "Charlie-insert", Timestamp.valueOf("2023-01-02 00:00:00"), "user2"), //new insert
))
println("Show target table after updates")
val actualDf = spark.read.format("delta").load(tempDir.toString)
actualDf.show()
actualDf.printSchema()
assertSmallDatasetEquality(actualDf, expectedDf)
  def createTargetDf(): DataFrame = {
    createDataframeFromCollection(TargetSchema, Seq(
      Row(1, "Alice", Timestamp.valueOf("2023-01-01 00:00:00"), "user1"),
      Row(2, "Bob", Timestamp.valueOf("2023-01-01 00:00:00"), "user2")
    ))
  }

  def createSourceDf(): DataFrame = {
    val schema = StructType(Seq(
      StructField("id", IntegerType, nullable = false),
      StructField("name", StringType, nullable = false),
      StructField("crt_ts", TimestampType, nullable = true),
      StructField("crt_usr", StringType, nullable = true)
    ))
    val data = Seq(
      Row(1, "Alice-update", Timestamp.valueOf("2023-01-02 00:00:00"), "user1"), // update
      Row(3, "Charlie-insert", Timestamp.valueOf("2023-01-02 00:00:00"), "user2") // insert
    )
    createDataframeFromCollection(schema, data)
  }

  private def createDataframeFromCollection(schema: StructType, data: Seq[Row]) = {
    getSession.createDataFrame(getSession.sparkContext.parallelize(data), schema)
  }

  private def getSession = {
    SparkSession.builder()
      .appName("DeltaMergeSparkApp")
      .config("spark.driver.bindAddress", "127.0.0.1")
      .master("local[2]")
      .getOrCreate()
  }
  def upsert(targetPath: String, sourceDf: DataFrame, sourceExcludeCols: List[String] = List(),
             mergeOnCols: List[String], partitionPruneCols: List[String] = List(),
             customUpdateExpr: Map[String, String] = Map()): Unit = {
    println(s"targetPath: $targetPath, sourceExcludeCols: $sourceExcludeCols, mergeOnCols: $mergeOnCols")
    val targetTable = DeltaTable.forPath(spark, targetPath)
    // Merge every source column except the excluded audit columns;
    // partitionPruneCols is accepted but unused in this minimal repro
    val mergeCols = sourceDf.columns.filter(col => !sourceExcludeCols.contains(col))
    val updateExpr = mergeCols.map(colName => (s"target.$colName", s"source.$colName")).toMap
    val updateMapExpr = updateExpr ++ customUpdateExpr
    val insertExpr: mutable.Map[String, String] = mutable.Map[String, String]() ++= updateMapExpr
    println(s"updateExpr: $updateMapExpr")
    println(s"insertExpr: $insertExpr")
    val mergeCondition = mergeOnCols.map(col => s"target.$col = source.$col").mkString(" AND ")
    println(s"mergeCondition: $mergeCondition")

    targetTable.as("target")
      .merge(sourceDf.as("source"), mergeCondition)
      .whenMatched
      .updateExpr(updateMapExpr)
      .whenNotMatched
      .insertExpr(insertExpr.toMap) // insertExpr expects an immutable Map
      .execute()
  }
}
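The final assertSmallDatasetEquality call is what fails. I know the comparison itself can be relaxed via the ignoreNullable flag that spark-fast-tests exposes, but that only hides the symptom rather than enforcing the schema on the table:

  // Makes the test pass by skipping nullability in the schema comparison;
  // the Delta table itself is still all-nullable
  assertSmallDatasetEquality(actualDf, expectedDf, ignoreNullable = true)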
Error: the mismatch is in the nullability of the first two columns.
Exception in thread "main" com.github.mrpowers.spark.fast.tests.DatasetSchemaMismatch:
Actual Schema Field | Expected Schema Field
StructField(id,IntegerType,true) | StructField(id,IntegerType,false)
StructField(name,StringType,true) | StructField(name,StringType,false)
StructField(upd_ts,TimestampType,true) | StructField(upd_ts,TimestampType,true)
StructField(upd_usr,StringType,true) | StructField(upd_usr,StringType,true)
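For what it's worth, the full log below shows the nullability is already gone immediately after the initial targetDf.write save, before the merge even runs ("Schema after save" reports nullable = true for every column). One workaround I can see is to rebuild the DataFrame with the declared schema after reading it back, but that feels like patching the test rather than fixing the table; a minimal sketch:

  // Workaround sketch: spark.createDataFrame(rdd, schema) trusts the supplied
  // schema verbatim, including nullability, so the comparison passes again.
  // It rebuilds the plan from an RDD, though, and changes nothing on the table.
  val reRead = spark.read.format("delta").load(tempDir.toString)
  val withDeclaredSchema = spark.createDataFrame(reRead.rdd, TargetSchema)
  withDeclaredSchema.printSchema() // id and name report nullable = false again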
Full log:
Spark version: 3.2.2
Delta version: 1.2.1
Source dataframe (updates):
+---+--------------+-------------------+-------+
| id| name| crt_ts|crt_usr|
+---+--------------+-------------------+-------+
| 1| Alice-update|2023-01-02 00:00:00| user1|
| 3|Charlie-insert|2023-01-02 00:00:00| user2|
+---+--------------+-------------------+-------+
root
|-- id: integer (nullable = false)
|-- name: string (nullable = false)
|-- crt_ts: timestamp (nullable = true)
|-- crt_usr: string (nullable = true)
Target table:
Create temp dir: target/upsertHappyPath
Schema before save:
root
|-- id: integer (nullable = false)
|-- name: string (nullable = false)
|-- upd_ts: timestamp (nullable = true)
|-- upd_usr: string (nullable = true)
Create Target table at target/upsertHappyPath
Schema after save:
root
|-- id: integer (nullable = true)
|-- name: string (nullable = true)
|-- upd_ts: timestamp (nullable = true)
|-- upd_usr: string (nullable = true)
Call upsert ...
targetPath: target/upsertHappyPath, sourceExcludeCols: List(crt_ts, crt_usr), mergeOnCols: List(id)
updateExpr: Map(target.id -> source.id, target.name -> source.name, target.upd_ts -> current_timestamp(), target.upd_usr -> 'testUser')
insertExpr: Map(target.name -> source.name, target.upd_ts -> current_timestamp(), target.id -> source.id, target.upd_usr -> 'testUser')
mergeCondition: target.id = source.id
Complete upsert
Show target table after updates
+---+--------------+--------------------+--------+
| id| name| upd_ts| upd_usr|
+---+--------------+--------------------+--------+
| 1| Alice-update|2023-06-20 12:14:...|testUser|
| 3|Charlie-insert|2023-06-20 12:14:...|testUser|
| 2| Bob| 2023-01-01 00:00:00| user2|
+---+--------------+--------------------+--------+
root
|-- id: integer (nullable = true)
|-- name: string (nullable = true)
|-- upd_ts: timestamp (nullable = true)
|-- upd_usr: string (nullable = true)
Exception in thread "main" com.github.mrpowers.spark.fast.tests.DatasetSchemaMismatch:
Actual Schema Field | Expected Schema Field
StructField(id,IntegerType,true) | StructField(id,IntegerType,false)
StructField(name,StringType,true) | StructField(name,StringType,false)
StructField(upd_ts,TimestampType,true) | StructField(upd_ts,TimestampType,true)
StructField(upd_usr,StringType,true) | StructField(upd_usr,StringType,true)
at com.github.mrpowers.spark.fast.tests.DatasetComparer.assertSmallDatasetEquality(DatasetComparer.scala:87)
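The other direction I have considered is creating the Delta table with explicit DDL so that NOT NULL is recorded as a table constraint, instead of letting df.write.save derive the schema. Something like the sketch below (untested in this setup, and I am assuming Delta 1.2 supports NOT NULL column constraints on a path-based table):

  // Hypothetical alternative: declare the constraints in DDL up front
  spark.sql(
    s"""CREATE TABLE delta.`$tempDir` (
       |  id INT NOT NULL,
       |  name STRING NOT NULL,
       |  upd_ts TIMESTAMP,
       |  upd_usr STRING
       |) USING DELTA""".stripMargin)

Would that be the recommended way to enforce the schema, or is the nullable-after-save behavior simply expected with the DataFrame writer on these versions?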