
I'm trying to figure out how to design a Delta table that can safely handle concurrent writes.

To simulate this, I've created the following code snippet:

import tempfile
import threading

from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

spark = SparkSession.builder.appName('test').getOrCreate()

SCHEMA = StructType(
    [
        StructField("uuid", StringType(), nullable=False),
        StructField("column_1", StringType(), nullable=False),
        StructField("column_2", StringType(), nullable=False),
        StructField("product_name", StringType(), nullable=False),
        StructField("category_id", IntegerType(), nullable=False),
        StructField("checksum", StringType(), nullable=False),
    ]
)

def merge_df(df, category_id, target_delta_table):
    # Restrict the merge to a single partition so that merges for different
    # category_id values operate on disjoint data.
    join_condition = (
        f"target.category_id = {category_id} AND target.checksum = source.checksum"
    )

    # Delete target rows in the same partition that no longer appear in the source.
    delete_condition = f"target.category_id = {category_id}"

    update_condition = "target.uuid <> source.uuid"
    update_set = {"column_1": "source.column_1", "column_2": "source.column_2"}

    merge_statement = (
        target_delta_table.alias("target")
        .merge(df.alias("source"), join_condition)
        .whenNotMatchedInsertAll()
        .whenNotMatchedBySourceDelete(condition=delete_condition)
        .whenMatchedUpdate(condition=update_condition, set=update_set)
    )
    merge_statement.execute()


data = [
    ("001", "100", "1", "Product A", 1, "123456789"),
    ("002", "200", "1", "Product B", 2, "987654321"),
    ("003", "300", "1", "Product C", 1, "246813579"),
]
initial_df = spark.createDataFrame(data, SCHEMA)

temp_dir = tempfile.mkdtemp()

initial_df.write.format("delta").option(
            "delta.enableChangeDataFeed", True
        ).partitionBy('category_id').save(temp_dir)

target_table = DeltaTable.forPath(spark, temp_dir)

new_data_1 = [("005", "500", "5", "Product G", 1, "aaabbbccc")]
category_id_1 = 1
df_1 = spark.createDataFrame(new_data_1, SCHEMA)

new_data_2 = [("007", "700", "7", "Product O", 2, "111555")]
category_id_2 = 2
df_2 = spark.createDataFrame(new_data_2, SCHEMA)

threads = []
t1 = threading.Thread(target=merge_df, args=(df_1, category_id_1, target_table))
t1.start()
threads.append(t1)

t2 = threading.Thread(target=merge_df, args=(df_2, category_id_2, target_table))
t2.start()
threads.append(t2)

for thread in threads:
    thread.join()


I've tried to follow the advice from https://docs.delta.io/latest/concurrency-control.html#concurrentappendexception, but I keep getting delta.exceptions.ConcurrentAppendException. Why doesn't the Delta table recognize that these parallel merge statements point to different partitions?
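
For reference, this is roughly how I understood the docs' recommendation to make the merge condition explicitly disjoint on the partition column (the extra source.category_id predicate is my own reading of the docs, not something the snippet above currently includes):

# My interpretation of the docs: pin both sides of the merge condition
# to the same partition value so concurrent merges touch disjoint partitions.
join_condition = (
    f"target.category_id = {category_id} "
    f"AND source.category_id = {category_id} "
    f"AND target.checksum = source.checksum"
)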
