I'm trying to figure out how to design a Delta table that is safe for concurrent writes.
To simulate this, I've created the following code snippet:
import tempfile
import threading

from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# assumes a Spark session that is already configured for Delta Lake
spark = SparkSession.builder.appName("test").getOrCreate()
SCHEMA = StructType(
    [
        StructField("uuid", StringType(), nullable=False),
        StructField("column_1", StringType(), nullable=False),
        StructField("column_2", StringType(), nullable=False),
        StructField("product_name", StringType(), nullable=False),
        StructField("category_id", IntegerType(), nullable=False),
        StructField("checksum", StringType(), nullable=False),
    ]
)
def merge_df(df, category_id, target_delta_table):
    # Scope the merge to a single partition, as suggested by the Delta concurrency docs.
    join_condition = f"target.category_id = '{category_id}' AND target.checksum == source.checksum"
    delete_condition = f"target.category_id = '{category_id}'"
    update_condition = "target.uuid <> source.uuid"
    update_set = {"column_1": "source.column_1", "column_2": "source.column_2"}
    merge_statement = (
        target_delta_table.alias("target")
        .merge(df.alias("source"), join_condition)
        .whenNotMatchedInsertAll()
        .whenNotMatchedBySourceDelete(condition=delete_condition)
        .whenMatchedUpdate(condition=update_condition, set=update_set)
    )
    merge_statement.execute()
data = [
    ("001", "100", "1", "Product A", 1, "123456789"),
    ("002", "200", "1", "Product B", 2, "987654321"),
    ("003", "300", "1", "Product C", 1, "246813579"),
]
initial_df = spark.createDataFrame(data, SCHEMA)
temp_dir = tempfile.mkdtemp()
initial_df.write.format("delta").option(
    "delta.enableChangeDataFeed", True
).partitionBy("category_id").save(temp_dir)
target_table = DeltaTable.forPath(spark, temp_dir)
new_data_1 = [("005", "500", "5", "Product G", 1, "aaabbbccc")]
category_id_1 = 1
df_1 = spark.createDataFrame(new_data_1, SCHEMA)
new_data_2 = [("007", "700", "7", "Product O", 2, "111555")]
category_id_2 = 2
df_2 = spark.createDataFrame(new_data_2, SCHEMA)
threads = []
t1 = threading.Thread(target=merge_df, args=(df_1, category_id_1, target_table))
t1.start()
threads.append(t1)
t2 = threading.Thread(target=merge_df, args=(df_2, category_id_2, target_table))
t2.start()
threads.append(t2)
for thread in threads:
    thread.join()
I've tried to follow the advice from https://docs.delta.io/latest/concurrency-control.html#concurrentappendexception (each merge condition is scoped to a single category_id, which is the partition column), but I keep getting delta.exceptions.ConcurrentAppendException. Why doesn't the Delta Lake table recognize that these parallel merge statements are pointing to different partitions?
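For completeness, the fallback I'm considering is simply retrying the merge when the conflict check fails, along these lines (merge_with_retry and max_retries are just illustrative names I made up, not something from the docs); I'd much rather understand why the partition isolation isn't being detected than rely on this:

import time
from delta.exceptions import ConcurrentAppendException

def merge_with_retry(df, category_id, target_delta_table, max_retries=3):
    # Re-run the merge if a concurrent transaction wins the conflict check.
    for attempt in range(max_retries):
        try:
            merge_df(df, category_id, target_delta_table)
            return
        except ConcurrentAppendException:
            if attempt == max_retries - 1:
                raise
            time.sleep(1)  # crude backoff before retrying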