I am getting the below error when trying to run the merge command. Error does not have enough information about what's the cause of the error. I don't see any failed tasks in the Spark UI. Any suggestions on how to debug this issue?
Command I am trying to run
import time
from delta.tables import *
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number, lit
import boto3
spark = SparkSession.builder.getOrCreate()
df = spark.sql(f"SELECT * FROM raw_snapshots.deribit_optionleg")
target_df = DeltaTable.forPath(spark, 's3://datalake/raw/public.dbt')
target_df.alias("t1").merge(df.alias("s1"), "t1.id = s1.id") \
.whenMatchedUpdate(set =
{
"t1.id": "s1.id",
}
) \
.execute()
Error:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-1086065644019466> in <module>
13 # print(target_df.rdd.getNumPartitions())
14
---> 15 target_df.alias("t1").merge(df.alias("s1"), "t1.id = s1.id") \
16 .whenMatchedUpdate(set =
17 {
/databricks/spark/python/lib/py4j-0.10.9.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o490.execute.
: org.apache.spark.util.SparkFatalException
at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.$anonfun$relationFuture$1(BroadcastExchangeExec.scala:212)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:852)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$4(SQLExecution.scala:395)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$3(SQLExecution.scala:395)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$2(SQLExecution.scala:394)
at com.databricks.sql.transaction.tahoe.OptimisticTransaction$.withActive(OptimisticTransaction.scala:133)
at org.apache.spark.sql.execution.SQLExecution$.withOptimisticTransaction(SQLExecution.scala:410)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$1(SQLExecution.scala:393)
at java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604)
at org.apache.spark.util.threads.SparkThreadLocalCapturingRunnable.$anonfun$run$1(SparkThreadLocalForwardingThreadPoolExecutor.scala:104)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.util.threads.SparkThreadLocalCapturingHelper.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:68)
at org.apache.spark.util.threads.SparkThreadLocalCapturingHelper.runWithCaptured$(SparkThreadLocalForwardingThreadPoolExecutor.scala:54)
at org.apache.spark.util.threads.SparkThreadLocalCapturingRunnable.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:101)
at org.apache.spark.util.threads.SparkThreadLocalCapturingRunnable.run(SparkThreadLocalForwardingThreadPoolExecutor.scala:104)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)