0

I am getting the error below when trying to run a Delta Lake merge command. The error message does not contain enough information about the cause, and I don't see any failed tasks in the Spark UI. Any suggestions on how to debug this issue?

The command I am trying to run:

import time
# Explicit import instead of `from delta.tables import *` — DeltaTable is the
# only name this script uses from delta.tables, and star imports hide origins.
from delta.tables import DeltaTable
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number, lit
import boto3


# Reuse the active Spark session (on Databricks one already exists).
spark = SparkSession.builder.getOrCreate()

# Source rows for the merge. NOTE: plain string — the previous f-string had no
# placeholders, so the `f` prefix was dead weight.
source_df = spark.sql("SELECT * FROM raw_snapshots.deribit_optionleg")

# Target Delta table, addressed by its S3 path (a DeltaTable handle, not a
# DataFrame — named accordingly).
target_table = DeltaTable.forPath(spark, 's3://datalake/raw/public.dbt')

# Upsert: match target and source on `id`.
# NOTE(review): updating "t1.id" to "s1.id" is a no-op — the merge condition
# already guarantees t1.id = s1.id for every matched row. Presumably other
# columns were meant to be updated here (or whenNotMatchedInsert added);
# confirm the intended column set. The SparkFatalException in the stack trace
# originates in BroadcastExchangeExec — a failed broadcast of the source side
# (often a driver OOM / broadcast timeout) rather than a task failure, which
# is why no failed tasks appear in the Spark UI.
target_table.alias("t1").merge(source_df.alias("s1"), "t1.id = s1.id") \
    .whenMatchedUpdate(set=
        {
            "t1.id": "s1.id",
        }
    ) \
    .execute()

Error:

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<command-1086065644019466> in <module>
     13 # print(target_df.rdd.getNumPartitions())
     14 
---> 15 target_df.alias("t1").merge(df.alias("s1"), "t1.id = s1.id") \
     16    .whenMatchedUpdate(set =
     17       {

/databricks/spark/python/lib/py4j-0.10.9.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    324             value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
    325             if answer[1] == REFERENCE_TYPE:
--> 326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
    328                     format(target_id, ".", name), value)

Py4JJavaError: An error occurred while calling o490.execute.
: org.apache.spark.util.SparkFatalException
    at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.$anonfun$relationFuture$1(BroadcastExchangeExec.scala:212)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:852)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$4(SQLExecution.scala:395)
    at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$3(SQLExecution.scala:395)
    at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$2(SQLExecution.scala:394)
    at com.databricks.sql.transaction.tahoe.OptimisticTransaction$.withActive(OptimisticTransaction.scala:133)
    at org.apache.spark.sql.execution.SQLExecution$.withOptimisticTransaction(SQLExecution.scala:410)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$1(SQLExecution.scala:393)
    at java.util.concurrent.CompletableFuture$AsyncSupply.run(CompletableFuture.java:1604)
    at org.apache.spark.util.threads.SparkThreadLocalCapturingRunnable.$anonfun$run$1(SparkThreadLocalForwardingThreadPoolExecutor.scala:104)
    at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
    at org.apache.spark.util.threads.SparkThreadLocalCapturingHelper.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:68)
    at org.apache.spark.util.threads.SparkThreadLocalCapturingHelper.runWithCaptured$(SparkThreadLocalForwardingThreadPoolExecutor.scala:54)
    at org.apache.spark.util.threads.SparkThreadLocalCapturingRunnable.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:101)
    at org.apache.spark.util.threads.SparkThreadLocalCapturingRunnable.run(SparkThreadLocalForwardingThreadPoolExecutor.scala:104)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
VE88
  • 125
  • 1
  • 5

0 Answers0