We run our ETL jobs in Databricks notebooks, which we execute via Azure Data Factory. We use the Delta table format and register all tables in Databricks. We are on Databricks Runtime 7.3 with Scala 2.12 and Spark 3.0.1. In our jobs we first DROP the table and delete the associated Delta files, which are stored in an Azure storage account, like so:
```python
# Executed from our PySpark notebook: drop the table, then remove
# the underlying Delta files on the storage account
spark.sql("DROP TABLE IF EXISTS db.TableName")
dbutils.fs.rm(pathToTable, recurse=True)
```
After the deletion we recreate the table like so:
```sql
CREATE TABLE IF NOT EXISTS db.TableName
(
  col1 STRING,
  col2 INT
)
USING DELTA
LOCATION '<pathToTable>'
```
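For context, the statement is submitted from a PySpark notebook cell roughly like this. This is a sketch reconstructed from the traceback below (where the variable `createTblCmd` appears); `pathToTable` is a placeholder for the real abfss:// path:

```python
# Sketch of how we submit the statement (reconstructed: createTblCmd is
# the variable name visible in the traceback below; pathToTable is a
# placeholder for the real abfss:// path on the storage account)
createTblCmd = f"""
CREATE TABLE IF NOT EXISTS db.TableName
(
  col1 STRING,
  col2 INT
)
USING DELTA
LOCATION '{pathToTable}'
"""
spark.sql(createTblCmd)
```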
During the CREATE TABLE statement we run into the following error (I removed the file path and runId for data protection):
*java.util.concurrent.ExecutionException: org.apache.spark.SparkException: Exception thrown in awaitResult*
```
Py4JJavaError Traceback (most recent call last)
<command-537730371905906> in <module>
25 """
26
---> 27 spark.sql(createTblCmd)
/databricks/spark/python/pyspark/sql/session.py in sql(self, sqlQuery)
707 [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]
708 """
--> 709 return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
710
711 @since(2.0)
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
125 def deco(*a, **kw):
126 try:
--> 127 return f(*a, **kw)
128 except py4j.protocol.Py4JJavaError as e:
129 converted = convert_exception(e.java_exception)
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o240.sql.
: java.util.concurrent.ExecutionException: org.apache.spark.SparkException: Exception thrown in awaitResult:
at com.google.common.util.concurrent.AbstractFuture$Sync.getValue(AbstractFuture.java:299)
at com.google.common.util.concurrent.AbstractFuture$Sync.get(AbstractFuture.java:286)
at com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:116)
at com.google.common.util.concurrent.Uninterruptibles.getUninterruptibly(Uninterruptibles.java:135)
at com.google.common.cache.LocalCache$Segment.getAndRecordStats(LocalCache.java:2344)
at com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2316)
at com.google.common.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2278)
at com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2193)
at com.google.common.cache.LocalCache.get(LocalCache.java:3932)
at com.google.common.cache.LocalCache$LocalManualCache.get(LocalCache.java:4721)
at com.databricks.sql.transaction.tahoe.DeltaLog$.apply(DeltaLog.scala:486)
at com.databricks.sql.transaction.tahoe.DeltaLog$.forTable(DeltaLog.scala:421)
at com.databricks.sql.transaction.tahoe.commands.CreateDeltaTableCommand.run(CreateDeltaTableCommand.scala:100)
at com.databricks.sql.transaction.tahoe.catalog.DeltaCatalog.com$databricks$sql$transaction$tahoe$catalog$DeltaCatalog$$createDeltaTable(DeltaCatalog.scala:141)
at com.databricks.sql.transaction.tahoe.catalog.DeltaCatalog.createTable(DeltaCatalog.scala:186)
at org.apache.spark.sql.execution.datasources.v2.CreateTableExec.run(CreateTableExec.scala:41)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:39)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:39)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:45)
at org.apache.spark.sql.Dataset.$anonfun$logicalPlan$1(Dataset.scala:234)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3709)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$5(SQLExecution.scala:116)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:249)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:101)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:845)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:77)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:199)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3707)
at org.apache.spark.sql.Dataset.<init>(Dataset.scala:234)
at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:104)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:845)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:101)
at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:680)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:845)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:675)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Exception thrown in awaitResult:
at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:431)
at org.apache.spark.util.ThreadUtils$.parallelMap(ThreadUtils.scala:400)
at com.databricks.sql.transaction.tahoe.DeltaFileProvider.$anonfun$localDeltaFiles$1(DeltaFileProvider.scala:86)
at scala.Option.map(Option.scala:230)
at com.databricks.sql.transaction.tahoe.DeltaFileProvider.com$databricks$sql$transaction$tahoe$DeltaFileProvider$$localDeltaFiles(DeltaFileProvider.scala:80)
at com.databricks.sql.transaction.tahoe.DeltaFileProvider.com$databricks$sql$transaction$tahoe$DeltaFileProvider$$localDeltaFiles$(DeltaFileProvider.scala:80)
at com.databricks.sql.transaction.tahoe.SnapshotEdge.com$databricks$sql$transaction$tahoe$DeltaFileProvider$$localDeltaFiles$lzycompute(SnapshotEdge.scala:51)
at com.databricks.sql.transaction.tahoe.SnapshotEdge.com$databricks$sql$transaction$tahoe$DeltaFileProvider$$localDeltaFiles(SnapshotEdge.scala:51)
at com.databricks.sql.transaction.tahoe.DeltaFileProvider.loadDeltaFiles(DeltaFileProvider.scala:123)
at com.databricks.sql.transaction.tahoe.DeltaFileProvider.loadDeltaFiles$(DeltaFileProvider.scala:121)
at com.databricks.sql.transaction.tahoe.SnapshotEdge.loadDeltaFiles(SnapshotEdge.scala:51)
at com.databricks.sql.transaction.tahoe.SnapshotEdge.$anonfun$x$4$1(SnapshotEdge.scala:99)
at com.databricks.sql.acl.CheckPermissions$.trusted(CheckPermissions.scala:1128)
at com.databricks.sql.transaction.tahoe.SnapshotEdge.x$4$lzycompute(SnapshotEdge.scala:90)
at com.databricks.sql.transaction.tahoe.SnapshotEdge.x$4(SnapshotEdge.scala:90)
at com.databricks.sql.transaction.tahoe.SnapshotEdge._metadata$lzycompute(SnapshotEdge.scala:90)
at com.databricks.sql.transaction.tahoe.SnapshotEdge._metadata(SnapshotEdge.scala:90)
at com.databricks.sql.transaction.tahoe.SnapshotEdge.metadata(SnapshotEdge.scala:143)
at com.databricks.sql.transaction.tahoe.SnapshotEdge.toString(SnapshotEdge.scala:237)
at java.lang.String.valueOf(String.java:2994)
at java.lang.StringBuilder.append(StringBuilder.java:131)
at com.databricks.sql.transaction.tahoe.Snapshot.$anonfun$new$1(Snapshot.scala:297)
at com.databricks.sql.transaction.tahoe.Snapshot.$anonfun$logInfo$1(Snapshot.scala:274)
at org.apache.spark.internal.Logging.logInfo(Logging.scala:57)
at org.apache.spark.internal.Logging.logInfo$(Logging.scala:56)
at com.databricks.sql.transaction.tahoe.Snapshot.logInfo(Snapshot.scala:274)
at com.databricks.sql.transaction.tahoe.Snapshot.<init>(Snapshot.scala:297)
at com.databricks.sql.transaction.tahoe.SnapshotEdge.<init>(SnapshotEdge.scala:67)
at com.databricks.sql.transaction.tahoe.SnapshotManagementEdge.createSnapshot(SnapshotManagementEdge.scala:57)
at com.databricks.sql.transaction.tahoe.SnapshotManagementEdge.createSnapshot$(SnapshotManagementEdge.scala:37)
at com.databricks.sql.transaction.tahoe.DeltaLog.createSnapshot(DeltaLog.scala:65)
at com.databricks.sql.transaction.tahoe.SnapshotManagement.getSnapshotAtInit(SnapshotManagement.scala:196)
at com.databricks.sql.transaction.tahoe.SnapshotManagement.getSnapshotAtInit$(SnapshotManagement.scala:187)
at com.databricks.sql.transaction.tahoe.DeltaLog.getSnapshotAtInit(DeltaLog.scala:65)
at com.databricks.sql.transaction.tahoe.SnapshotManagement.$init$(SnapshotManagement.scala:50)
at com.databricks.sql.transaction.tahoe.DeltaLog.<init>(DeltaLog.scala:69)
at com.databricks.sql.transaction.tahoe.DeltaLog$$anon$3.$anonfun$call$2(DeltaLog.scala:490)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.allowInvokingTransformsInAnalyzer(AnalysisHelper.scala:216)
at com.databricks.sql.transaction.tahoe.DeltaLog$$anon$3.$anonfun$call$1(DeltaLog.scala:490)
at com.databricks.logging.UsageLogging.$anonfun$recordOperation$4(UsageLogging.scala:431)
at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:239)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:234)
at com.databricks.logging.UsageLogging.withAttributionContext$(UsageLogging.scala:231)
at com.databricks.spark.util.PublicDBLogging.withAttributionContext(DatabricksSparkUsageLogger.scala:19)
at com.databricks.logging.UsageLogging.withAttributionTags(UsageLogging.scala:276)
at com.databricks.logging.UsageLogging.withAttributionTags$(UsageLogging.scala:269)
at com.databricks.spark.util.PublicDBLogging.withAttributionTags(DatabricksSparkUsageLogger.scala:19)
at com.databricks.logging.UsageLogging.recordOperation(UsageLogging.scala:412)
at com.databricks.logging.UsageLogging.recordOperation$(UsageLogging.scala:338)
at com.databricks.spark.util.PublicDBLogging.recordOperation(DatabricksSparkUsageLogger.scala:19)
at com.databricks.spark.util.PublicDBLogging.recordOperation0(DatabricksSparkUsageLogger.scala:56)
at com.databricks.spark.util.DatabricksSparkUsageLogger.recordOperation(DatabricksSparkUsageLogger.scala:129)
at com.databricks.spark.util.UsageLogger.recordOperation(UsageLogger.scala:71)
at com.databricks.spark.util.UsageLogger.recordOperation$(UsageLogger.scala:58)
at com.databricks.spark.util.DatabricksSparkUsageLogger.recordOperation(DatabricksSparkUsageLogger.scala:85)
at com.databricks.spark.util.UsageLogging.recordOperation(UsageLogger.scala:401)
at com.databricks.spark.util.UsageLogging.recordOperation$(UsageLogger.scala:380)
at com.databricks.sql.transaction.tahoe.DeltaLog$.recordOperation(DeltaLog.scala:385)
at com.databricks.sql.transaction.tahoe.metering.DeltaLogging.recordDeltaOperation(DeltaLogging.scala:108)
at com.databricks.sql.transaction.tahoe.metering.DeltaLogging.recordDeltaOperation$(DeltaLogging.scala:94)
at com.databricks.sql.transaction.tahoe.DeltaLog$.recordDeltaOperation(DeltaLog.scala:385)
at com.databricks.sql.transaction.tahoe.DeltaLog$$anon$3.call(DeltaLog.scala:489)
at com.databricks.sql.transaction.tahoe.DeltaLog$$anon$3.call(DeltaLog.scala:486)
at com.google.common.cache.LocalCache$LocalManualCache$1.load(LocalCache.java:4724)
at com.google.common.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3522)
at com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2315)
... 40 more
Caused by: java.io.FileNotFoundException: Operation failed: "The specified path does not exist.", 404, GET, https://<storageaccountname>.dfs.core.windows.net/<filepath>/_delta_log/00000000000000000001.json?timeout=90, PathNotFound, "The specified path does not exist. RequestId:xxx Time:xxx"
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.services.AbfsInputStream.readRemote(AbfsInputStream.java:321)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.services.AbfsInputStream.readInternal(AbfsInputStream.java:273)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.services.AbfsInputStream.readOneBlock(AbfsInputStream.java:200)
at shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.services.AbfsInputStream.read(AbfsInputStream.java:144)
at java.io.DataInputStream.read(DataInputStream.java:149)
at sun.nio.cs.StreamDecoder.readBytes(StreamDecoder.java:284)
at sun.nio.cs.StreamDecoder.implRead(StreamDecoder.java:326)
at sun.nio.cs.StreamDecoder.read(StreamDecoder.java:178)
at java.io.InputStreamReader.read(InputStreamReader.java:184)
at java.io.BufferedReader.fill(BufferedReader.java:161)
at java.io.BufferedReader.readLine(BufferedReader.java:324)
at java.io.BufferedReader.readLine(BufferedReader.java:389)
at org.apache.commons.io.IOUtils.readLines(IOUtils.java:1030)
at com.databricks.tahoe.store.DefaultLogStore.read(DefaultLogStore.scala:43)
at com.databricks.tahoe.store.DelegatingLogStore.read(DelegatingLogStore.scala:92)
at com.databricks.sql.transaction.tahoe.DeltaFileProvider.$anonfun$localDeltaFiles$2(DeltaFileProvider.scala:92)
at org.apache.spark.util.ThreadUtils$.$anonfun$parallelMap$2(ThreadUtils.scala:398)
at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
at scala.util.Success.$anonfun$map$1(Try.scala:255)
at scala.util.Success.map(Try.scala:213)
at scala.concurrent.Future.$anonfun$map$1(Future.scala:292)
at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
at org.apache.spark.util.threads.SparkThreadLocalCapturingRunnable.$anonfun$run$1(SparkThreadLocalForwardingThreadPoolExecutor.scala:104)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.util.threads.SparkThreadLocalCapturingHelper.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:68)
at org.apache.spark.util.threads.SparkThreadLocalCapturingHelper.runWithCaptured$(SparkThreadLocalForwardingThreadPoolExecutor.scala:54)
at org.apache.spark.util.threads.SparkThreadLocalCapturingRunnable.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:101)
at org.apache.spark.util.threads.SparkThreadLocalCapturingRunnable.run(SparkThreadLocalForwardingThreadPoolExecutor.scala:104)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
```
It seems like Spark is looking for a Delta log file but cannot find one, since we deleted all files before executing the CREATE TABLE statement. The question is why Spark is looking for an existing Delta log file at all; judging by the trace, `DeltaLog$.apply` goes through a Guava `LocalCache`, so a cached `DeltaLog` for that path may still reference the deleted `00000000000000000001.json`. The other strange thing is that this error occurs only sometimes, which makes it hard to debug.
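As a next debugging step, here is a minimal sketch of a pre-CREATE check we could add to confirm the directory is really gone. The `log_path` name and the check itself are ours for illustration, not part of the actual job:

```python
# Illustrative pre-CREATE diagnostic (not from the actual job): verify
# that the _delta_log directory the 404 points at no longer exists.
log_path = pathToTable.rstrip("/") + "/_delta_log"
try:
    leftovers = dbutils.fs.ls(log_path)
    print(f"_delta_log still has {len(leftovers)} entries: {leftovers}")
except Exception as e:
    # dbutils.fs.ls raises if the path does not exist, which is what we
    # expect right after the recursive rm
    print(f"_delta_log is gone as expected: {e}")
```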