I am trying to read a file from an S3 bucket using Spark through Databricks Connect.
This is the code I am using:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Pull in the S3A connector and pass the AWS credentials through to Hadoop.
conf = SparkConf()
conf.set('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.0')
conf.set('spark.hadoop.fs.s3a.access.key', access_key)
conf.set('spark.hadoop.fs.s3a.secret.key', secret_access_key)

spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Read the CSV straight from the bucket and print a preview.
df = spark.read.format("csv").option("header", True).load('s3a://container/path/to/file.csv')
df.show()
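For reference, an equivalent way to apply the same S3A credentials to an already-created session is through its Hadoop configuration; the snippet below is only a sketch (it assumes access_key and secret_access_key are defined as above) and is not specific to Databricks Connect:

# Sketch: set the same S3A credentials on an existing session's Hadoop configuration.
# Assumes access_key / secret_access_key are the same values used in the SparkConf above.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", access_key)
hadoop_conf.set("fs.s3a.secret.key", secret_access_key)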
This works fine when I execute it in a Docker container that I spin up; however, it fails through Databricks Connect with the following error:
pyspark.dbutils.ExecutionError: An error occurred while calling o48.ls.
: com.databricks.service.SparkServiceRemoteException: java.nio.file.AccessDeniedException: getFileStatus on com.amazonaws.services.s3.model.AmazonS3Exception: Forbidden; request: HEAD Forbidden
at shaded.databricks.org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:244)
at shaded.databricks.org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:155)
at shaded.databricks.org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:2870)
at shaded.databricks.org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:2840)
at shaded.databricks.org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:2779)
at shaded.databricks.org.apache.hadoop.fs.s3a.S3AFileSystem.innerListStatus(S3AFileSystem.java:2449)
at shaded.databricks.org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$listStatus$11(S3AFileSystem.java:2428)
at shaded.databricks.org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:118)
at shaded.databricks.org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:112)
at shaded.databricks.org.apache.hadoop.fs.s3a.S3AFileSystem.listStatus(S3AFileSystem.java:2428)
at com.databricks.service.SparkServiceImpl$.$anonfun$fileSystemOperation0$2(SparkServiceImpl.scala:617)
at com.databricks.service.SparkServiceImpl$.withFileSystemExceptionHandler(SparkServiceImpl.scala:647)
at com.databricks.service.SparkServiceImpl$.fileSystemOperation0(SparkServiceImpl.scala:617)
at com.databricks.service.SparkServiceImpl$.$anonfun$fileSystemOperation$1(SparkServiceImpl.scala:184)
at com.databricks.logging.UsageLogging.$anonfun$recordOperation$4(UsageLogging.scala:431)
at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:239)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:234)
at com.databricks.logging.UsageLogging.withAttributionContext$(UsageLogging.scala:231)
at com.databricks.spark.util.PublicDBLogging.withAttributionContext(DatabricksSparkUsageLogger.scala:19)
at com.databricks.logging.UsageLogging.withAttributionTags(UsageLogging.scala:276)
at com.databricks.logging.UsageLogging.withAttributionTags$(UsageLogging.scala:269)
at com.databricks.spark.util.PublicDBLogging.withAttributionTags(DatabricksSparkUsageLogger.scala:19)
at com.databricks.logging.UsageLogging.recordOperation(UsageLogging.scala:412)
at com.databricks.logging.UsageLogging.recordOperation$(UsageLogging.scala:338)
at com.databricks.spark.util.PublicDBLogging.recordOperation(DatabricksSparkUsageLogger.scala:19)
at com.databricks.spark.util.PublicDBLogging.recordOperation0(DatabricksSparkUsageLogger.scala:56)
at com.databricks.spark.util.DatabricksSparkUsageLogger.recordOperation(DatabricksSparkUsageLogger.scala:131)
at com.databricks.spark.util.UsageLogger.recordOperation(UsageLogger.scala:71)
at com.databricks.spark.util.UsageLogger.recordOperation$(UsageLogger.scala:58)
at com.databricks.spark.util.DatabricksSparkUsageLogger.recordOperation(DatabricksSparkUsageLogger.scala:85)
at com.databricks.spark.util.UsageLogging.recordOperation(UsageLogger.scala:401)
at com.databricks.spark.util.UsageLogging.recordOperation$(UsageLogger.scala:380)
at com.databricks.service.SparkServiceImpl$.recordOperation(SparkServiceImpl.scala:92)
at com.databricks.service.SparkServiceImpl$.fileSystemOperation(SparkServiceImpl.scala:184)
at com.databricks.service.SparkServiceRPCHandler.execute0(SparkServiceRPCHandler.scala:663)
at com.databricks.service.SparkServiceRPCHandler.$anonfun$executeRPC0$1(SparkServiceRPCHandler.scala:451)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
at com.databricks.service.SparkServiceRPCHandler.executeRPC0(SparkServiceRPCHandler.scala:351)
at com.databricks.service.SparkServiceRPCHandler$$anon$2.call(SparkServiceRPCHandler.scala:302)
at com.databricks.service.SparkServiceRPCHandler$$anon$2.call(SparkServiceRPCHandler.scala:288)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at com.databricks.service.SparkServiceRPCHandler.$anonfun$executeRPC$1(SparkServiceRPCHandler.scala:338)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)
at com.databricks.service.SparkServiceRPCHandler.executeRPC(SparkServiceRPCHandler.scala:315)
at com.databricks.service.SparkServiceRPCServlet.doPost(SparkServiceRPCServer.scala:152)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:707)
at javax.servlet.http.HttpServlet.service(HttpServlet.java:790)
at org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:873)
at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:542)
at org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:205)
at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:480)
at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:144)
at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:132)
at org.eclipse.jetty.server.Server.handle(Server.java:505)
at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:370)
at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:267)
at org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:305)
at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:103)
at org.eclipse.jetty.io.ChannelEndPoint$2.run(ChannelEndPoint.java:117)
at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:333)
at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:310)
at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:168)
at org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:126)
at org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:366)
at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:698)
at org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:804)
at java.lang.Thread.run(Thread.java:750)
Caused by: com.amazonaws.services.s3.model.AmazonS3Exception: Forbidden; request: HEAD
Note: I've scrubbed some information related to the path of my file in the stack trace above.
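Since the failing request is a HEAD on the object, one way to check the keys themselves outside Spark is a direct head_object call with boto3; the bucket and key below are placeholders rather than my real path:

import boto3

# Sketch: issue the same kind of HEAD request that is failing, using the same keys.
# Bucket and key are placeholders; access_key / secret_access_key are as in the Spark config.
s3 = boto3.client(
    "s3",
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_access_key,
)
s3.head_object(Bucket="container", Key="path/to/file.csv")  # raises ClientError on 403

If this call also returns 403, the problem would be with the credentials or bucket policy rather than with Databricks Connect itself.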