I want to load files based on the timestamp in their file names. The files live in an S3 bucket. When I run my script locally, reading from a local folder, it works without issue. When I run it on EC2 against the S3 bucket, java.io.File.listFiles returns an empty array.
What am I missing?
import main.scala.util.Util._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import scala.collection.Seq
import org.apache.spark.sql.types.StringType
object Test {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().getOrCreate()
    import spark.implicits._

    val path = "s3://my-bucket/subbucket/"
    val now: String = extractDate("2017-01-12").toString
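    // works when path is a local folder; on EC2 this listFiles call comes back empty for the S3 path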
    val files = new java.io.File(path)
      .listFiles
      .filter(p => date_filter(p, now))
      .map(path + _.getName)
      .toSeq
    val df = spark.read.csv(files: _*)
    df.show()
  }
}
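My best guess is that java.io.File only knows about the local filesystem, so listing an s3:// path this way can't work, but I'm not sure what the right replacement is. Is something like the following, going through Hadoop's FileSystem API, the intended approach? This is an untested sketch: the contains(now) check is a hypothetical stand-in for my date_filter (which takes a java.io.File), and I'm unsure whether the scheme should be s3:// or s3a:// outside of EMR.

import java.net.URI
import org.apache.hadoop.fs.{FileSystem, Path}

// resolve the filesystem for the bucket from Spark's Hadoop configuration
val fs = FileSystem.get(new URI(path), spark.sparkContext.hadoopConfiguration)

// list the objects under the prefix and keep the ones whose name matches the date
val files = fs.listStatus(new Path(path))
  .map(_.getPath.toString)
  .filter(_.contains(now)) // hypothetical stand-in for my date_filter
  .toSeq

val df = spark.read.csv(files: _*)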