This code works only when I set directory = "s3://bucket/folder/2022/10/18/4/*" instead of the fully wildcarded path used below:
from pyspark.sql.functions import from_json
from pyspark.streaming import StreamingContext
# Input location: every path level below the folder is a wildcard.
directory = "s3://bucket/folder/*/*/*/*/*"

# Streaming context built on the pre-existing `sc`, with a batch duration of 30
# (presumably `sc` is the SparkContext supplied by the shell/notebook -- confirm).
ssc = StreamingContext(sc, batchDuration=30)

# Text stream over the files found under `directory`.
stream_data = ssc.textFileStream(directory)
def readMyStream(rdd):
    """Parse one batch of JSON text into a DataFrame and display five columns.

    Empty batches are skipped. Non-empty batches are read as JSON with the
    "multiline" option enabled, narrowed to columns c1..c5 and shown.

    Relies on a ``spark`` object defined outside this function
    (presumably the active SparkSession -- confirm).

    Args:
        rdd: The batch to process. It must provide ``isEmpty()`` and be
            accepted by ``spark.read.json``.

    Returns:
        None. Progress messages and the table are written to stdout.
    """
    # Nothing arrived in this batch, so do not attempt a read.
    if rdd.isEmpty():
        return

    df = spark.read.option("multiline", "true").json(rdd)
    print('Started the Process')
    print('Selection of Columns')
    df = df.select("c1", "c2", "c3", "c4", "c5")
    df.show()
# Register the per-batch handler; it takes the batch RDD as its only argument.
stream_data.foreachRDD(readMyStream)

# Start the streaming computation and block until it terminates.
ssc.start()
ssc.awaitTermination()
The docs say that it supports POSIX glob patterns. Any help is appreciated. Thank you.