Sharing an alternative solution in which the configuration is taken care of at the very beginning, rather than being handled later in the save
method (to separate configuration from logic).
def save(message: DataFrame, batch_id=None):
    """Append one micro-batch to a MongoDB collection.

    Written to be used as the callback of ``DataStreamWriter.foreachBatch``,
    which invokes its callback with two positional arguments: the micro-batch
    DataFrame and the batch id. The second parameter is therefore required
    for the streaming query to run; it defaults to ``None`` so that the
    function can still be called directly with only a DataFrame.

    No connection URI is given here; it is expected to come from the
    session-level ``spark.mongodb.output.uri`` setting.

    Args:
        message: The DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch as supplied by
            ``foreachBatch``. Accepted for signature compatibility; unused.
    """
    (
        message.write
        .format("mongo")
        .mode("append")
        .option("database", "db_name")
        .option("collection", "collection_name")
        .save()
    )
# Build (or reuse) the session with the MongoDB connector settings declared
# up front, so the write path does not need to carry any connection details.
spark: SparkSession = (
    SparkSession.builder
    .appName("MyApp")
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017")
    .config("spark.mongodb.output.uri", "mongodb://localhost:27017")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .master("local")
    .getOrCreate()
)

# Streaming source: text lines read from a socket on localhost:9999.
df: DataFrame = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)

# Hand every micro-batch to `save`, then block until the query stops.
query: StreamingQuery = df.writeStream.foreachBatch(save).start()
query.awaitTermination()