import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, DateType
sp = SparkSession.builder.config("spark.jars",
os.getcwd() + "/jars/spark-sql-kafka-0-10_2.12-3.4.1.jar" + "," + os.getcwd() + "/jars/kafka-clients-3.3.2.jar").appName("kafka_project").getOrCreate()
schema = StructType([StructField("Region", StringType(), True),
StructField("States", StringType(), True),
StructField("Max.Demand Met during the day(MW)", FloatType(), True),
StructField("Shortage during maximum Demand(MW)", FloatType(), True),
StructField("Energy Met (MU)", FloatType(), True),
StructField("date", DateType(), True)
])
df = sp.readStream.format("csv").schema(schema).option("header", "true").load( "Daily_Power_Gen_States_march_23.csv")
df.selectExpr("CAST(States as String) as value") \
.writeStream \
.format("kafka") \
.outputMode("append") \
.option("kafka.bootstrap.servers", "confluent_endpoint:9092") \
.option("checkpointLocation", "file:///home/ahmad/PycharmProjects/Kafka/checkpoints") \
.option("topic", "topic_0") \
.start() \
.awaitTermination()
I am completely new to this and trying to learn kafka and pyspark streaming but when I run the above code it gives me error as:
Traceback (most recent call last):
File "/home/ahmad/PycharmProjects/Kafka/main.py", line 32, in <module>
.awaitTermination()
File "/home/ahmad/.local/lib/python3.10/site-packages/pyspark/sql/streaming/query.py", line 201, in awaitTermination
return self._jsq.awaitTermination()
File "/home/ahmad/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1322, in __call__
return_value = get_return_value(
File "/home/ahmad/.local/lib/python3.10/site-packages/pyspark/errors/exceptions/captured.py", line 175, in deco
raise converted from None
pyspark.errors.exceptions.captured.StreamingQueryException: [STREAM_FAILED] Query [id = f97d825a-ed65-464a-89f2-807e8ff2c691, runId = 587e862c-6ad1-4a91-af55-86bffde5f1f9] terminated with exception: Option 'basePath' must be a directory
When removing the ".awaitTermination()" function from the code the error disappears but no output is generated on Kafka. It would be really appreciable if anyone guides me through this.