0
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, DateType

# Build a SparkSession with the Kafka connector jars on the classpath.
# spark-sql-kafka provides the "kafka" sink format; kafka-clients is its
# runtime dependency.
sp = SparkSession.builder.config(
        "spark.jars",
        os.getcwd() + "/jars/spark-sql-kafka-0-10_2.12-3.4.1.jar" + ","
        + os.getcwd() + "/jars/kafka-clients-3.3.2.jar") \
    .appName("kafka_project").getOrCreate()

# Explicit schema for the incoming CSV rows. Streaming file sources cannot
# infer schemas, so this must be supplied up front.
schema = StructType([StructField("Region", StringType(), True),
                     StructField("States", StringType(), True),
                     StructField("Max.Demand Met during the day(MW)", FloatType(), True),
                     StructField("Shortage during maximum Demand(MW)", FloatType(), True),
                     StructField("Energy Met (MU)", FloatType(), True),
                     StructField("date", DateType(), True)
                     ])

# BUG FIX: Spark's streaming file source monitors a *directory* for new
# files; passing a single file path raises
#   StreamingQueryException: Option 'basePath' must be a directory
# Place Daily_Power_Gen_States_march_23.csv inside this directory and point
# load() at the directory itself.
df = sp.readStream.format("csv").schema(schema).option("header", "true") \
    .load("input_csv/")

# Kafka sink requires a "value" column (bytes or string); cast the States
# column to serve as the message value. The query runs until terminated.
# NOTE(review): "confluent_endpoint:9092" is a placeholder — replace with
# the real bootstrap server address; the checkpoint directory must be on a
# filesystem that survives restarts for exactly-once delivery.
df.selectExpr("CAST(States as String) as value") \
    .writeStream \
    .format("kafka") \
    .outputMode("append") \
    .option("kafka.bootstrap.servers", "confluent_endpoint:9092") \
    .option("checkpointLocation", "file:///home/ahmad/PycharmProjects/Kafka/checkpoints") \
    .option("topic", "topic_0") \
    .start() \
    .awaitTermination()

I am completely new to this and am trying to learn Kafka and PySpark streaming, but when I run the above code it gives me the following error:

Traceback (most recent call last):
  File "/home/ahmad/PycharmProjects/Kafka/main.py", line 32, in <module>
    .awaitTermination()
  File "/home/ahmad/.local/lib/python3.10/site-packages/pyspark/sql/streaming/query.py", line 201, in awaitTermination
    return self._jsq.awaitTermination()
  File "/home/ahmad/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1322, in __call__
    return_value = get_return_value(
  File "/home/ahmad/.local/lib/python3.10/site-packages/pyspark/errors/exceptions/captured.py", line 175, in deco
    raise converted from None
pyspark.errors.exceptions.captured.StreamingQueryException: [STREAM_FAILED] Query [id = f97d825a-ed65-464a-89f2-807e8ff2c691, runId = 587e862c-6ad1-4a91-af55-86bffde5f1f9] terminated with exception: Option 'basePath' must be a directory

When I remove the `.awaitTermination()` call from the code, the error disappears, but no output is produced on the Kafka topic. I would really appreciate it if anyone could guide me through this.

ARKHAN
  • 401
  • 2
  • 5
  • I wouldn't recommend using Spark for this. Just use native python libraries. More specifically, your checkpoint location should be shared filesystem, not local disk... Also, remove your console writer – OneCricketeer Aug 06 '23 at 14:41

0 Answers0