I am trying to consume a stream from Kafka using PySpark. It keeps giving me warnings like these:
WARN NetworkClient: [Consumer clientId=consumer-spark-kafka-source-a422a51e-b0ef-45cc-bd5c-c327b8881359-1275625627-driver-0-1, groupId=spark-kafka-source-a422a51e-b0ef-45cc-bd5c-c327b8881359-1275625627-driver-0] Connection to node -1 (localhost/127.0.0.1:9092) could not be established. Broker may not be available.
WARN NetworkClient: [Consumer clientId=consumer-spark-kafka-source-a422a51e-b0ef-45cc-bd5c-c327b8881359-1275625627-driver-0-1, groupId=spark-kafka-source-a422a51e-b0ef-45cc-bd5c-c327b8881359-1275625627-driver-0] Bootstrap broker localhost:9092 (id: -1 rack: null) disconnected
and so on ...
I have checked my server.properties and zookeeper.properties, and everything seems fine.
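Before digging further, a quick sanity check I can run on the same machine as the driver, to see whether anything is listening on the broker port at all (a minimal sketch using only the Python standard library; host and port are the ones from my config):

import socket

# Plain TCP connect to the bootstrap address the consumer is warning about.
# If this fails, it is a connectivity/listener problem, not a Spark problem.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.settimeout(5)
    rc = s.connect_ex(("localhost", 9092))
    print("broker port reachable" if rc == 0 else "connect failed, errno {}".format(rc))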
Here is the Python code I am using:
import requests
import findspark


def retrieve_schema(topic: str) -> str:
    # fetch the latest value schema for the topic from the Schema Registry
    schema_registry_server = "localhost"
    schema_registry_port = "8081"
    schema_registry_url = "http://{}:{}".format(schema_registry_server, schema_registry_port)
    # the default TopicNameStrategy registers the subject as "<topic>-value" (note the hyphen)
    response_schema = requests.get(
        '{}/subjects/{}-value/versions/latest/schema'.format(schema_registry_url, topic))
    response_schema.raise_for_status()
    schema = response_schema.text
    return schema
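(With the Registry's default TopicNameStrategy, the subject for a topic's message values is "<topic>-value", so for my topic the GET above resolves to http://localhost:8081/subjects/ora-WALLET_TRANSACTIONS-value/versions/latest/schema.)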
if __name__ == '__main__':
    sparkDir = r"C:\spark\spark-3.1.3-bin-hadoop2.7"  # raw string so the backslashes are not treated as escapes
    findspark.init(sparkDir)
    # import pyspark only after findspark.init() has put it on sys.path
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col
    from pyspark.sql.avro.functions import from_avro

    spark = SparkSession.builder.appName("Kafka Pyspark Streaming").getOrCreate()

    topic_name = "ora-WALLET_TRANSACTIONS"
    kafka_bootstrap_server = "localhost:9092"
    # fetch the Avro schema (as a JSON string) from the Schema Registry
    mySchema_json = retrieve_schema(topic_name)

    sampleDataFrame = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", kafka_bootstrap_server) \
        .option("subscribe", topic_name) \
        .option("startingOffsets", "earliest") \
        .load()
    sampleDataFrame.printSchema()

    avroValues = sampleDataFrame.select(
        from_avro(data=col("value"), jsonFormatSchema=mySchema_json).alias("value"))
    avroData = avroValues.select("value.TRX_ID", "value.ACCOUNT_NUMBER",
                                 "value.TRANSACTION_AMOUNT", "value.TRANSACTION_DATE")
    avroData.printSchema()

    print("\nStart writing the stream")
    dStream = avroData \
        .writeStream \
        .queryName("data") \
        .format("console") \
        .start()
    dStream.awaitTermination()
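One assumption worth stating: I launch this as a plain Python script, and neither the Kafka source nor from_avro ships with Spark itself, so the corresponding packages have to be on the classpath. A sketch of how the session could be built with them (coordinates assume Spark 3.1.3 with Scala 2.12):

spark = SparkSession.builder \
    .appName("Kafka Pyspark Streaming") \
    .config("spark.jars.packages",
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.3,"
            "org.apache.spark:spark-avro_2.12:3.1.3") \
    .getOrCreate()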
EDIT: I switched from the Avro converter to the JSON converter, and I added to the previous code a StructType schema that matches the records in the topic:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

mySchema = StructType([
    StructField("TRX_ID", StringType(), True),
    StructField("ACCOUNT_NUMBER", StringType(), True),
    StructField("TRANSACTION_AMOUNT", DoubleType(), True),
    StructField("TRANSACTION_CURRENCY", StringType(), True),
    StructField("TRANSACTION_DATE", TimestampType(), True),
    StructField("TRANSACTION_REFERENCE_ARN", StringType(), True),
    StructField("REFERENCE_NUMBER", StringType(), True),
    StructField("WALLET_NUMBER", StringType(), True),
    StructField("SOURCE_ACCOUNT_NUMBER", StringType(), True),
    StructField("DESTINATION", StringType(), True),
    StructField("FC_REFERENCE_NUMBER", StringType(), True),
    StructField("TRANSACTION_TYPE", StringType(), True),
    StructField("PROVIDER", StringType(), True),
    StructField("DEST_BANK_CODE", StringType(), True),
    StructField("BILL_KEY", StringType(), True),
    StructField("GHIPS_REF", StringType(), True),
    StructField("USER_CODE", StringType(), True),
    StructField("BANK_CODE", StringType(), True)
])
Then I used from_json instead of from_avro. I do get the stream back now, but it contains only null values.
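For completeness, the from_json step I used is essentially this (a sketch; it assumes the connector now writes plain JSON strings into the message value):

from pyspark.sql.functions import from_json, col

# value arrives as binary, so cast it to string before parsing with mySchema;
# from_json yields a null struct whenever the string does not match the schema,
# which is what a whole column of nulls usually points to
jsonValues = sampleDataFrame.select(
    from_json(col("value").cast("string"), mySchema).alias("value"))
jsonData = jsonValues.select("value.TRX_ID", "value.ACCOUNT_NUMBER",
                             "value.TRANSACTION_AMOUNT", "value.TRANSACTION_DATE")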