I use PySpark to readStream from Kafka, do some processing, and writeStream the result to a Delta table.

pyspark 3.2.1
io.delta 1.2.2
hadoop 3.3.0

The code below does not write any results to the output Delta table, neither when deployed on Kubernetes nor when running in Databricks. Am I producing no data at all? When I run display() on the stream (without the writeStream part) in Databricks, I do see data. What's happening?
from pyspark.sql.functions import col, count, expr, from_json, window
from pyspark.sql.types import IntegerType, StringType, StructField, StructType


def run(spark, window_duration, watermark_delay):
    input_time_col = "timestamp"
    keep_original_cols = [input_time_col, "topic"]

    # schema of the JSON payload carried in the Kafka value
    raw_message_data = StructType([
        StructField("col1", StringType(), True),
        StructField("col2", StringType(), True),
        StructField("col3", StringType(), True),
        StructField("col4", IntegerType(), True),
        StructField("col5", IntegerType(), True),
    ])

    return (spark
        .readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", KAFKA_SERVERS)
        .option("subscribe", INPUT_TOPIC)
        .option("startingOffsets", STARTING_OFFSETS)
        .option("maxOffsetsPerTrigger", MAX_OFFSETS_PER_TRIGGER)
        .option("failOnDataLoss", FAIL_ON_DATA_LOSS)
        .option("minPartitions", MIN_PARTITIONS)
        .load()
        # parse the JSON value, keep the Kafka timestamp and topic columns
        .withColumn("tmp", from_json(col("value").cast("string"), raw_message_data))
        .select("tmp.*", *keep_original_cols)
        # windowed count per period, with a watermark on the event time
        .withWatermark(input_time_col, watermark_delay)
        .groupBy(
            window(col(input_time_col), window_duration).alias("period"),
        )
        .agg(
            count("*").alias("query_count")
        )
        .withColumn("period_start", expr("period.start"))
        .withColumn("date", expr("date(period_start)"))
        .withColumn("hour", expr("hour(period_start)"))
        .withColumn("minute", expr("minute(period_start)"))
        # write to Delta
        .writeStream
        .outputMode(OUTPUT_MODE)
        # .partitionBy("date", "hour")
        .format(OUTPUT_FORMAT)
        .option("mergeSchema", "true")
        .option("checkpointLocation", CHECKPOINT_LOCATION))


query = run(spark, "2 minutes", "1 minutes")
query.start(OUTPUT_PATH).awaitTermination()
I see the _delta_log directory being created at the output path, but no data files are ever appended.
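If it helps with diagnosing this, I could replace awaitTermination() with a progress check along these lines (just a sketch; sq is only an illustrative name for the StreamingQuery handle returned by start()):

import time

sq = run(spark, "2 minutes", "1 minutes").start(OUTPUT_PATH)
while sq.isActive:
    # status shows whether a trigger is currently running or the stream is waiting for data
    print(sq.status)
    # lastProgress contains numInputRows, the current watermark and sink details for the last micro-batch
    print(sq.lastProgress)
    time.sleep(60)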
EDIT: constants:
KAFKA_SERVERS = "...my kafka servers..."
INPUT_TOPIC = "some-topic"
MAX_OFFSETS_PER_TRIGGER = "1000"
STARTING_OFFSETS = "latest"
FAIL_ON_DATA_LOSS = "false"
MIN_PARTITIONS = "288"
WINDOW_DURATION = "2 minutes"
WATERMARK_DELAY = "30 seconds"
OUTPUT_FORMAT = "delta"
OUTPUT_MODE = "append"
CHECKPOINT_LOCATION = "wasbs://...someCheckpointLocation"
OUTPUT_TABLE_PATH = "wasbs://....blob.core.windows.net/output"
PARTITIONING_COLS = ["col1", "col2"]
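For completeness: outside Databricks the SparkSession needs the Delta Lake extensions enabled. The setup on Kubernetes is along these lines (a sketch only; the app name is illustrative and other options, e.g. the Azure storage credentials, are omitted):

from pyspark.sql import SparkSession

spark = (SparkSession.builder
    .appName("kafka-to-delta")  # illustrative name
    # required for Delta Lake when not running on Databricks
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate())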
EDIT2:
Running just this part in Databricks works fine:
df = (
    spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_SERVERS)
    .option("subscribe", INPUT_TOPIC)
    .option("startingOffsets", STARTING_OFFSETS)
    .option("maxOffsetsPerTrigger", MAX_OFFSETS_PER_TRIGGER)
    .option("failOnDataLoss", FAIL_ON_DATA_LOSS)
    .option("minPartitions", MIN_PARTITIONS)
    .load()
    .withColumn("tmp", from_json(col("value").cast("string"), raw_message_data))
    .select("tmp.*", *keep_original_cols)
    .withWatermark(input_time_col, watermark_delay)
    .groupBy(
        window(col(input_time_col), window_duration).alias("period"),
    )
    .agg(
        count("*").alias("query_count")
    )
    .withColumn("period_start", expr("period.start"))
    .withColumn("date", expr("date(period_start)"))
    .withColumn("hour", expr("hour(period_start)"))
    .withColumn("minute", expr("minute(period_start)"))
)

display(df)
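Outside Databricks there is no display(), so the closest equivalent sanity check I can think of is writing the same df to the console sink for a short run (a sketch only):

(df.writeStream
    .format("console")
    .outputMode("append")  # same output mode as the Delta sink
    .option("truncate", "false")
    .start()
    .awaitTermination())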