I have two streaming queries in one app:

1. Landing zone to cleansing zone: move new data from the landing zone (raw format) into the cleansing zone and save it as Delta.
2. Read log data from Kafka, join it with the Delta table in the cleansing zone, and write the records that did not get a match back into the landing zone.

Below is the code (not the whole thing, just the core parts):
landing_label_path = "s3a://AWS_landing_PATH/folder1"
cleansing_label_path = "s3a://AWS_cleaning_PATH/folder2"
deltaTable = None
def upsert_to_cleaned_delta(pdf, batchId):
    global deltaTable
    if deltaTable is None:
        try:
            # On the first batch the delta table does not exist yet --> forPath raises an exception
            deltaTable = DeltaTable.forPath(spark, cleansing_label_path)
        except Exception as e:
            # Create the table from the first micro-batch; nothing to merge yet
            pdf.write.format("delta").partitionBy("year", "month", "day").save(cleansing_label_path)
            deltaTable = DeltaTable.forPath(spark, cleansing_label_path)
            return
    deltaTable.alias("old_data").merge(
        pdf.alias("new_data"), "old_data.token_address = new_data.token_address"
    ).whenMatchedUpdateAll(
    ).whenNotMatchedInsertAll().execute()
def main():
    #
    # App1. landing zone (raw parquet files) --> cleansing zone (delta format)
    #
    spark.sparkContext.setLocalProperty("spark.scheduler.pool", "pool1")
    _ = (
        spark.readStream.schema(raw_data_schema)
        .parquet(landing_label_path)
        .writeStream.format("delta")
        .option("checkpointLocation", landing_to_cleansing_chk_path)
        .foreachBatch(lambda pdf, batch_id: upsert_to_cleaned_delta(pdf, batch_id))
        .trigger(processingTime="1 second")
        .outputMode("append")
        .start()
    )

    #
    # App2. labeling on log streaming
    #
    # Wait until the delta path appears
    time.sleep(100)
    spark.sparkContext.setLocalProperty("spark.scheduler.pool", "pool2")

    # Static read of the cleansing-zone delta table, used as the join side
    complete_label_df = spark.read.format("delta").load(cleansing_label_path)
    complete_label_df = complete_label_df.select(for_join_schema.fieldNames())

    # From kafka
    group_id = f"spark_{app_name}_{env}"
    kafka_options = {
        "kafka.sasl.jaas.config": f'org.apache.kafka.common.security.plain.PlainLoginModule required username="{bootstrap_username}" password="{bootstrap_password}";',
        "kafka.sasl.mechanism": "PLAIN",
        "kafka.security.protocol": "SASL_SSL",
        "kafka.bootstrap.servers": bootstrap_server,
        "group.id": group_id,
        "subscribe": topic_name,
        "maxOffsetsPerTrigger": 10,
    }
    log_streaming_df = spark.readStream.format("kafka").options(**kafka_options).load()

    # Decode the Avro payload with the latest schema from the schema registry
    sr_client = SchemaRegistryClient(
        {
            "url": sr_server,
            "basic.auth.credentials.source": "USER_INFO",
            "basic.auth.user.info": f"{sr_username}:{sr_password}",
        }
    )
    schema_obj = sr_client.get_schema(f"{topic_name}-value", version="latest")
    decoded_log_streaming_df = (
        log_streaming_df.selectExpr("timestamp", "substring(value, 6) as avro_value")
        .select("timestamp", from_avro(col("avro_value"), json.dumps(schema_obj.schema.raw_schema)).alias("data"))
        .select(
            col("data.address").alias("token_address"),
            col("timestamp").alias("kafka_dt"),
        )
    )

    # Keep only the tokens that do not have a label yet
    no_label_token_df = (
        decoded_log_streaming_df.join(complete_label_df, on="token_address", how="left")
        .filter(col("token_type").isNull())
    )
    new_label_token_df = no_label_token_df.mapInPandas(fill_the_label, label_schema)

    _ = (
        new_label_token_df.withColumn("update_dt", current_timestamp())
        .withColumn("year", date_format(col("update_dt"), "yyyy"))
        .withColumn("month", date_format(col("update_dt"), "MM"))
        .withColumn("day", date_format(col("update_dt"), "dd"))
        .writeStream.format("parquet")
        .option("path", landing_label_path)
        .option("checkpointLocation", newly_labeling_chk_path)
        .partitionBy("year", "month", "day")
        .trigger(processingTime="1 second")
        .outputMode("append")
        .start()
    )

    spark.streams.awaitAnyTermination()


if __name__ == "__main__":
    main()
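A side note on the two setLocalProperty("spark.scheduler.pool", ...) calls: the pool names only matter when the fair scheduler is enabled. Neither spark.scheduler.mode nor an allocation file appears in my conf list below, so treat the following as a sketch of what that setup would look like, not as code taken from the app (the FAIR setting and the allocation-file path are assumptions):

from pyspark.sql import SparkSession

# Sketch only: the fair-scheduler setup that pool1/pool2 rely on.
# spark.scheduler.mode and the allocation file path are assumptions, not in the conf below.
spark = (
    SparkSession.builder.appName("labeling-streams")  # hypothetical app name
    .config("spark.scheduler.mode", "FAIR")
    .config("spark.scheduler.allocation.file", "/local1/fairscheduler.xml")  # hypothetical path
    .getOrCreate()
)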
- With this sparkConf:
spark.eventLog.enabled: "true"
spark.hadoop.fs.s3a.connection.maximum: "200"
spark.hadoop.fs.s3a.connection.timeout: "1200000"
spark.hadoop.fs.s3a.fast.upload: "true"
spark.hadoop.fs.s3a.impl: org.apache.hadoop.fs.s3a.S3AFileSystem
spark.hadoop.fs.s3a.input.fadvise: random
spark.hadoop.fs.s3a.path.style.access: "true"
spark.hadoop.fs.s3a.readahead.range: 256K
spark.local.dir: /local1
spark.network.timeout: "2400" # second
spark.speculation: "false"
spark.sql.catalog.spark_catalog: org.apache.spark.sql.delta.catalog.DeltaCatalog
spark.sql.extensions: io.delta.sql.DeltaSparkSessionExtension
spark.sql.parquet.compression.codec: gzip
spark.executor.heartbeatInterval: 60s
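For reference, a quick way to confirm these keys actually reach the running session (a generic sketch, not tied to this app):

# Sketch: print the live SparkConf entries that match the keys listed above.
for key, value in sorted(spark.sparkContext.getConf().getAll()):
    if key.startswith(("spark.hadoop.fs.s3a", "spark.sql", "spark.network")):
        print(key, "=", value)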
- Driver and Executor info:
driver:
  annotations: {}  # TODO: add .Values.annotations
  labels:
    version: 3.2.1
  coreLimit: 1500m
  cores:
  memory: 4g
  memoryOverhead: 2g
  env:
    - name: PYTHONPATH
      value: /local1/python/lib
    - name: ENVIRONMENT
      value: dev
  serviceAccount: spark-operator
  volumeMounts:
    - mountPath: /local1
      name: spark-local-dir-1
      readOnly: false
  initContainers:
    - command:
        - sh
        - -c
        - chown -R 185 /local1
      image: public.ecr.aws/y4g4v0z7/busybox
      name: volume-permissions
      volumeMounts:
        - mountPath: /local1
          name: spark-local-dir-1
    - image: cimg/aws:2022.06
      name: download-python-dependencies
      command:
        - sh
        - -c
        - |
          mkdir -p /local1/python/lib
          aws s3 cp /libs.zip /tmp/libs.zip
          aws s3 cp /scripts.zip /tmp/scripts.zip
          unzip -o /tmp/libs.zip -d /local1/python/lib
          unzip -o /tmp/scripts.zip -d /local1/python/lib
      volumeMounts:
        - mountPath: /local1
          name: spark-local-dir-1
      resources:
        limits:
          cpu: 100m
          memory: 512Mi
        requests:
          cpu: 100m
          memory: 512Mi
      securityContext:
        runAsUser: 185
executor:
  annotations: {}  # TODO: add .Values.annotations
  coreLimit: 1500m
  cores:
  instances: 1
  labels:
    version: 3.2.1
  memory: 4g
  memoryOverhead: 2g
  podSecurityContext:
    fsGroup: 185
  env:
    - name: PYTHONPATH
      value: /local1/python/lib
    - name: ENVIRONMENT
      value: dev
  volumeMounts:
    - mountPath: /local1
      name: spark-local-dir-1
      readOnly: false
  serviceAccount: spark-operator
  initContainers:
    - command:
        - sh
        - -c
        - chown -R 185 /local1
      image: public.ecr.aws/y4g4v0z7/busybox
      name: volume-permissions
      volumeMounts:
        - mountPath: /local1
          name: spark-local-dir-1
    - image: cimg/aws:2022.06
      name: download-python-dependencies
      command:
        - sh
        - -c
        - |
          mkdir -p /local1/python/lib
          aws s3 cp /libs.zip /tmp/libs.zip
          aws s3 cp /scripts.zip /tmp/scripts.zip
          unzip -o /tmp/libs.zip -d /local1/python/lib
          unzip -o /tmp/scripts.zip -d /local1/python/lib
      volumeMounts:
        - mountPath: /local1
          name: spark-local-dir-1
      resources:
        limits:
          cpu: 100m
          memory: 512Mi
        requests:
          cpu: 100m
          memory: 512Mi
      securityContext:
        runAsUser: 185
image: datamechanics/spark:3.2.1-hadoop-3.3.1-java-11-scala-2.12-python-3.8-latest
Every time I run the code above, it raises OOM errors after about 10~12 hours.
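In case it helps with diagnosis, this is a hypothetical way to watch the driver-side JVM heap while the app runs (not part of the app; Runtime is reached through py4j, and it only covers the JVM heap, not the Python driver process):

import threading
import time

def log_driver_heap(spark, interval_sec=60):
    # Hypothetical helper: periodically print the driver JVM heap usage (used = total - free).
    runtime = spark.sparkContext._jvm.java.lang.Runtime.getRuntime()

    def loop():
        while True:
            used_mb = (runtime.totalMemory() - runtime.freeMemory()) / (1024 * 1024)
            print(f"driver JVM heap used: {used_mb:.0f} MiB")
            time.sleep(interval_sec)

    threading.Thread(target=loop, daemon=True).start()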
Driver memory keeps increasing as time goes on: