Sometimes I get this error when a job in Databricks is writing in Azure data lake:
HttpRequest: 409,err=PathAlreadyExists,appendpos=,cid=f448-0832-41ac-a2ab-8821453ef3c8,rid=7d4-101f-005a-578c-f82000000,connMs=0,sendMs=0,recvMs=38,sent=0,recv=168,method=PUT,url=https://awutmp.dfs.core.windows.net/bronze/app/_delta_log/_last_checkpoint?resource=file&timeout=90
My code read from a blob storage using autoloader and write in Azure Data Lake:
Schemas:
val binarySchema = StructType(List(
StructField("path", StringType, true),
StructField("modificationTime", TimestampType, true),
StructField("length", LongType, true),
StructField("content", BinaryType, true)
))
val jsonSchema = StructType(List(
StructField("EquipmentId", StringType, true),
StructField("EquipmentName", StringType, true),
StructField("EquipmentType", StringType, true),
StructField("Name", StringType, true),
StructField("Value", StringType, true),
StructField("ValueType", StringType, true),
StructField("LastSourceTimeStamp", StringType, true),
StructField("LastReprocessDate", StringType, true),
StructField("LastStateDuration", StringType, true),
StructField("MessageId", StringType, true)
))
Create delta table if not exists:
val sinkPath = "abfss://bronze@awutmp.dfs.core.windows.net/app"
val tableSQL =
s"""
CREATE TABLE IF NOT EXISTS bronze.awutmpapp(
path STRING,
file_modification_time TIMESTAMP,
file_length LONG,
value STRING,
json struct<EquipmentId STRING, EquipmentName STRING, EquipmentType STRING, Name STRING, Value STRING,ValueType STRING, LastSourceTimeStamp STRING, LastReprocessDate STRING, LastStateDuration STRING, MessageId STRING>,
job_name STRING,
job_version STRING,
schema STRING,
schema_version STRING,
timestamp_etl_process TIMESTAMP,
year INT GENERATED ALWAYS AS (YEAR(file_modification_time)) COMMENT 'generated from file_modification_time',
month INT GENERATED ALWAYS AS (MONTH(file_modification_time)) COMMENT 'generated from file_modification_time',
day INT GENERATED ALWAYS AS (DAY(file_modification_time)) COMMENT 'generated from file_modification_time'
)
USING DELTA
PARTITIONED BY (year, month, day)
LOCATION '${sinkPath}'
"""
spark.sql(tableSQL)
Options:
val options = Map[String, String](
"cloudFiles.format" -> "BinaryFile",
"cloudFiles.useNotifications" -> "true",
"cloudFiles.queueName" -> queue,
"cloudFiles.connectionString" -> queueConnString,
"cloudFiles.validateOptions" -> "true",
"cloudFiles.allowOverwrites" -> "true",
"cloudFiles.includeExistingFiles" -> "true",
"recursiveFileLookup" -> "true",
"modifiedAfter" -> "2022-01-01T00:00:00.000+0000",
"pathGlobFilter" -> "*.json.gz",
"ignoreCorruptFiles" -> "true",
"ignoreMissingFiles" -> "true"
)
Method process each microbatch:
def decompress(compressed: Array[Byte]): Option[String] =
Try {
val inputStream = new GZIPInputStream(new ByteArrayInputStream(compressed))
scala.io.Source.fromInputStream(inputStream).mkString
}.toOption
def binaryToStringUDF: UserDefinedFunction = {
udf { (data: Array[Byte]) => decompress(data).orNull }
}
def processMicroBatch: (DataFrame, Long) => Unit = (df: DataFrame, id: Long) => {
val resultDF = df
.withColumn("content_string", binaryToStringUDF(col("content")))
.withColumn("array_value", split(col("content_string"), "\n"))
.withColumn("array_noempty_values", expr("filter(array_value, value -> value <> '')"))
.withColumn("value", explode(col("array_noempty_values")))
.withColumn("json", from_json(col("value"), jsonSchema))
.withColumnRenamed("length", "file_length")
.withColumnRenamed("modificationTime", "file_modification_time")
.withColumn("job_name", lit("jobName"))
.withColumn("job_version", lit("1.0"))
.withColumn("schema", lit(schema.toString))
.withColumn("schema_version", lit("1.0"))
.withColumn("timestamp_etl_process", current_timestamp())
.withColumn("timestamp_tz", expr("current_timezone()"))
.withColumn("timestamp_etl_process",
to_utc_timestamp(col("timestamp_etl_process"), col("timestamp_tz")))
.drop("timestamp_tz", "array_value", "array_noempty_values", "content", "content_string")
resultDF
.write
.format("delta")
.mode("append")
.option("path", sinkPath)
.save()
}
val storagePath = "wasbs://signal@externalaccount.blob.core.windows.net/"
val checkpointPath = "/checkpoint/signal/autoloader"
spark
.readStream
.format("cloudFiles")
.options(options)
.schema(binarySchema)
.load(storagePath)
.writeStream
.format("delta")
.outputMode("append")
.foreachBatch(processMicroBatch)
.option("checkpointLocation", checkpointPath)
.trigger(Trigger.AvailableNow)
.start()
.awaitTermination()
It is aditional information I have seen in Azure log analytics:
How can I solve this error?