
Sometimes I get this error when a Databricks job is writing to Azure Data Lake:

HttpRequest: 409,err=PathAlreadyExists,appendpos=,cid=f448-0832-41ac-a2ab-8821453ef3c8,rid=7d4-101f-005a-578c-f82000000,connMs=0,sendMs=0,recvMs=38,sent=0,recv=168,method=PUT,url=https://awutmp.dfs.core.windows.net/bronze/app/_delta_log/_last_checkpoint?resource=file&timeout=90

My code reads from blob storage using Auto Loader and writes to Azure Data Lake:

Schemas:

  import org.apache.spark.sql.types._

  // Schema of the records produced by the binary file source (Auto Loader)
  val binarySchema = StructType(List(
    StructField("path", StringType, true),
    StructField("modificationTime", TimestampType, true),
    StructField("length", LongType, true),
    StructField("content", BinaryType, true)
  ))

  // Schema of each JSON line contained in the gzipped files
  val jsonSchema = StructType(List(
    StructField("EquipmentId", StringType, true),
    StructField("EquipmentName", StringType, true),
    StructField("EquipmentType", StringType, true),
    StructField("Name", StringType, true),
    StructField("Value", StringType, true),
    StructField("ValueType", StringType, true),
    StructField("LastSourceTimeStamp", StringType, true),
    StructField("LastReprocessDate", StringType, true),
    StructField("LastStateDuration", StringType, true),
    StructField("MessageId", StringType, true)
  ))
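
For illustration, a single line can be parsed against jsonSchema like this (a minimal sketch; the sample line below is made up, not real data):

  import spark.implicits._
  import org.apache.spark.sql.functions.{col, from_json}

  // hypothetical sample line, only to show the mapping into jsonSchema
  val sampleLine = """{"EquipmentId":"1","EquipmentName":"Pump","Name":"temp","Value":"42"}"""
  Seq(sampleLine).toDF("value")
    .select(from_json(col("value"), jsonSchema).as("json"))
    .show(truncate = false)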

Create the Delta table if it does not exist:

  val sinkPath = "abfss://bronze@awutmp.dfs.core.windows.net/app"

  val tableSQL =
    s"""
  CREATE TABLE IF NOT EXISTS bronze.awutmpapp(
    path STRING,
    file_modification_time TIMESTAMP,
    file_length LONG,
    value STRING,
    json struct<EquipmentId STRING, EquipmentName STRING, EquipmentType STRING, Name STRING, Value STRING, ValueType STRING, LastSourceTimeStamp STRING, LastReprocessDate STRING, LastStateDuration STRING, MessageId STRING>,
    job_name STRING,
    job_version STRING,
    schema STRING,
    schema_version STRING,
    timestamp_etl_process TIMESTAMP,
    year INT GENERATED ALWAYS AS (YEAR(file_modification_time)) COMMENT 'generated from file_modification_time',
    month INT GENERATED ALWAYS AS (MONTH(file_modification_time)) COMMENT 'generated from file_modification_time',
    day INT GENERATED ALWAYS AS (DAY(file_modification_time)) COMMENT 'generated from file_modification_time'
  )
  USING DELTA
  PARTITIONED BY (year, month, day)
  LOCATION '${sinkPath}'
  """
  
  spark.sql(tableSQL)
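
As a sanity check (outside the job), the registered table and its location can be inspected with standard Delta SQL, for example:

  spark.sql("DESCRIBE DETAIL bronze.awutmpapp")
    .select("location", "format", "partitionColumns")
    .show(truncate = false)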


Options:


// Auto Loader options: file notification mode, picking up existing and new *.json.gz files as binary
val options = Map[String, String](
  "cloudFiles.format" -> "BinaryFile",
  "cloudFiles.useNotifications" -> "true",
  "cloudFiles.queueName" -> queue,
  "cloudFiles.connectionString" -> queueConnString,
  "cloudFiles.validateOptions" -> "true",
  "cloudFiles.allowOverwrites" -> "true",
  "cloudFiles.includeExistingFiles" -> "true",
  "recursiveFileLookup" -> "true",
  "modifiedAfter" -> "2022-01-01T00:00:00.000+0000",
  "pathGlobFilter" -> "*.json.gz",
  "ignoreCorruptFiles" -> "true",
  "ignoreMissingFiles" -> "true"
)

Method that processes each micro-batch:

import java.io.ByteArrayInputStream
import java.util.zip.GZIPInputStream
import scala.util.Try
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions._

// Decompress a gzipped payload to a string; None if decompression fails
def decompress(compressed: Array[Byte]): Option[String] =
  Try {
    val inputStream = new GZIPInputStream(new ByteArrayInputStream(compressed))
    scala.io.Source.fromInputStream(inputStream).mkString
  }.toOption

// UDF wrapper: returns null for rows whose content cannot be decompressed
def binaryToStringUDF: UserDefinedFunction = {
  udf { (data: Array[Byte]) => decompress(data).orNull }
}
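
The decompress helper can be exercised on its own with a gzip round-trip like this (a minimal sketch using only JDK classes; the payload is made up):

import java.io.ByteArrayOutputStream
import java.util.zip.GZIPOutputStream

// hypothetical payload: two JSON lines, gzip-compressed in memory
val sample = """{"EquipmentId":"1","Name":"temp","Value":"42"}""" + "\n" +
  """{"EquipmentId":"2","Name":"speed","Value":"7"}"""
val bos = new ByteArrayOutputStream()
val gz = new GZIPOutputStream(bos)
gz.write(sample.getBytes("UTF-8"))
gz.close()

println(decompress(bos.toByteArray)) // expected: Some(<the two lines above>)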

def processMicroBatch: (DataFrame, Long) => Unit = (df: DataFrame, id: Long) => {

  val resultDF = df
    // decompress the gzipped content and split it into one JSON document per line
    .withColumn("content_string", binaryToStringUDF(col("content")))
    .withColumn("array_value", split(col("content_string"), "\n"))
    .withColumn("array_noempty_values", expr("filter(array_value, value -> value <> '')"))
    .withColumn("value", explode(col("array_noempty_values")))
    // parse each line against the expected JSON schema
    .withColumn("json", from_json(col("value"), jsonSchema))
    .withColumnRenamed("length", "file_length")
    .withColumnRenamed("modificationTime", "file_modification_time")
    // add job / schema metadata columns
    .withColumn("job_name", lit("jobName"))
    .withColumn("job_version", lit("1.0"))
    .withColumn("schema", lit(schema.toString)) // `schema` is defined outside this snippet
    .withColumn("schema_version", lit("1.0"))
    // store the processing timestamp normalised to UTC
    .withColumn("timestamp_etl_process", current_timestamp())
    .withColumn("timestamp_tz", expr("current_timezone()"))
    .withColumn("timestamp_etl_process",
      to_utc_timestamp(col("timestamp_etl_process"), col("timestamp_tz")))
    .drop("timestamp_tz", "array_value", "array_noempty_values", "content", "content_string")

  // append the transformed micro-batch to the Delta location backing bronze.awutmpapp
  resultDF
    .write
    .format("delta")
    .mode("append")
    .option("path", sinkPath)
    .save()
}


import org.apache.spark.sql.streaming.Trigger

val storagePath = "wasbs://signal@externalaccount.blob.core.windows.net/"
val checkpointPath = "/checkpoint/signal/autoloader"

// Read new files with Auto Loader and hand each micro-batch to processMicroBatch
spark
  .readStream
  .format("cloudFiles")
  .options(options)
  .schema(binarySchema)
  .load(storagePath)
  .writeStream
  .format("delta")
  .outputMode("append")
  .foreachBatch(processMicroBatch)
  .option("checkpointLocation", checkpointPath)
  .trigger(Trigger.AvailableNow())
  .start()
  .awaitTermination()
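
After a run, what was appended can be inspected by reading the Delta location back directly (a quick check, not part of the pipeline):

spark.read.format("delta").load(sinkPath)
  .select("path", "file_modification_time", "year", "month", "day")
  .show(5, truncate = false)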

Here is additional information I have seen in Azure Log Analytics:

[screenshot of the related entries in Azure Log Analytics]

How can I solve this error?
