
In my RAW layer I have around 33,000 files holding the historical data that I need to process as part of the initial load, and roughly 10 new files arrive in the landing zone every day. For this I have designed my code with Auto Loader for both the initial and the incremental load. Below is the code snippet I am using.

if rawFileType == 'csv' and headerValue == False:
    df = spark.read.format(schemaFileType) \
        .option("inferSchema", infer_schema) \
        .option("header", first_row_is_header) \
        .option("sep", schemaFileDelimiter) \
        .load(schemaFilePath)
    mySchema = df.schema
else:
    print("the raw file has a header row")

if rawFileType == 'json' or rawFileType == 'csv':
    inferColumnTypeValue = 'false'
elif rawFileType == 'parquet':
    inferColumnTypeValue = 'true'
if gZipCompressed == True:
    if rawFileType == 'csv' and headerValue == False:
        raw_df = spark.readStream.format("cloudFiles") \
            .option("cloudFiles.format", rawFileType) \
            .schema(mySchema) \
            .option("cloudFiles.inferColumnTypes", inferColumnTypeValue) \
            .option("mergeSchema", "true") \
            .option("cloudFiles.schemaLocation", schemaPath) \
            .option("cloudFiles.allowOverwrites", "true") \
            .option("header", headerValue) \
            .option("compression", "gzip") \
            .option("cloudFiles.includeExistingFiles", True) \
            .option("checkpointLocation", checkpointPath) \
            .option("modifiedAfter", modifiedAfterValue) \
            .load(rawPath, sep=rawFileDelimiter)
    else:
        if rawFileType == 'csv' and headerValue == True:
            raw_df = spark.readStream.format("cloudFiles") \
                .option("cloudFiles.format", rawFileType) \
                .option("cloudFiles.inferColumnTypes", inferColumnTypeValue) \
                .option("mergeSchema", "true") \
                .option("cloudFiles.schemaLocation", schemaPath) \
                .option("cloudFiles.allowOverwrites", "true") \
                .option("header", headerValue) \
                .option("compression", "gzip") \
                .option("cloudFiles.includeExistingFiles", True) \
                .option("checkpointLocation", checkpointPath) \
                .option("modifiedAfter", modifiedAfterValue) \
                .load(rawPath, sep=rawFileDelimiter)
        if rawFileType == 'json':
            raw_df = spark.readStream.format("cloudFiles") \
                .option("cloudFiles.format", rawFileType) \
                .option("cloudFiles.inferColumnTypes", inferColumnTypeValue) \
                .option("mergeSchema", "true") \
                .option("cloudFiles.schemaLocation", schemaPath) \
                .option("cloudFiles.allowOverwrites", "true") \
                .option("header", headerValue) \
                .option("compression", "gzip") \
                .option("cloudFiles.includeExistingFiles", True) \
                .option("checkpointLocation", checkpointPath) \
                .option("modifiedAfter", modifiedAfterValue) \
                .load(rawPath)
        if rawFileType == 'parquet':
            raw_df = spark.readStream.format("cloudFiles") \
                .option("cloudFiles.format", rawFileType) \
                .option("cloudFiles.inferColumnTypes", inferColumnTypeValue) \
                .option("mergeSchema", "true") \
                .option("cloudFiles.schemaLocation", schemaPath) \
                .option("cloudFiles.allowOverwrites", "true") \
                .option("header", headerValue) \
                .option("compression", "gzip") \
                .option("cloudFiles.includeExistingFiles", True) \
                .option("checkpointLocation", checkpointPath) \
                .option("modifiedAfter", modifiedAfterValue) \
                .load(rawPath)
else:
    if rawFileType == 'csv' and headerValue == False:
        raw_df = spark.readStream.format("cloudFiles") \
            .option("cloudFiles.format", rawFileType) \
            .schema(mySchema) \
            .option("cloudFiles.inferColumnTypes", inferColumnTypeValue) \
            .option("mergeSchema", "true") \
            .option("cloudFiles.schemaLocation", schemaPath) \
            .option("cloudFiles.allowOverwrites", "true") \
            .option("header", headerValue) \
            .option("cloudFiles.includeExistingFiles", True) \
            .option("checkpointLocation", checkpointPath) \
            .option("modifiedAfter", modifiedAfterValue) \
            .load(rawPath, sep=rawFileDelimiter)
    else:
        if rawFileType == 'csv' and headerValue == True:
            raw_df = spark.readStream.format("cloudFiles") \
                .option("cloudFiles.format", rawFileType) \
                .option("cloudFiles.inferColumnTypes", inferColumnTypeValue) \
                .option("mergeSchema", "true") \
                .option("cloudFiles.schemaLocation", schemaPath) \
                .option("cloudFiles.allowOverwrites", "true") \
                .option("header", headerValue) \
                .option("cloudFiles.includeExistingFiles", True) \
                .option("checkpointLocation", checkpointPath) \
                .option("modifiedAfter", modifiedAfterValue) \
                .load(rawPath, sep=rawFileDelimiter)
        if rawFileType == 'json':
            raw_df = spark.readStream.format("cloudFiles") \
                .option("cloudFiles.format", rawFileType) \
                .option("cloudFiles.inferColumnTypes", inferColumnTypeValue) \
                .option("mergeSchema", "true") \
                .option("cloudFiles.schemaLocation", schemaPath) \
                .option("cloudFiles.allowOverwrites", "true") \
                .option("header", headerValue) \
                .option("cloudFiles.includeExistingFiles", True) \
                .option("checkpointLocation", checkpointPath) \
                .option("modifiedAfter", modifiedAfterValue) \
                .load(rawPath)
        if rawFileType == 'parquet':
            raw_df = spark.readStream.format("cloudFiles") \
                .option("cloudFiles.format", rawFileType) \
                .option("cloudFiles.inferColumnTypes", inferColumnTypeValue) \
                .option("mergeSchema", "true") \
                .option("cloudFiles.schemaLocation", schemaPath) \
                .option("cloudFiles.allowOverwrites", "true") \
                .option("header", headerValue) \
                .option("cloudFiles.includeExistingFiles", True) \
                .option("checkpointLocation", checkpointPath) \
                .option("modifiedAfter", modifiedAfterValue) \
                .load(rawPath)

def overwrite_microbatch(raw_df, batchId):
    raw_df.write.format("delta") \
        .mode(bronzeWriteMode) \
        .option("mergeSchema", "true") \
        .option("delta.columnMapping.mode", "name") \
        .option("path", path) \
        .saveAsTable(f"{catalog_name}.{bronzeDatabaseName}.{tableName}")

raw_df.writeStream \
    .foreachBatch(overwrite_microbatch) \
    .option("checkpointLocation", checkpointPath) \
    .option("cloudFiles.schemaEvolutionMode", "addNewColumns") \
    .trigger(availableNow=True) \
    .start()

The problem I am facing with the above code is that the initial load only processes around 5,000 of the files instead of all of them; the rest are getting missed.

Afterwards I tried setting the options "cloudFiles.maxFilesPerTrigger" to 50000 and "cloudFiles.maxBytesPerTrigger" to 50 GB together, yet the same issue remains.
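For reference, this is roughly where I added those two options, directly on the Auto Loader reader alongside the other options shown above (abbreviated sketch; the "50g" byte-string value is my reading of the Auto Loader docs):

# Abbreviated sketch - the other cloudFiles options from the snippet above stay unchanged
raw_df = spark.readStream.format("cloudFiles") \
    .option("cloudFiles.format", rawFileType) \
    .option("cloudFiles.schemaLocation", schemaPath) \
    .option("cloudFiles.maxFilesPerTrigger", 50000) \
    .option("cloudFiles.maxBytesPerTrigger", "50g") \
    .load(rawPath)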

Can someone help me resolve this and point out what exactly I am missing here?
