In my RAW layer I have around 33,000 files holding the historical data that I need to process as part of the initial load. In addition, about 10 new files arrive in the landing zone every day. For this I have designed my code with Auto Loader to handle both the initial and the incremental load. Below is the code snippet I am using.
if rawFileType == 'csv' and headerValue == False:
    df = spark.read.format(schemaFileType) \
        .option("inferSchema", infer_schema) \
        .option("header", first_row_is_header) \
        .option("sep", schemaFileDelimiter) \
        .load(schemaFilePath)
    mySchema = df.schema
else:
    print("the raw file is having header value")

if rawFileType == 'json' or rawFileType == 'csv':
    inferColumnTypeValue = 'false'
elif rawFileType == 'parquet':
    inferColumnTypeValue = 'true'
if gZipCompressed == True:
    if rawFileType == 'csv' and headerValue == False:
        raw_df = spark.readStream.format("cloudFiles") \
            .option("cloudFiles.format", rawFileType) \
            .schema(mySchema) \
            .option("cloudFiles.inferColumnTypes", inferColumnTypeValue) \
            .option("mergeSchema", "true") \
            .option("cloudFiles.schemaLocation", schemaPath) \
            .option("cloudFiles.allowOverwrites", "true") \
            .option("header", headerValue) \
            .option("compression", "gzip") \
            .option("cloudFiles.includeExistingFiles", True) \
            .option("checkpointLocation", checkpointPath) \
            .option("modifiedAfter", modifiedAfterValue) \
            .load(rawPath, sep=rawFileDelimiter)
    else:
        if rawFileType == 'csv' and headerValue == True:
            raw_df = spark.readStream.format("cloudFiles") \
                .option("cloudFiles.format", rawFileType) \
                .option("cloudFiles.inferColumnTypes", inferColumnTypeValue) \
                .option("mergeSchema", "true") \
                .option("cloudFiles.schemaLocation", schemaPath) \
                .option("cloudFiles.allowOverwrites", "true") \
                .option("header", headerValue) \
                .option("compression", "gzip") \
                .option("cloudFiles.includeExistingFiles", True) \
                .option("checkpointLocation", checkpointPath) \
                .option("modifiedAfter", modifiedAfterValue) \
                .load(rawPath, sep=rawFileDelimiter)
        if rawFileType == 'json':
            raw_df = spark.readStream.format("cloudFiles") \
                .option("cloudFiles.format", rawFileType) \
                .option("cloudFiles.inferColumnTypes", inferColumnTypeValue) \
                .option("mergeSchema", "true") \
                .option("cloudFiles.schemaLocation", schemaPath) \
                .option("cloudFiles.allowOverwrites", "true") \
                .option("header", headerValue) \
                .option("compression", "gzip") \
                .option("cloudFiles.includeExistingFiles", True) \
                .option("checkpointLocation", checkpointPath) \
                .option("modifiedAfter", modifiedAfterValue) \
                .load(rawPath)
        if rawFileType == 'parquet':
            raw_df = spark.readStream.format("cloudFiles") \
                .option("cloudFiles.format", rawFileType) \
                .option("cloudFiles.inferColumnTypes", inferColumnTypeValue) \
                .option("mergeSchema", "true") \
                .option("cloudFiles.schemaLocation", schemaPath) \
                .option("cloudFiles.allowOverwrites", "true") \
                .option("header", headerValue) \
                .option("compression", "gzip") \
                .option("cloudFiles.includeExistingFiles", True) \
                .option("checkpointLocation", checkpointPath) \
                .option("modifiedAfter", modifiedAfterValue) \
                .load(rawPath)
else:
    if rawFileType == 'csv' and headerValue == False:
        raw_df = spark.readStream.format("cloudFiles") \
            .option("cloudFiles.format", rawFileType) \
            .schema(mySchema) \
            .option("cloudFiles.inferColumnTypes", inferColumnTypeValue) \
            .option("mergeSchema", "true") \
            .option("cloudFiles.schemaLocation", schemaPath) \
            .option("cloudFiles.allowOverwrites", "true") \
            .option("header", headerValue) \
            .option("cloudFiles.includeExistingFiles", True) \
            .option("checkpointLocation", checkpointPath) \
            .option("modifiedAfter", modifiedAfterValue) \
            .load(rawPath, sep=rawFileDelimiter)
    else:
        if rawFileType == 'csv' and headerValue == True:
            raw_df = spark.readStream.format("cloudFiles") \
                .option("cloudFiles.format", rawFileType) \
                .option("cloudFiles.inferColumnTypes", inferColumnTypeValue) \
                .option("mergeSchema", "true") \
                .option("cloudFiles.schemaLocation", schemaPath) \
                .option("cloudFiles.allowOverwrites", "true") \
                .option("header", headerValue) \
                .option("cloudFiles.includeExistingFiles", True) \
                .option("checkpointLocation", checkpointPath) \
                .option("modifiedAfter", modifiedAfterValue) \
                .load(rawPath, sep=rawFileDelimiter)
        if rawFileType == 'json':
            raw_df = spark.readStream.format("cloudFiles") \
                .option("cloudFiles.format", rawFileType) \
                .option("cloudFiles.inferColumnTypes", inferColumnTypeValue) \
                .option("mergeSchema", "true") \
                .option("cloudFiles.schemaLocation", schemaPath) \
                .option("cloudFiles.allowOverwrites", "true") \
                .option("header", headerValue) \
                .option("cloudFiles.includeExistingFiles", True) \
                .option("checkpointLocation", checkpointPath) \
                .option("modifiedAfter", modifiedAfterValue) \
                .load(rawPath)
        if rawFileType == 'parquet':
            raw_df = spark.readStream.format("cloudFiles") \
                .option("cloudFiles.format", rawFileType) \
                .option("cloudFiles.inferColumnTypes", inferColumnTypeValue) \
                .option("mergeSchema", "true") \
                .option("cloudFiles.schemaLocation", schemaPath) \
                .option("cloudFiles.allowOverwrites", "true") \
                .option("header", headerValue) \
                .option("cloudFiles.includeExistingFiles", True) \
                .option("checkpointLocation", checkpointPath) \
                .option("modifiedAfter", modifiedAfterValue) \
                .load(rawPath)
def overwrite_microbatch(raw_df, batchId):
    raw_df.write.format("delta") \
        .mode(bronzeWriteMode) \
        .option("mergeSchema", "true") \
        .option("delta.columnMapping.mode", "name") \
        .option("path", path) \
        .saveAsTable(f"{catalog_name}.{bronzeDatabaseName}.{tableName}")

raw_df.writeStream.foreachBatch(overwrite_microbatch) \
    .option("checkpointLocation", checkpointPath) \
    .option("cloudFiles.schemaEvolutionMode", "addNewColumns") \
    .trigger(availableNow=True) \
    .start()
The problem I am facing with the above code is that, instead of all the files, the initial load only picks up around 5,000 files and the rest are missed.
Afterwards I tried setting the options "cloudFiles.maxFilesPerTrigger" to 50000 and "cloudFiles.maxBytesPerTrigger" to 50 GB together, yet the same issue persists.
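For clarity, this is roughly how I attached those two rate-limit options to the Auto Loader reader (just a sketch with the values I tried; the other options from the full snippet above are omitted here, and the variables are the same ones used in that snippet):

# Sketch only: the two rate-limit options I tried, set on the cloudFiles reader.
# rawFileType, schemaPath and rawPath are the same variables as in the snippet above;
# "50g" is how I expressed the 50 GB byte limit.
raw_df = spark.readStream.format("cloudFiles") \
    .option("cloudFiles.format", rawFileType) \
    .option("cloudFiles.schemaLocation", schemaPath) \
    .option("cloudFiles.includeExistingFiles", True) \
    .option("cloudFiles.maxFilesPerTrigger", 50000) \
    .option("cloudFiles.maxBytesPerTrigger", "50g") \
    .load(rawPath)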
Can someone help me resolve this and point out what exactly I am missing here?