I'm implementing a streaming pipeline that reads from one dataset and writes to another using Databricks Auto Loader.
How can I apply custom modification code to the data after reading but before writing? For example, something like this:
def my_modification(df):
    # Build a new column projection from the incoming schema
    # (prepare_column_list is my own helper)
    schema_columns = df.schema
    new_column_list = prepare_column_list(schema_columns)
    df = df.select(new_column_list)
    return df
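
For context, prepare_column_list turns the schema into the list of columns to select. A simplified, illustrative stand-in just to show its shape (not my actual helper):

from pyspark.sql.functions import col

def prepare_column_list(schema):
    # Illustrative only: project every field, normalizing names to lower case
    return [col(field.name).alias(field.name.lower()) for field in schema.fields]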

The streaming pipeline currently looks like this:

from pyspark.sql.functions import col, current_timestamp

(spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", checkpoint_path)
    .load(file_path)
    .select("*",
            col("_metadata.file_path").alias("source_file"),
            current_timestamp().alias("processing_time"))
    .my_modification()  # <= not a real DataFrame method; this is where I want to
                        #    apply my_modification, before writing and before
                        #    comparing the source and target schemas
    .writeStream
    .option("checkpointLocation", checkpoint_path)
    .trigger(availableNow=True)
    .toTable(table_name))
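
The closest mechanism I've found is DataFrame.transform, which lets a plain function slot into the chained builder style. A minimal sketch of what I have in mind, assuming transform behaves the same on a streaming DataFrame as on a batch one:

from pyspark.sql.functions import col, current_timestamp

(spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", checkpoint_path)
    .load(file_path)
    .select("*",
            col("_metadata.file_path").alias("source_file"),
            current_timestamp().alias("processing_time"))
    # transform(func) calls func on the DataFrame and returns the result,
    # so the custom modification runs before writeStream
    .transform(my_modification)
    .writeStream
    .option("checkpointLocation", checkpoint_path)
    .trigger(availableNow=True)
    .toTable(table_name))

Is this the right approach here, or is something like foreachBatch needed for the schema comparison step?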