0

I'm implementing a streaming pipeline that reads from one dataset and writes to another using Databricks Auto Loader.

How can I apply some custom modification code to the read data before writing? E.g. something like this:

def my_modification(df):
    """Project *df* onto the column list produced by prepare_column_list.

    The incoming DataFrame's schema is passed to prepare_column_list
    (defined elsewhere), and the resulting list of columns/expressions
    is applied via select(). Returns the transformed DataFrame.
    """
    return df.select(prepare_column_list(df.schema))

# Stream JSON files with Auto Loader, enrich with file metadata, apply the
# custom modification, and write the result to a Delta table.
#
# Fix: a plain Python function cannot be invoked as a DataFrame method
# (`.my_modification()` would raise AttributeError). The idiomatic way to
# chain a user-defined df -> df function into the plan is
# DataFrame.transform(func), which calls func(df) and returns its result —
# so the modification runs before the stream is written and before the
# source/target schemas are compared.
(spark.readStream
  .format("cloudFiles")
  .option("cloudFiles.format", "json")
  .option("cloudFiles.schemaLocation", checkpoint_path)
  .load(file_path)
  .select("*", col("_metadata.file_path").alias("source_file"), current_timestamp().alias("processing_time"))
  .transform(my_modification)  # equivalent to my_modification(df), kept chainable
  .writeStream
  .option("checkpointLocation", checkpoint_path)
  .trigger(availableNow=True)
  .toTable(table_name))
archjkeee
  • 13
  • 4

0 Answers