I am trying to use DLT for incremental processing, where the inputs are parquet files arriving daily on S3. I was told that dlt.read_stream can help. I was able to read the files incrementally, but when I perform aggregations, it runs a wide aggregation over the whole table instead of aggregating only the incremental rows. I'd appreciate any suggestions.
Here is the example code:
import dlt
from pyspark.sql import functions as F

# schema and s3_prefix are defined elsewhere in the pipeline notebook

@dlt.table()
def tab1():
    # Incrementally ingest the daily parquet drops with Auto Loader
    return (
        spark.readStream.format("cloudFiles")
        .schema(schema)
        .option("cloudFiles.format", "parquet")
        .option("cloudFiles.includeExistingFiles", False)
        .option("cloudFiles.allowOverwrites", False)
        .option("cloudFiles.validateOptions", True)
        .load(f"{s3_prefix}/tab1/")
    )

@dlt.table(
    comment="Aggregate table1"
)
def tab1_agg():
    # Streaming aggregation over the ingested rows
    return (
        dlt.read_stream("tab1")
        .groupBy("col1")
        .agg(
            F.count(F.lit(1)).alias("cnt"),
            F.sum("col2").alias("sum_col2"),
        )
        .withColumn("kh_meta_canonical_timestamp", F.current_timestamp())
    )