I'm trying to use Hudi as a source to perform aggregation in Flink SQL, and I found that the Hudi source produces an append stream rather than a retract stream. This makes the aggregation results accumulate incorrectly. My code looks like this:
-- Hudi table that the aggregation queries in this report read from.
-- The WITH options are grouped by key prefix; their order has no effect.
CREATE TABLE hudi_source (
    many fields,
    PRIMARY KEY (primary_key) NOT ENFORCED
)
PARTITIONED BY (partition_key)
WITH (
    -- connector, storage location, table and index type
    'connector' = 'hudi',
    'path' = 'xxx',
    'table.type' = 'MERGE_ON_READ',
    'index.type' = 'BUCKET',
    'hoodie.bucket.index.num.buckets' = '2',

    -- record key, precombine, payload and partition-path options
    'hoodie.datasource.write.recordkey.field' = 'primary_key',
    'hoodie.datasource.write.keygenerator.type' = 'complex',
    'write.precombine' = 'true',
    'write.precombine.field' = 'precombine_field',
    'write.payload.class' = 'org.apache.hudi.common.model.DefaultHoodieRecordPayload',
    'hoodie.datasource.write.hive_style_partitioning' = 'false',

    -- *.tasks options
    'read.tasks' = '2',
    'write.tasks' = '2',
    'write.index_bootstrap.tasks' = '2',
    'compaction.tasks' = '2',

    -- read.* options
    'read.streaming.enabled' = 'true',
    'read.streaming.check-interval' = '3',
    'read.start-commit' = 'earliest',
    'read.rate.limit' = '0',

    -- compaction.* options
    'compaction.schedule.enabled' = 'true',
    'compaction.async.enabled' = 'false',
    'compaction.delta_commits' = '100',
    'compaction.delta_seconds' = '3600',

    -- archive / clean options
    'archive.max_commits' = '100',
    'archive.min_commits' = '51',
    'clean.retain_commits' = '50',
    'clean.async.enabled' = 'false',
    'hoodie.archive.automatic' = 'false',

    -- hive_sync.* options
    'hive_sync.enable' = 'true',
    'hive_sync.support_timestamp' = 'true',
    'hive_sync.jdbc_url' = 'jdbc:hive2://xxx',
    'hive_sync.db' = 'xxx',
    'hive_sync.table' = 'hudi_source',

    -- remaining options
    'filter.delete.record.enabled' = 'true',
    'hoodie.schema.evolution.enable' = 'true',
    'hoodie.support.write.lock' = 'false'
);
-- Sum statistic_fields per agg_dimension, reading hudi_source directly.
-- This is the variant reported to accumulate incorrectly.
INSERT INTO sink_table
SELECT
    SUM(statistic_fields)
FROM hudi_source
GROUP BY agg_dimension;
If I use a Top-1 query to force the Hudi source into a retract stream, the result is correct. The code looks like this:
-- Top-1 per primary_key: keeps only the row with the greatest precombine_field.
-- The view exposes every hudi_source column plus row_num.
CREATE TEMPORARY VIEW hudi_source_v AS
SELECT *
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY primary_key ORDER BY precombine_field DESC) AS row_num
    FROM hudi_source
)
WHERE row_num = 1;
-- Same aggregation as the direct query, but reading the Top-1 view
-- hudi_source_v; this variant is reported to give the correct result.
INSERT INTO sink_table
SELECT
    SUM(statistic_fields)
FROM hudi_source_v
GROUP BY agg_dimension;
However, this consumes additional resources, and the optimizer cannot push predicates down into the FilterableTableSource.
Flink version: 1.15.0
hudi-flink version: 0.11.0