
I'm trying to use Hudi as a source for an aggregation in Flink SQL, and I found that the Hudi source is an append stream rather than a retract stream. This makes the aggregation results accumulate incorrectly. My code looks like this:

create table hudi_source (
    many fields,
    primary key (primary_key) not enforced
) partitioned by (partition_key) with (
    'connector' = 'hudi',
    'hive_sync.enable' = 'true',
    'write.precombine' = 'true',
    'compaction.schedule.enabled' = 'true',
    'hoodie.datasource.write.keygenerator.type' = 'complex',
    'write.payload.class' = 'org.apache.hudi.common.model.DefaultHoodieRecordPayload',
    'read.tasks' = '2',
    'read.rate.limit' = '0',
    'write.tasks' = '2',
    'write.index_bootstrap.tasks' = '2',
    'compaction.tasks' = '2',
    'read.start-commit' = 'earliest',
    'read.streaming.enabled' = 'true',
    'read.streaming.check-interval' = '3',
    'archive.max_commits' = '100',
    'archive.min_commits' = '51',
    'clean.retain_commits' = '50',
    'compaction.delta_commits' = '100',
    'compaction.delta_seconds' = '3600',
    'filter.delete.record.enabled' = 'true',
    'hoodie.schema.evolution.enable' = 'true',
    'hive_sync.support_timestamp' = 'true',
    'hoodie.datasource.write.hive_style_partitioning' = 'false',
    'hive_sync.jdbc_url' = 'jdbc:hive2://xxx',
    'hive_sync.db' = 'xxx',
    'hive_sync.table' = 'hudi_source',
    'write.precombine.field' = 'precombine_field',
    'hoodie.datasource.write.recordkey.field' = 'primary_key',
    'path' = 'xxx',
    'hoodie.bucket.index.num.buckets' = '2',
    'index.type' = 'BUCKET',
    'table.type' = 'MERGE_ON_READ',
    'clean.async.enabled' = 'false',
    'hoodie.archive.automatic' = 'false',
    'compaction.async.enabled' = 'false',
    'hoodie.support.write.lock' = 'false'
);

insert into sink_table
select
 sum(statistic_fields)
from hudi_source
group by agg_dimension
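
For illustration on made-up data (not the actual table contents), suppose the same primary key is written by two successive Hudi commits. An append-only source emits both versions as plain inserts, so the aggregate double counts:

select agg_dimension, sum(statistic_fields) as total
from (values
        ('k1', 'd1', 10),  -- version written by commit 1
        ('k1', 'd1', 15)   -- updated version written by commit 2
     ) as t(primary_key, agg_dimension, statistic_fields)
group by agg_dimension;
-- yields d1 -> 25 instead of the expected 15, because the old version of 'k1' is never retracted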

If I use a Top-1 (deduplication) query to force the Hudi source into a retract stream, the result is correct. The code looks like this:

create temporary view hudi_source_v as
select *
from (select *,
             ROW_NUMBER() OVER (
                PARTITION BY primary_key
                ORDER BY
                    precombine_field desc
            ) AS row_num
      from hudi_source)
where row_num = 1;

insert into sink_table
select
 sum(statistic_fields)
from hudi_source_v
group by agg_dimension
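
On the same hypothetical two-version record, the Top-1 view keeps only the latest row per primary_key; in streaming mode the rank operator retracts the superseded row, so the sum converges to the latest value:

select agg_dimension, sum(statistic_fields) as total
from (
    select *,
           row_number() over (
               partition by primary_key
               order by precombine_field desc
           ) as row_num
    from (values
            ('k1', 'd1', 1, 10),  -- commit 1, precombine_field = 1
            ('k1', 'd1', 2, 15)   -- commit 2, precombine_field = 2
         ) as t(primary_key, agg_dimension, precombine_field, statistic_fields)
)
where row_num = 1
group by agg_dimension;
-- yields d1 -> 15: only the latest version of 'k1' reaches the aggregation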

But this adds extra resource consumption, and the optimizer cannot push predicates down into the FilterableTableSource.
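
For example, a filter that could otherwise be pushed into the Hudi scan stays above the deduplication step once the view sits in between (the partition value below is made up):

select sum(statistic_fields)
from hudi_source_v
where partition_key = '2023-07-01'  -- hypothetical filter value
group by agg_dimension;
-- the ROW_NUMBER() inside hudi_source_v sits between this predicate and the
-- Hudi source, so the filter is applied after reading the data rather than
-- being pushed into the FilterableTableSource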

Flink version: 1.15.0, hudi-flink version: 0.11.0

  • "Connot perform push-down predicates optimization" that's how the spark reader behave currently. Try cow if you want parquet pushdown – parisni Jul 18 '23 at 15:03
