0

I have data streaming from Kafka on an input topic. Using that data, I have to query another table (either with the Kafka connector or the filesystem connector), check if a value does not exist, and finally publish the result to an output Kafka topic. I am trying to write a NOT EXISTS query, but unfortunately I am not able to achieve an append-only result, and the output topic cannot handle updating results. The **EXISTS** query works fine. Here are the tables.


create table input
(
    `sessionid` string,
    `usern` string,
    `time_sql_timestamp` TIMESTAMP(3) METADATA FROM 'timestamp' VIRTUAL,
    `context_time` TIMESTAMP(3),
    `proc-time` AS PROCTIME(),
    WATERMARK FOR time_sql_timestamp AS time_sql_timestamp - INTERVAL '5' SECOND)
    WITH ( 'connector' = 'kafka','topic' = 'input','properties.bootstrap.servers' = 'kafka:9092',
        'json.ignore-parse-errors' = 'true',
        'json.fail-on-missing-field'= 'false',
        'properties.group.id' = 'input',
        'scan.startup.mode' = 'latest-offset',
        'format' = 'json');


create table
    context(
               id string,
               username string,
               `file.path` STRING NOT NULL METADATA,
               `file.modification-time` TIMESTAMP_LTZ(3) NOT NULL METADATA
                
 ) WITH ('connector' = 'filesystem','path' = '/tmp/context','format' = 'csv', 'source.monitor-interval' = '10');


create table
    output
(
    `sessionid` string,
    `usern` string,
    `time_sql_timestamp` TIMESTAMP(3),
    `context_time` TIMESTAMP(3),
    `proc-time` TIMESTAMP(3),
    `filepath` string)
     WITH ( 'connector' = 'kafka','topic' = 'output',
        'properties.bootstrap.servers' = 'kafka:9092',
        'json.ignore-parse-errors' = 'true',
        'json.fail-on-missing-field'= 'false',
        'properties.group.id' = 'input',
        'scan.startup.mode' = 'latest-offset',
        'format' = 'json');

I want to insert data in output table if a user present in input table does not exists in context table at a given time.

Sql that works -

Flink SQL> INSERT INTO output select input.*, '' as file_path from input, context where exists (select * from input where usern = context.username and context.`file.modification-time` = context_time); [INFO] Submitting SQL update statement to the cluster... [INFO] SQL update statement has been successfully submitted to the cluster: Job ID: 92e305d15100d6ea2588e67afe375a84

Sql that does not work-

Flink SQL> INSERT INTO output select input.*, '' as file_path from input, context where not exists (select * from input where usern = context.username and context.`file.modification-time`= context_time); [INFO] Submitting SQL update statement to the cluster... [ERROR] Could not execute SQL statement. Reason: org.apache.flink.table.api.TableException: Table sink 'default_catalog.default_database.output' doesn't support consuming update and delete changes which is produced by node Join(joinType=[InnerJoin], where=[true], select=[sessionid, usern, time_sql_timestamp, context_time, proc-time, username, file.modification-time], leftInputSpec=[NoUniqueKey], rightInputSpec=[NoUniqueKey])

I also tried windowed query like this - Exists works-

Flink SQL> INSERT INTO output WITH input_cte AS (SELECT time_sql_timestamp, window_start, window_end, window_time FROM (SELECT * from TABLE (TUMBLE(TABLE input, DESCRIPTOR(time_sql_timestamp), INTERVAL '10' SECOND)), context where (usern = context.username and context.`file.modification-time` = context_time) ORDER BY time_sql_timestamp)) (SELECT i.*, '' as file_path FROM input_cte AS w, input as i WHERE i.time_sql_timestamp BETWEEN ASYMMETRIC w.window_start AND w.window_end and EXISTS (SELECT * from context where context.username = i.usern and context.`file.modification-time` = i.context_time));
[INFO] Submitting SQL update statement to the cluster...
[INFO] SQL update statement has been successfully submitted to the cluster:
Job ID: 1c8d1927fe42104252f2b9e889a689bb

NOT EXISTS does not work

Flink SQL> INSERT INTO output WITH input_cte AS (SELECT time_sql_timestamp, window_start, window_end, window_time FROM (SELECT * from TABLE (TUMBLE(TABLE input, DESCRIPTOR(time_sql_timestamp), INTERVAL '10' SECOND)), context where (usern = context.username and context.`file.modification-time` = context_time) ORDER BY time_sql_timestamp)) (SELECT i.*, '' as file_path FROM input_cte AS w, input as i WHERE i.time_sql_timestamp BETWEEN ASYMMETRIC w.window_start AND w.window_end and not EXISTS (SELECT * from context where context.username = i.usern and context.`file.modification-time` = i.context_time));
[INFO] Submitting SQL update statement to the cluster...
[ERROR] Could not execute SQL statement. Reason:
org.apache.flink.table.api.TableException: Table sink 'default_catalog.default_database.output' doesn't support consuming update and delete changes which is produced by node Join(joinType=[LeftAntiJoin], where=[((username = usern) AND (file.modification-time = context_time0))], select=[sessionid, usern, time_sql_timestamp0, context_time, proc-time, context_time0], leftInputSpec=[NoUniqueKey], rightInputSpec=[NoUniqueKey])

Any help on how I can achieve this use case will be greatly appreciated. Thanks.

Neha
  • 225
  • 1
  • 5
  • 12

0 Answers0