I'm trying to build a custom tumbling window with PyFlink, reading data from a Kafka source. I have a Kafka input topic with 7 partitions. My KafkaProducer sends a message every 15 seconds. Each message is sent to a specific partition based on its key (id_channel), and both the message and the key are serialized as JSON. Here is the producer:
import json
import os
import uuid
from datetime import datetime

from kafka import KafkaProducer


class Producer:
    def __init__(self, topic):
        self.producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                                      value_serializer=self.json_serializer,
                                      key_serializer=self.json_serializer)
        self.topic = topic

    def create_message(self):
        # get_channel() returns a (channel, id_channel) pair (defined elsewhere)
        channel, id_channel = self.get_channel()
        id_client = str(uuid.uuid1())
        os.environ['TZ'] = 'Europe/Rome'
        current_millis = round(datetime.now().timestamp() * 1000)
        return {
            "tms": current_millis,
            "id_client": id_client,
            "channel": channel,
            "id_channel": id_channel
        }

    def send(self, msg, key=None):
        self.producer.send(topic=self.topic, value=msg, key=key) \
            .add_callback(self.on_send_success) \
            .add_errback(self.on_send_error)

    @staticmethod
    def json_serializer(msg):
        return json.dumps(msg).encode('utf-8')
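This is roughly how I drive the producer (just a sketch: get_channel(), on_send_success and on_send_error are defined elsewhere, and sending the key as a JSON object with id_channel is simply how I intend the 'key.fields' option below to read it):

import time

producer = Producer('audienceInput')
while True:
    msg = producer.create_message()
    # key is a JSON object holding id_channel, so all messages for the same
    # channel end up on the same partition
    producer.send(msg, key={'id_channel': msg['id_channel']})
    time.sleep(15)  # one message every 15 seconds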
Messages are correctly sent to the "audienceInput" topic. Now I want to read the messages in streaming mode using the StreamTableEnvironment, group by id_channel, and count how many users are logged in to each channel over tumbling windows of 60 seconds. I set up the environment and created the Kafka source/sink tables with DDL:
import os

from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import EnvironmentSettings, StreamTableEnvironment
from pyflink.table.expressions import col, lit
from pyflink.table.window import Tumble

# 1. Create streaming environment
env = StreamExecutionEnvironment.get_execution_environment()
settings = EnvironmentSettings.new_instance() \
    .in_streaming_mode() \
    .use_blink_planner() \
    .build()

# 2. Create table environment
tbl_env = StreamTableEnvironment.create(stream_execution_environment=env,
                                        environment_settings=settings)
tbl_conf = tbl_env.get_config().get_configuration()

# 3. Add Kafka connector dependency
kafka_jar = os.path.join(os.path.abspath(os.path.dirname(__file__) + "/jars/"),
                         'flink-sql-connector-kafka_2.11-1.14.2.jar')
tbl_conf.set_string("pipeline.jars", "file://{}".format(kafka_jar))
src_ddl = """
CREATE TABLE audienceInput (
tms BIGINT,
id_client STRING,
channel STRING,
id_channel STRING,
proctime AS PROCTIME()
) WITH (
'connector' = 'kafka',
'topic' = 'audienceInput',
'properties.bootstrap.servers' = 'localhost:9092',
'scan.startup.mode' = 'earliest-offset',
'value.format' = 'json',
'key.format' = 'json',
'key.fields' = 'id_channel'
)
"""
snk_ddl = """
CREATE TABLE audienceOutput (
id_channel STRING,
window_start TIMESTAMP(3),
window_end TIMESTAMP(3),
num_users INT
) WITH (
'connector' = 'kafka',
'topic' = 'audienceOutput',
'properties.bootstrap.servers' = 'localhost:9092',
'format' = 'json',
'json.ignore-parse-errors' = 'true'
)
"""
tbl_env.execute_sql(src_ddl)
tbl_env.execute_sql(snk_ddl)
# Create and initiate loading of source Table
tbl = tbl_env.from_path('audienceInput')
# Aggregate
audience_measurement = tbl.window(Tumble.over(lit(60).seconds).on(tbl.proctime).alias('w')) \
    .group_by(col('w'), tbl.id_channel) \
    .select(tbl.id_channel,
            col('w').start.alias('window_start'),
            col('w').end.alias('window_end'),
            tbl.id_client.count.alias('num_users'))
audience_measurement.execute_insert('audienceOutput').wait()
tbl_env.execute_sql("SELECT * FROM audienceInput").print()
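For what it's worth, I expect the Table API query above to be roughly equivalent to this SQL (just a sketch using the group-window syntax; the CAST is only there to match the INT column I declared in the sink):

tumbling_sql = """
    INSERT INTO audienceOutput
    SELECT
        id_channel,
        TUMBLE_START(proctime, INTERVAL '60' SECOND) AS window_start,
        TUMBLE_END(proctime, INTERVAL '60' SECOND) AS window_end,
        CAST(COUNT(id_client) AS INT) AS num_users
    FROM audienceInput
    GROUP BY id_channel, TUMBLE(proctime, INTERVAL '60' SECOND)
"""
tbl_env.execute_sql(tumbling_sql).wait()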
However, when I execute the program the following error is raised: java.lang.ClassNotFoundException: org.apache.kafka.common.serialization.ByteArrayDeserializer. I think the problem is on the source side: the error is raised while parsing the incoming data.
As suggested, I've added the kafka-clients dependency. Now a new error is raised: Failed to execute sql.
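Concretely, I changed the pipeline.jars line to point at both jars (a sketch; pipeline.jars accepts a semicolon-separated list of file URLs, and the kafka-clients version/path here is just what I have locally):

kafka_clients_jar = os.path.join(os.path.abspath(os.path.dirname(__file__) + "/jars/"),
                                 'kafka-clients-2.8.1.jar')
# pipeline.jars takes a ';'-separated list of jar URLs
tbl_conf.set_string("pipeline.jars",
                    "file://{};file://{}".format(kafka_jar, kafka_clients_jar))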
What am I doing wrong? Am I missing any dependencies?
Thanks.