
I wrote a PyFlink consumer that reads messages from Kafka. It works fine. Here is part of my code:

    def _read_from_kafka(self):
        bootstrap_server = self.config['source']['kafka']['host']
        bootstrap_port = self.config['source']['kafka']['port']
        kafka_topic = self.config['source']['kafka']['topic']

        deserialization_schema = self._get_serialization_schema()

        source = KafkaSource.builder() \
            .set_bootstrap_servers(f'{bootstrap_server}:{bootstrap_port}') \
            .set_topics(kafka_topic) \
            .set_starting_offsets(KafkaOffsetsInitializer.earliest()) \
            .set_value_only_deserializer(deserialization_schema) \
            .build()

        return source

    def load_data_to_pg(self):
        env = StreamExecutionEnvironment.get_execution_environment()
        env.add_jars(JDBC_JAR_PATH)

        db_host = self.config['target']['postgres']['host']
        db_port = self.config['target']['postgres']['port']
        db_name = self.config['target']['postgres']['db_name']
        db_schema = self.config['target']['postgres']['schema_name']
        db_table = self.config['target']['postgres']['table_name']
        db_username = self.config['target']['postgres']['username']
        db_password = self.config['target']['postgres']['password']
        postgres_driver = self.config['target']['postgres']['driver']

        target_attr_list = [target_attr for target_attr in self.config['rows'].keys()]
        target_attr_str = ", ".join(target_attr_list)

        source = self._read_from_kafka()

        env.from_source(source, WatermarkStrategy.no_watermarks(), "Kafka Source", self.type_info).add_sink(
            JdbcSink.sink(f"""INSERT INTO {db_schema}.{db_table}
                (
                    {target_attr_str}
                )
            values ({self.attrs_cnt_for_insert})""", self.type_info,
                          JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
                          .with_url(f'jdbc:postgresql://{db_host}:{db_port}/{db_name}')
                          .with_driver_name(postgres_driver)
                          .with_user_name(db_username)
                          .with_password(db_password)
                          .build(),
                          JdbcExecutionOptions.builder()
                          .with_batch_interval_ms(1000)
                          .with_batch_size(200)
                          .with_max_retries(5)
                          .build())
        )

        env.execute("test_kafka->flink->postgres")


if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")

    flink_consumer = FlinkConsumer()
    flink_consumer.load_data_to_pg()
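
For context, the sink above ends up executing a prepared INSERT statement built from my config — roughly something like this (purely illustrative; it assumes two target columns and that attrs_cnt_for_insert is just a comma-separated string of ? placeholders, one per column):

    # Illustration only (not my real config): what the SQL string passed to
    # JdbcSink.sink() would look like with two columns and attrs_cnt_for_insert == "?, ?"
    sql = "INSERT INTO my_schema.my_table (col_a, col_b) values (?, ?)"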

But COPY is much faster than INSERT. Is there any way to write the query with COPY instead? I tried getting the data from Kafka with the .execute_and_collect() method and writing it to stdin, but then how do I take that data and put it into the DB? I also tried writing the Kafka data to a CSV file, but got this error:

    py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.flink.formats.csv.PythonCsvUtils.createCsvBulkWriterFactory.
    : java.lang.IllegalAccessError: class org.apache.flink.formats.csv.PythonCsvUtils tried to access method 'org.apache.flink.api.common.serialization.BulkWriter$Factory org.apache.flink.formats.csv.CsvFileFormatFactory.createCsvBulkWriterFactory(org.apache.flink.shaded.jackson2.com.fasterxml.jackson.dataformat.csv.CsvSchema, org.apache.flink.table.types.logical.RowType)' (org.apache.flink.formats.csv.PythonCsvUtils is in unnamed module of loader org.apache.flink.util.ChildFirstClassLoader @66c61024; org.apache.flink.formats.csv.CsvFileFormatFactory is in unnamed module of loader 'app')

I've tried other ways to write a query using COPY, but there was always a problem with specifying a source for the query (I mean a source for the .add_sink method, for example). I think there is only one way: get the data from Kafka, write it to a CSV file, and finally load the file into the DB, but then I need to solve the exception above, and creating a file is not a very convenient approach anyway. I know that SQLAlchemy has a way to do COPY, but I need to write the query using Flink methods and, if possible, without creating a CSV file.
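
To make the stdin idea above more concrete, this is roughly the direction I have in mind (a simplified sketch, not working code — the DSN, the table/column names and the helper itself are placeholders, and it ignores escaping of tabs/newlines inside values): collect the rows on the client with execute_and_collect() and stream them into Postgres with COPY ... FROM STDIN from an in-memory buffer via psycopg2, so no CSV file ever touches disk.

    import io

    import psycopg2  # assuming psycopg2 is installed next to pyflink


    def copy_rows_to_pg(rows, dsn, table, columns):
        """Stream already-collected rows into Postgres via COPY, without a file on disk."""
        buffer = io.StringIO()
        for row in rows:
            # one tab-separated line per row; \N is the default NULL marker for COPY text format
            buffer.write("\t".join("\\N" if v is None else str(v) for v in row) + "\n")
        buffer.seek(0)

        conn = psycopg2.connect(dsn)
        try:
            with conn.cursor() as cur:
                cur.copy_expert(f"COPY {table} ({', '.join(columns)}) FROM STDIN", buffer)
            conn.commit()
        finally:
            conn.close()


    # rough usage idea: pull the rows out of the Kafka-backed stream and push them through COPY
    # with data_stream.execute_and_collect() as rows:
    #     copy_rows_to_pg(rows, "host=... dbname=... user=... password=...",
    #                     "my_schema.my_table", ["col_a", "col_b"])

But this runs entirely on the client side, which is exactly the part I'm unsure about: is there a proper way to do the COPY inside a Flink sink instead?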
