I wrote a PyFlink consumer that gets messages from Kafka. It works fine. Here is a part of my code:
import logging
import sys

from pyflink.common import WatermarkStrategy
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors.jdbc import (JdbcConnectionOptions,
                                                JdbcExecutionOptions, JdbcSink)
from pyflink.datastream.connectors.kafka import (KafkaOffsetsInitializer,
                                                 KafkaSource)

def _read_from_kafka(self):
    bootstrap_server = self.config['source']['kafka']['host']
    bootstrap_port = self.config['source']['kafka']['port']
    kafka_topic = self.config['source']['kafka']['topic']
    deserialization_schema = self._get_serialization_schema()
    source = KafkaSource.builder() \
        .set_bootstrap_servers(f'{bootstrap_server}:{bootstrap_port}') \
        .set_topics(kafka_topic) \
        .set_starting_offsets(KafkaOffsetsInitializer.earliest()) \
        .set_value_only_deserializer(deserialization_schema) \
        .build()
    return source
def load_data_to_pg(self):
    env = StreamExecutionEnvironment.get_execution_environment()
    env.add_jars(JDBC_JAR_PATH)
    db_host = self.config['target']['postgres']['host']
    db_port = self.config['target']['postgres']['port']
    db_name = self.config['target']['postgres']['db_name']
    db_schema = self.config['target']['postgres']['schema_name']
    db_table = self.config['target']['postgres']['table_name']
    db_username = self.config['target']['postgres']['username']
    db_password = self.config['target']['postgres']['password']
    postgres_driver = self.config['target']['postgres']['driver']
    target_attr_list = list(self.config['rows'].keys())
    target_attr_str = ", ".join(target_attr_list)
    source = self._read_from_kafka()
    env.from_source(source, WatermarkStrategy.no_watermarks(),
                    "Kafka Source", self.type_info).add_sink(
        JdbcSink.sink(
            f"""INSERT INTO {db_schema}.{db_table}
                ({target_attr_str})
                VALUES ({self.attrs_cnt_for_insert})""",
            self.type_info,
            JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
                .with_url(f'jdbc:postgresql://{db_host}:{db_port}/{db_name}')
                .with_driver_name(postgres_driver)
                .with_user_name(db_username)
                .with_password(db_password)
                .build(),
            JdbcExecutionOptions.builder()
                .with_batch_interval_ms(1000)
                .with_batch_size(200)
                .with_max_retries(5)
                .build()))
    env.execute("test_kafka->flink->postgres")
if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(message)s")
    flink_consumer = FlinkConsumer()
    flink_consumer.load_data_to_pg()
But COPY is much faster than INSERT. Is there any way to write the query with COPY? I tried to get the data from Kafka with the .execute_and_collect() method and write it to stdin, but how do I take that data and put it into the DB? I also tried writing from Kafka to a CSV file; the attempt looked roughly like the sketch below, and it fails with the error after it.
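A reconstruction of the CSV attempt (the column names here are placeholders, the real ones come from my config; ds is the stream returned by env.from_source above). CsvBulkWriters.for_schema() is what ends up calling the PythonCsvUtils.createCsvBulkWriterFactory method from the traceback:

# Sketch of the CSV attempt; column names are placeholders.
from pyflink.datastream.connectors.file_system import FileSink
from pyflink.datastream.formats.csv import CsvBulkWriters, CsvSchema

schema = CsvSchema.builder() \
    .add_string_column('col_a') \
    .add_string_column('col_b') \
    .build()

# Writing the stream as bulk CSV files triggers the IllegalAccessError below.
ds.sink_to(
    FileSink.for_bulk_format('/tmp/kafka_dump',
                             CsvBulkWriters.for_schema(schema)).build())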
py4j.protocol.Py4JJavaError: An error occurred while calling z:org.apache.flink.formats.csv.PythonCsvUtils.createCsvBulkWriterFactory.
: java.lang.IllegalAccessError: class org.apache.flink.formats.csv.PythonCsvUtils tried to access method 'org.apache.flink.api.common.serialization.BulkWriter$Factory org.apache.flink.formats.csv.CsvFileFormatFactory.createCsvBulkWriterFactory(org.apache.flink.shaded.jackson2.com.fasterxml.jackson.dataformat.csv.CsvSchema, org.apache.flink.table.types.logical.RowType)' (org.apache.flink.formats.csv.PythonCsvUtils is in unnamed module of loader org.apache.flink.util.ChildFirstClassLoader @66c61024; org.apache.flink.formats.csv.CsvFileFormatFactory is in unnamed module of loader 'app')
I've tried other ways to write a query using COPY, but there was always a problem specifying a source for the query (a source for the .add_sink method, for example). I think there is only one way: get the data from Kafka, write it to a CSV file, and finally read the file and put it into the DB, but then I need to solve the exception above. Creating a file also doesn't seem very convenient. I know that SQLAlchemy has a method for COPY, but I need to write the query using Flink methods and, if possible, without creating a CSV file. To make the stdin idea concrete, the direction I had in mind is sketched below.
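A minimal sketch of that idea, assuming psycopg2 (copy_rows_to_pg and its parameters are a hypothetical helper, not Flink API): pull records to the client with execute_and_collect(), build an in-memory CSV buffer, and stream it to Postgres with COPY ... FROM STDIN, so no file on disk is needed. Is something like this (or a proper Flink-side equivalent) possible?

# A sketch, not working Flink code: collect records on the client, then COPY
# them into Postgres from an in-memory buffer. Assumes psycopg2 is installed;
# copy_rows_to_pg is hypothetical.
import io
import psycopg2

def copy_rows_to_pg(rows, conn_params, table, columns):
    buf = io.StringIO()
    for row in rows:
        # Naive CSV encoding; real values would need quoting/escaping.
        buf.write(','.join(str(v) for v in row) + '\n')
    buf.seek(0)
    # The 'with conn' block commits on successful exit.
    with psycopg2.connect(**conn_params) as conn:
        with conn.cursor() as cur:
            # COPY ... FROM STDIN reads straight from the buffer, no file.
            cur.copy_expert(
                f"COPY {table} ({', '.join(columns)}) FROM STDIN WITH (FORMAT csv)",
                buf)

# Feeding it from the stream (this materializes everything on the client):
# rows = list(ds.execute_and_collect())
# copy_rows_to_pg(rows, conn_params, f'{db_schema}.{db_table}', target_attr_list)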