I am slowly working my way into the world of Docker Compose and would like to build a data pipeline. The CSV data I produce to Kafka arrives in the topic, but the connector never writes it on to HDFS, so I suspect something is wrong with my connector configuration. Here is my docker-compose.yml:
version: '3.9'
services:
  db:
    image: postgres:9.6
container_name: kong-db
environment:
POSTGRES_USER: kong
POSTGRES_DB: kong
POSTGRES_PASSWORD: kongpass
restart: on-failure
volumes:
- kong-data:/var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U kong"]
interval: 30s
timeout: 10s
retries: 3
kong-migrations:
image: kong
container_name: kong-migration
environment:
KONG_DATABASE: postgres
KONG_PG_HOST: db
KONG_PG_PASSWORD: kongpass
command: kong migrations bootstrap
restart: on-failure
depends_on:
- db
kong-gateway:
image: kong
container_name: kong-gateway
environment:
KONG_DATABASE: postgres
KONG_PG_HOST: db
KONG_PG_USER: kong
KONG_PG_PASSWORD: kongpass
KONG_PROXY_ACCESS_LOG: /dev/stdout
KONG_ADMIN_ACCESS_LOG: /dev/stdout
KONG_PROXY_ERROR_LOG: /dev/stderr
KONG_ADMIN_ERROR_LOG: /dev/stderr
KONG_ADMIN_LISTEN: 0.0.0.0:8001
KONG_ADMIN_GUI_URL: http://localhost:8002
restart: on-failure
ports:
- "8000:8000"
- "8443:8443"
- "8001:8001"
- "8444:8444"
- "8002:8002"
- "8445:8445"
- "8003:8003"
- "8004:8004"
depends_on:
- kong-migrations
healthcheck:
test: ["CMD-SHELL", "curl -s -o /dev/null http://localhost:8001/status || echo 'error'"]
interval: 30s
timeout: 10s
retries: 3
kong-dashboard:
image: pantsel/konga
container_name: konga-dashboard
ports:
- 1337:1337
restart: on-failure
depends_on:
- kong-gateway
- db
healthcheck:
test: ["CMD-SHELL", "nc -z -w 1 localhost 1337"]
interval: 30s
timeout: 10s
retries: 3
    # note: curl -fL -o NUL -w "%{http_code}" http://localhost:1337/register must return http_code 200
zookeeper:
image: confluentinc/cp-zookeeper:latest
environment:
ZOOKEEPER_CLIENT_PORT: 2181
ZOOKEEPER_TICK_TIME: 2000
ports:
- 22181:2181
healthcheck:
test: ["CMD", "nc", "-vz", "localhost", "2181"]
interval: 30s
timeout: 10s
retries: 3
  kafka:
    image: confluentinc/cp-kafka:latest
ports:
- 29092:29092
environment:
KAFKA_BROKER_ID: 1
KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092,PLAINTEXT_HOST://localhost:29092
KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
KAFKA_CREATE_TOPICS: csv-topic:1:1
depends_on:
- zookeeper
healthcheck:
test: ["CMD", "nc", "-vz", "localhost", "9092"]
interval: 30s
timeout: 10s
retries: 3
kafka-hdfs-connector:
image: confluentinc/cp-kafka-connect:latest
container_name: kafka-hdfs-connector
environment:
CONNECT_BOOTSTRAP_SERVERS: kafka:9092
CONNECT_REST_PORT: 8083
CONNECT_GROUP_ID: connect-cluster
CONNECT_CONFIG_STORAGE_TOPIC: connect-configs
CONNECT_OFFSET_STORAGE_TOPIC: connect-offsets
CONNECT_STATUS_STORAGE_TOPIC: connect-statuses
CONNECT_KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter
CONNECT_VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter
      CONNECT_KEY_CONVERTER_SCHEMAS_ENABLE: "false"
      CONNECT_VALUE_CONVERTER_SCHEMAS_ENABLE: "false"
CONNECT_PLUGIN_PATH: /usr/share/java,/etc/kafka-connect/jars
CONNECT_HDFS_URL: hdfs://namenode:9000
#8020
CONNECT_HDFS_TOPIC_DIR: /kafka-data
CONNECT_REST_ADVERTISED_HOST_NAME: kafka-con
CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1
CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1
CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1
CONNECTOR_CLASS: io.confluent.connect.hdfs.HdfsSinkConnector
CONNECTOR_NAME: hdfs-connector
CONNECTOR_TASKS_MAX: 1
CONNECTOR_TOPICS: csv-topic
CONNECTOR_HDFS_FLUSH_SIZE: 1000
CONNECTOR_HDFS_COMPRESSION_TYPE: gzip
depends_on:
- kafka
- namenode
namenode:
image: bde2020/hadoop-namenode
#bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
container_name: namenode
restart: always
ports:
- 9870:9870
- 9000:9000
volumes:
- hadoop_namenode:/hadoop/dfs/name
environment:
CLUSTER_NAME: test
env_file:
- ./hadoop.env
datanode:
image: bde2020/hadoop-datanode
#bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
container_name: datanode
restart: always
volumes:
- hadoop_datanode:/hadoop/dfs/data
environment:
SERVICE_PRECONDITION: namenode:9870
env_file:
- ./hadoop.env
jupyter-pyspark-notebook:
image: jupyter/pyspark-notebook
container_name: jupyter-pyspark-notebook
ports:
- "8888:8888"
volumes:
- hadoop_datanode:/home/jovyan/work:ro
environment:
SERVICE_PRECONDITION: datanode:9864
depends_on:
- kafka-hdfs-connector
volumes:
  kong-data:
    name: kong-db-volume
  hadoop_namenode:
    name: hadoop-nn-volume
  hadoop_datanode:
    name: hadoop-dn-volume
  hadoop_historyserver:
    name: hadoop-hm-volume
networks:
  default:
    name: kong-net
I have tried checking the logs, changing the image versions, and making various changes to the environment variables, but nothing has helped.
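For context, the CSV reaches Kafka roughly like the following minimal sketch (not my exact script; it assumes the kafka-python package and the PLAINTEXT_HOST listener published on localhost:29092 above, and data.csv is a placeholder file name):

# Produce one Kafka message per CSV row to csv-topic.
import csv

from kafka import KafkaProducer  # pip install kafka-python

producer = KafkaProducer(bootstrap_servers="localhost:29092")

with open("data.csv", newline="") as f:
    for row in csv.reader(f):
        # each row becomes a plain comma-joined UTF-8 payload
        producer.send("csv-topic", ",".join(row).encode("utf-8"))

producer.flush()  # block until all messages are acknowledged by the broker
producer.close()

This part works: the messages show up in the topic.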
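If I understand the cp-kafka-connect image correctly, only the CONNECT_* variables configure the worker itself; the CONNECTOR_* variables in my compose file are, as far as I can tell, not read at all, and the connector has to be created through the worker's REST API once it is running. (I also believe recent cp-kafka-connect images no longer bundle the HDFS sink, so it may first need confluent-hub install confluentinc/kafka-connect-hdfs:latest inside the container.) Below is a minimal sketch of registering it, assuming the REST port is reachable from the host, which would need an extra "8083:8083" port mapping that my compose file is missing; the format.class value is my guess for plain string payloads:

# Create the HDFS sink connector via the Kafka Connect REST API.
import json
import urllib.request

connector = {
    "name": "hdfs-connector",
    "config": {
        "connector.class": "io.confluent.connect.hdfs.HdfsSinkConnector",
        "tasks.max": "1",
        "topics": "csv-topic",
        "hdfs.url": "hdfs://namenode:9000",
        "topics.dir": "/kafka-data",
        "flush.size": "1000",
        # assumption: write raw strings; the default (Avro) would not match
        # schemaless JSON / plain-text messages
        "format.class": "io.confluent.connect.hdfs.string.StringFormat",
    },
}

req = urllib.request.Request(
    "http://localhost:8083/connectors",
    data=json.dumps(connector).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    print(resp.status, resp.read().decode())

One thing I noticed while writing this down: with flush.size set to 1000, no file should appear in HDFS until 1000 records have accumulated per topic partition, which alone could explain an apparently silent connector during small tests.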
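Once a connector exists, the same REST API reports its state; a FAILED task includes the stack trace that would explain why nothing reaches HDFS:

# Check connector and task state via the Connect REST API.
import json
import urllib.request

url = "http://localhost:8083/connectors/hdfs-connector/status"
with urllib.request.urlopen(url) as resp:
    print(json.dumps(json.load(resp), indent=2))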
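Finally, to verify on the other end whether anything has landed in HDFS, the topics directory can be listed over WebHDFS — a sketch assuming the python hdfs package and the namenode's published port 9870 (the user name root is a guess based on the bde2020 images):

# List the connector's output directory over WebHDFS.
from hdfs import InsecureClient  # pip install hdfs

client = InsecureClient("http://localhost:9870", user="root")
print(client.list("/kafka-data"))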