Im working on a project that uses kafka producer and consumer in order to acquire articles (with specific topics) from a news_api every two hours and then with a consumer save them in a mongodb.
So i made three classes one for KafkaAdminClient one for KafkaProducer and one for KafkaConsumer.
The servers for my kafka are running on a docker container. The main application is a flask app, and thats where i start all the threads and the ones for kafka.
Ive been trying to change a lot of little things but it just seems very unstable and i dont know why. Firstly the data gets on the consumer and eventually in the mongodb at a random time. Then the old topics in the consumer dont get deleted and the database keeps getting populated with new and old values.
Now that i put a group in the consumer and i added the kafkaAdminClient class i dont get messages in the consumer at all. All i get is this:
articleretrieval-flask_api-1 | WARNING:kafka.cluster:Topic health is not available during auto-create initialization articleretrieval-flask_api-1 | WARNING:kafka.cluster:Topic business is not available during auto-create initialization articleretrieval-flask_api-1 | WARNING:kafka.cluster:Topic war is not available during auto-create initialization articleretrieval-flask_api-1
| WARNING:kafka.cluster:Topic motorsport is not available during auto-create initialization articleretrieval-flask_api-1
| WARNING:kafka.cluster:Topic sources is not available during auto-create initialization articleretrieval-flask_api-1
| WARNING:kafka.cluster:Topic science is not available during auto-create initialization articleretrieval-flask_api-1
| WARNING:kafka.cluster:Topic technology is not available during auto-create initialization articleretrieval-flask_api-1
| WARNING:kafka.cluster:Topic education is not available during auto-create initialization articleretrieval-flask_api-1
| WARNING:kafka.cluster:Topic space is not available during auto-create initialization articleretrieval-flask_api-1
| INFO:kafka.consumer.subscription_state:Updated partition assignment: [] articleretrieval-flask_api-1
| INFO:kafka.conn:<BrokerConnection node_id=bootstrap-0 host=kafka:29092 <connected> [IPv4 ('172.19.0.4', 29092)]>: Closing connection.
kafkaConsumerThread.py:
class KafkaConsumerThread:
def __init__(self, topics, db,logger):
self.topics = topics
self.db = db
self.logger = logger
def start(self):
self.logger.debug("Getting the kafka consumer")
try:
consumer = KafkaConsumer(bootstrap_servers=['kafka:29092'],
auto_offset_reset='earliest',
# group_id='my_group',
enable_auto_commit=False,
value_deserializer=lambda x: json.loads(x.decode('utf-8')))
except NoBrokersAvailable as err:
self.logger.error("Unable to find a broker: {0}".format(err))
time.sleep(1)
consumer.subscribe(self.topics + ["sources"])
for message in consumer:
self.logger(message)
if message.topic == "sources":
self.db.insert_source_info(message.value["source_name"], message.value["source_info"])
else:
self.db.insert_article(message.topic, [message.value])
def on_send_success(record_metadata):
return
# print(record_metadata.topic)
# print(record_metadata.partition)
def on_send_error(excp):
print(excp)
def call_apis(self, topics, news_api, media_api):
try:
producer = KafkaProducer(bootstrap_servers=['kafka:29092'],
max_block_ms=100000,
value_serializer=lambda x: json.dumps(x).encode('utf-8'))
except NoBrokersAvailable as err:
# self.logger.error("Unable to find a broker: {0}".format(err))
time.sleep(1)
domains = []
try:
if producer:
for topic in topics:
articles = news_api.get_articles(topic)
for article in articles:
if article['source'] != '':
if article['source'] not in domains:
domains.append(article['source'])
producer.send(topic, value=article).add_callback(on_send_success).add_errback(on_send_error)
producer.flush()
for domain in domains:
source_info = media_api.get_source_domain_info(domain)
if source_info:
producer.send("sources", value={"source_name": domain, "source_info": source_info}).add_callback(on_send_success).add_errback(on_send_error)
# Flush the producer to ensure all messages are sent
producer.flush()
except AttributeError:
self.logger.error("Unable to send message. The producer does not exist.")
class KafkaProducerThread:
def __init__(self, topics,logger):
self.topics = topics
self.news_api = NewsApi()
self.media_api = MediaWikiApi()
self.logger = logger
def start(self):
# Call the APIs immediately when the thread starts
call_apis(self, self.topics, self.news_api, self.media_api)
# Use a timer to schedule the next API call
timer = Timer(7200, self.start)
timer.start()
kafkaAdminClient.py:
class KafkaAdminThread:
def __init__(self,topics):
self.topics = topics
def start(self):
admin_client = KafkaAdminClient(
bootstrap_servers=['kafka:29092'],
client_id='my_client'
)
topic_list = []
for topic in self.topics:
topic_list.append(NewTopic(name=topic, num_partitions=1, replication_factor=1))
admin_client.create_topics(new_topics=topic_list, validate_only=False)
app.py:
if __name__ == "__main__":
# Creating a new connection with mongo
# threading.Thread(target=lambda: app.run(port=8080, host="0.0.0.0",debug=True,use_reloader=False)).start()
executor = ThreadPoolExecutor(max_workers=4)
producerThread = KafkaProducerThread(TOPICS,logging)
adminThread = KafkaAdminThread(TOPICS)
executor.submit(adminThread.start)
flaskThread = threading.Thread(target=lambda: app.run(port=8080, host="0.0.0.0", debug=True, use_reloader=False))
executor.submit(flaskThread.start())
time.sleep(15)
executor.submit(producerThread.start)
consumerThread = KafkaConsumerThread(TOPICS, db,logging)
executor.submit(consumerThread.start)
docker-compose.yml:
zookeeper:
image: wurstmeister/zookeeper
ports:
- "2181:2181"
kafka:
container_name: kafka_broker_1
image: wurstmeister/kafka
links:
- zookeeper
ports:
- "9092:9092"
- "29092:29092"
depends_on:
- zookeeper
environment:
KAFKA_ADVERTISED_HOSTNAME: kafka
KAFKA_ADVERTISED_LISTENERS: INSIDE://kafka:29092,OUTSIDE://localhost:9092
KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: INSIDE:PLAINTEXT,OUTSIDE:PLAINTEXT
KAFKA_LISTENERS: INSIDE://0.0.0.0:29092,OUTSIDE://0.0.0.0:9092
KAFKA_INTER_BROKER_LISTENER_NAME: INSIDE
KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
volumes:
- /var/run/docker.sock:/var/run/docker.sock%
flask_api:
build:
context: . #Very important it refers where the root will be for the build.
dockerfile: Dockerfile
links:
- kafka
environment:
- FLASK-KAFKA_BOOTSTRAP-SERVERS=kafka:29092
- SERVER_PORT=8080
ports:
- "8080:8080"
depends_on:
- kafka