I've got a Celery task which is supposed to run in an infinite loop, listening to a few RabbitMQ queues (not related to Celery's internals). When a message is retrieved from one of these queues, this long-running task dispatches the message to be processed by some other task.

How can I implement such a use case properly in Celery?
I run Celery with concurrency 3 and the -Ofair flag.

My current observation is that after a few days this setup stops processing tasks from Celery's internal queue. The long-running task seems to get restarted for some reason, and eventually all 3 worker processes are busy with it alone, so there are no workers left to process tasks from the Celery queue.

I thought about a file-based lock to make sure only one worker can acquire it and become this long-running task, but I'm not sure that's a good option; I suspect there are better solutions for this problem.
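A cache-based lock might also be simpler than a file-based one. The code below already logs "lock acquired" and the commented-out on_iteration calls release_lock(), so for completeness here is a minimal sketch of what such a lock could look like, assuming a Django cache backed by memcached (where cache.add is atomic); all names here are illustrative, not my actual code:

from django.core.cache import cache

LOCK_ID = 'couriers-consumer-lock'
LOCK_EXPIRE = 60 * 10  # seconds; must outlive any gap between task retries

def acquire_lock():
    # cache.add only sets the key if it does not already exist and
    # reports whether it did, which makes it usable as a simple lock
    return cache.add(LOCK_ID, 'locked', LOCK_EXPIRE)

def release_lock():
    cache.delete(LOCK_ID)

The task would call acquire_lock() on entry and return immediately when another worker already holds the lock. Here is my current code: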
# assumed imports: logging and kombu's Connection, plus the Celery app instance
@app.task(bind=True)  # decorator assumed; the task must be bound, since it calls self.retry
def init_couriers_consumers(self):
    logger.info("lock acquired")
    logger.info("TASK ID: {}".format(init_couriers_consumers.request.id))
    with Connection('amqp://guest:guest@localhost:5672//') as conn:
        couriers_consumer_worker = ConsumerWorker(conn)
        couriers_consumer_worker.run()  # blocks until should_stop becomes True
    # run() returned, so reset the stop flag and re-schedule this task
    couriers_consumer_worker.should_stop = False
    # cache.set('reboot', False)
    self.retry(countdown=2)
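For context, this task is enqueued with a fixed task ID (couriers_consumer shows up as the task ID in the logs below), presumably via something like:

init_couriers_consumers.apply_async(task_id='couriers_consumer')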
class ConsumerWorker(ConsumerMixin):
    # ConsumerMixin comes from kombu.mixins

    def __init__(self, connection):
        self.connection = connection
        self._create_queues()

    def _create_queues(self):
        from courier.models import Courier
        self.queues = []
        logger.info("create_queues")
        for courier in Courier.objects.filter(user__is_active=True):
            logger.info("create_queue for courier: {}".format(courier.user.username))
            self._create_courier_queues(courier.user.username)

    def _create_courier_queues(self, courier_username):
        self.queues.append(QueuesFactory.get_consumer_order_status_queue(courier_username))
        self.queues.append(QueuesFactory.get_consumer_status_queue(courier_username))
        self.queues.append(QueuesFactory.get_consumer_gps_queue(courier_username))

    def get_consumers(self, Consumer, channel):
        logger.info("Subscribing to queues: {}".format(str(self.queues)))
        return [Consumer(queues=self.queues,
                         callbacks=[self.process_message])]

    def process_message(self, body, message):
        logger.info("process message")
        from courier.api.tasks import process_message_task, error_handler_task
        # hand the message off to a regular Celery task so the consumer loop
        # itself stays free to keep reading from the queues
        process_message_task.apply_async(
            (message.delivery_info['routing_key'], message.payload),
            link_error=error_handler_task.s())
        logger.info("after process message")
        message.ack()

    def on_connection_revived(self):
        logger.info("revived")

    def on_consume_ready(self, connection, channel, consumers, **kwargs):
        logger.info("on consumer ready")

    def on_consume_end(self, connection, channel):
        logger.info("on consume end")

    # def on_iteration(self):
    #     if cache.get('reboot'):
    #         logger.info("SHOULD STOP")
    #         self.should_stop = True
    #         release_lock()
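QueuesFactory is not shown here; judging from the queue and exchange names in the logs below, it presumably builds kombu Queue objects bound to a direct couriers exchange, roughly like this (an illustrative reconstruction, not the actual code):

from kombu import Exchange, Queue

couriers_exchange = Exchange('couriers', type='direct')

class QueuesFactory(object):

    @staticmethod
    def get_consumer_order_status_queue(username):
        name = 'from/{}/order/status'.format(username)
        return Queue(name, exchange=couriers_exchange, routing_key=name)

    @staticmethod
    def get_consumer_status_queue(username):
        name = 'from/{}/status'.format(username)
        return Queue(name, exchange=couriers_exchange, routing_key=name)

    @staticmethod
    def get_consumer_gps_queue(username):
        name = 'from/{}/gps'.format(username)
        return Queue(name, exchange=couriers_exchange, routing_key=name)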
Logs after a fresh start:
[2016-11-14 15:47:36,652: INFO/MainProcess] Connected to amqp://guest:**@localhost:5672//
[2016-11-14 15:47:36,665: INFO/MainProcess] mingle: searching for neighbors
[2016-11-14 15:47:37,677: INFO/MainProcess] mingle: all alone
[2016-11-14 15:47:37,692: WARNING/MainProcess] celery@ip-178-216-202-251.e24cloud.com ready.
[2016-11-14 15:47:39,686: INFO/MainProcess] Received task: courier.api.consumers.init_couriers_consumers[couriers_consumer]
[2016-11-14 15:47:39,686: INFO/MainProcess] Received task: courier.api.consumers.init_producer_queues[91d7c307-8eed-4966-83ad-8b001e2459e5]
[2016-11-14 15:47:39,687: INFO/Worker-2] lock acquired
[2016-11-14 15:47:39,688: INFO/Worker-2] TASK ID: couriers_consumer
[2016-11-14 15:47:39,692: INFO/Worker-2] create_queues
[2016-11-14 15:47:40,308: INFO/Worker-2] create_queue for courier: courier1
[2016-11-14 15:47:40,322: INFO/Worker-2] revived
[2016-11-14 15:47:40,322: INFO/Worker-2] Connected to amqp://guest:**@localhost:5672//
[2016-11-14 15:47:40,325: INFO/Worker-2] Subscribing to queues: [<unbound Queue from/courier1/order/status -> <unbound Exchange couriers(direct)> -> from/courier1/order/status>, <unbound Queue from/courier1/status -> <unbound Exchange couriers(direct)> -> from/courier1/order/status>, <unbound Queue from/courier1/gps -> <unbound Exchange couriers(direct)> -> from/courier1/gps>]
[2016-11-14 15:47:40,333: INFO/Worker-2] on consumer ready
[2016-11-14 15:47:40,554: INFO/MainProcess] Task courier.api.consumers.init_producer_queues[91d7c307-8eed-4966-83ad-8b001e2459e5] succeeded in 0.864124746993s: None
But then, after a few days, I see (grepping the logs for "revived"):
[2016-11-13 05:35:09,502: INFO/Worker-1] revived
[2016-11-14 05:58:17,716: INFO/Worker-3] revived
[2016-11-14 12:33:25,774: INFO/Worker-2] revived
which probably means that each worker has ended up inside this long-running task, but I'm not sure how this state comes about.
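One direction I'm considering instead of (or in addition to) the lock, sketched below under the assumption of standard Celery task routing (the setting is task_routes in Celery 4; older versions use CELERY_ROUTES; the queue name is illustrative): route this long-running task to its own queue, consumed by a dedicated worker, so it can never occupy the workers serving the regular Celery queue.

# in the Celery config; Celery 4 style, older versions use CELERY_ROUTES
app.conf.task_routes = {
    'courier.api.consumers.init_couriers_consumers': {
        'queue': 'couriers_consumer_queue',
    },
}

A separate worker started with -Q couriers_consumer_queue -c 1 would then be the only process running the loop, while the main worker keeps serving the default queue with concurrency 3.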