I researched the same problem and only found two solutions, both of which were over-complicated and inefficient: the most popular solution and GitLab's solution each use a web server. I propose the following alternative, which relies on a file's last-modified timestamp:
A job, scheduled with sidekiq-scheduler, is set to execute every 30s. The job does only one thing: it touches a temporary file in the project directory.
Note: Make sure the job runs with maximum priority, and that the probe configuration allows for a heartbeat to be missed. If there are other long-running high-priority jobs, the heartbeat may be delayed and the check might produce a false positive (a busy but healthy worker reported as unhealthy).
config/sidekiq.yml
# Kubernetes liveness probe for the worker container. This belongs under the
# container entry of the pod spec (NOTE(review): not in config/sidekiq.yml).
livenessProbe:
  exec:
    # Exit code 0 means healthy; any non-zero exit code counts as a failed probe.
    command: [ bin/check_worker_health ]
  # Wait 30s after the container starts before the first probe.
  initialDelaySeconds: 30
  # Probe every 30s, the same cadence as the heartbeat job.
  periodSeconds: 30
  # Tolerate a single missed heartbeat; act only on the second consecutive failure.
  failureThreshold: 2
app/jobs/worker_healthcheck_job.rb
require 'fileutils'

##
# Heartbeat job: refreshes the modification time of tmp/worker_healthcheck
# (creating the file when it is missing) so the health of the worker can
# be measured by: bin/check_worker_health
class WorkerHealthcheckJob < ApplicationJob
  queue_as :critical

  # Absolute path (String) of the heartbeat file inside the Rails project.
  HEALTHCHECK_FILE = Rails.root.join( 'tmp/worker_healthcheck' ).to_s

  ##
  # Touches HEALTHCHECK_FILE.
  #
  # Uses FileUtils instead of shelling out to `touch`, so no subshell is
  # spawned, paths containing spaces or shell metacharacters are handled
  # safely, and a failure raises (failing the job visibly) instead of being
  # silently ignored.
  def perform
    FileUtils.touch( HEALTHCHECK_FILE )
  end
end
The following script returns a non-zero exit code if the file doesn't exist (exit 1) or if it is stale (exit 2). Otherwise it removes the file, so that the next check requires a fresh heartbeat, and exits with 0:
bin/check_worker_health
#!/bin/sh
# Liveness check for the background worker.
#
# Exit codes:
#   0 - heartbeat file exists and is fresh (the file is then removed)
#   1 - heartbeat file does not exist
#   2 - heartbeat file was last modified more than $JOB_INTERVAL seconds ago

# Maximum accepted age of the heartbeat file, in seconds.
JOB_INTERVAL=30

# Resolve <project root>/tmp/worker_healthcheck relative to this script
# (the script lives in <project root>/bin). Every expansion is quoted so
# that paths containing spaces do not get word-split.
HEALTHCHECK_FILE="$(dirname "$(dirname "$(readlink -f "$0")")")/tmp/worker_healthcheck"

# No heartbeat since the previous successful check.
if [ ! -f "$HEALTHCHECK_FILE" ]; then
  exit 1 # File does not exist
fi

# Age of the heartbeat: current time minus the file's modification time.
AGE=$(( $(date +%s) - $(date -r "$HEALTHCHECK_FILE" +%s) ))
if [ "$AGE" -gt "$JOB_INTERVAL" ]; then
  exit 2 # File modified more than $JOB_INTERVAL seconds ago
fi

# Delete the file so the next check requires a new heartbeat
rm "$HEALTHCHECK_FILE"
exit 0
And just for posterity, here is a spec that makes sure it's all working correctly:
spec/jobs/worker_healthcheck_job_spec.rb
require 'rails_helper'
require 'fileutils'

# Covers both halves of the worker liveness check: the heartbeat job that
# touches the healthcheck file, and bin/check_worker_health that inspects it.
RSpec.describe WorkerHealthcheckJob, type: :job do
  describe '#perform' do
    it 'uses tmp/worker_healthcheck as its target file' do
      expect( described_class::HEALTHCHECK_FILE ).to eq Rails.root.join( 'tmp/worker_healthcheck' ).to_s
    end

    context 'when the healthcheck file does not exist' do
      before do
        delete_healthcheck_file
      end

      it 'creates the file' do
        described_class.perform_now
        expect( Pathname.new( described_class::HEALTHCHECK_FILE ) ).to exist
      end
    end

    context 'when the healthcheck already exists' do
      before do
        create_stale_healthcheck_file
      end

      it 'refreshes the modification time of the file' do
        described_class.perform_now
        expect( File.mtime( described_class::HEALTHCHECK_FILE ) ).to be > 1.second.ago
      end
    end
  end

  describe '#perform_later' do
    it 'enqueues the job in the critical queue' do
      expect { described_class.perform_later }.to have_enqueued_job.on_queue( :critical )
    end
  end

  describe 'bin/check_worker_health' do
    context 'without a healthcheck file' do
      before do
        delete_healthcheck_file
      end

      it 'exits with code 1' do
        expect( healthcheck_exit_status ).to eq 1
      end
    end

    context 'with a stale healthcheck file' do
      before do
        create_stale_healthcheck_file
      end

      it 'exits with code 2' do
        expect( healthcheck_exit_status ).to eq 2
      end
    end

    context 'after the worker healthcheck job' do
      before do
        described_class.perform_now
      end

      it 'exits with code 0' do
        expect( healthcheck_exit_status ).to eq 0
      end
    end
  end

  # Runs bin/check_worker_health and returns the exit status of the process.
  def healthcheck_exit_status
    system( Rails.root.join( 'bin/check_worker_health' ).to_s )
    $?.exitstatus
  end

  # Removes the healthcheck file; a missing file is not an error.
  def delete_healthcheck_file
    FileUtils.rm_f( described_class::HEALTHCHECK_FILE )
  end

  # Creates the healthcheck file (or updates an existing one) with a
  # modification time one hour in the past. Done in Ruby rather than through
  # GNU-specific `touch --date` / `date -d` flags so the spec is portable.
  def create_stale_healthcheck_file
    FileUtils.touch( described_class::HEALTHCHECK_FILE, mtime: Time.now - 3600 )
  end
end