In my project I have multiple threads feeding data to a worker pool. The worker threads wait, using a condition variable, for new data. This works well for hours and sometime days, but from time to time a worker blocks forever in its std::condition_variable::wait_for()
. No timeout and no notify wakes it. I verified that the worker thread is still there, it just never leaves the wait_for()
.
The workers job usually takes 4-6 milliseconds. On average every 5 milliseconds a new job is added by pushWork()
. Those new jobs are not created at regular intervals, but in bursts of 10-20 jobs. Instead of the complex real job I use a dummy with a sleep in my example code below.
Why can this sporadic blocking forever happen?
Simplified version of my code, reduced to the important parts:
#include <chrono>
#include <thread>
#include <condition_variable>
#include <mutex>
#include <vector>
#include <iostream>
#include <cstdlib>
#include <atomic>
class TestWorker
{
public:
enum JobState
{
free = 0,
processing
};
TestWorker() : workerThread(&TestWorker::doWork, this)
{
}
~TestWorker()
{
stopWorker = true;
if (workerThread.joinable())
{
workerTrigger.notify_one();
workerThread.join();
}
}
void pushWork(const int data)
{
while (JobState::free != state)
{
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
std::lock_guard<std::mutex> lock(workerMutex);
while (JobState::free != state)
{
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
// Set dummy data for job ...
dummyData = data;
state = JobState::processing;
workerTrigger.notify_one();
}
void doWork()
{
std::unique_lock<std::mutex> triggerLock(workerMutex);
while (!stopWorker)
{
if (JobState::processing == state)
{
// Do something for 4-6ms with dummy job data
++dummyData;
std::this_thread::sleep_for(std::chrono::milliseconds(5)); // Dummy sleep instead of real job
state = JobState::free;
}
else
{
if (!workerTrigger.wait_for(triggerLock,
std::chrono::milliseconds(500),
[this]
{ return (stopWorker || (JobState::processing == state)); }))
{
if (!stopWorker)
{
std::cout << "TIMEOUT" << std::endl;
}
else
{
std::cout << "EXIT" << std::endl;
}
}
}
}
}
std::mutex workerMutex;
std::condition_variable workerTrigger;
bool stopWorker = false;
std::thread workerThread;
std::atomic<JobState> state = { JobState::free };
// Dummy data for job ...
int dummyData = 0;
};
int main()
{
std::cout << "START" << std::endl;
std::vector<TestWorker> testWorkers(10);
for (int i = 1; i <= 1000; ++i)
{
if (0 == i % 10)
{
std::cout << "Loop #" << i << std::endl;
}
for (auto it = testWorkers.begin(); it != testWorkers.end(); ++it)
{
it->pushWork(i);
}
}
std::cout << "END" << std::endl;
return 0;
}
Compiled with: g++ workerTriggerTest.cc -o workerTriggerTest -pthread -std=c++11
gcc-Version 4.8.5 20150623 (Red Hat 4.8.5-36)
CentOS Linux release 7.6.1810