I have an app (the master) that distributes work to n workers. Inside the worker script,
I have hooked the console output as follows:
// Keep the native console methods reachable as console._log / console._error
// so output can still be written locally when it cannot be forwarded.
console._log = console.log;
console._error = console.error;

/**
 * Builds a console replacement that forwards its arguments to the master
 * process as a `{ cmd: 'log', channel, data }` IPC message.
 *
 * @param {string} channel - Value of the message's `channel` field ('out' or 'err').
 * @param {string} fallbackName - Name of the saved native method on `console`
 *   (`_log` or `_error`) used when the message cannot be sent.
 * @returns {(...args: any[]) => void} The replacement console method.
 */
const forwardToMaster = (channel, fallbackName) => (...args) => {
  // process.send only exists when the process has an IPC channel, and sending
  // fails once that channel is disconnected. A logging call should never be
  // the thing that brings the worker down, so write locally in that case.
  if (typeof process.send !== 'function' || !process.connected) {
    console[fallbackName](...args);
    return;
  }
  process.send({
    cmd: 'log',
    channel,
    // NOTE: join() stringifies each argument with its toString(), so plain
    // objects arrive as "[object Object]" rather than an inspected form.
    data: args.join(' '),
  });
};

console.log = forwardToMaster('out', '_log');
console.error = forwardToMaster('err', '_error');
The master is responsible for logging all incoming messages to a file, in addition to stdout/stderr. This is accomplished with the following code and the intercept-stdout module:
const intercept = require('intercept-stdout');
const stripAnsi = require('strip-ansi');
// Builds an interceptor callback that appends every intercepted chunk, passed
// through stripAnsi, to the file whose path `resolvePath` returns. The path is
// resolved again on each call, and the callback itself returns nothing.
const appendTo = (resolvePath) => (chunk) => {
  fs.appendFileSync(resolvePath(), stripAnsi(chunk));
};

// The value returned by intercept() is kept under the name unhook_intercept.
const unhook_intercept = intercept(
  appendTo(() => lib.logOutFile()), // first callback: stdout
  appendTo(() => lib.logErrFile()), // second callback: stderr
);
I noticed in the logs that one worker stopped sending messages after about 1.5 days. In the master I have the following worker exit detection:
/**
 * Handles a worker's 'exit' event.
 *
 * - Killed by a signal: only logs the signal.
 * - Non-zero exit code: logs the code, forks a replacement worker, marks the
 *   old worker 'dead' in `status`, moves its entry in `work` to the new
 *   worker's pid and sends that work (plus `cfg.options`) to the new worker.
 * - Exit code 0: removes the worker from `work` and `status` and logs success.
 *
 * @param {object} worker - The worker that exited; `worker.process.pid` keys `work` and `status`.
 * @param {number} code - Exit code of the worker.
 * @param {string} signal - Name of the signal that killed the worker, if any.
 */
cluster.on('exit', (worker, code, signal) => {
  const { pid } = worker.process;
  // Common "<timestamp> - [<pid>]\tWorker: " prefix shared by every message below.
  const prefix = () =>
    `${lib.dateTimeStamp()} - ${chalk.magenta('[')}${chalk.cyan(pid)}${chalk.magenta(']')}\tWorker: `;

  if (signal) {
    // NOTE(review): a worker killed by a signal is neither restarted nor
    // removed from `work`/`status` here — confirm this is intentional.
    console.log(`${prefix()}${chalk.yellow(`was killed by signal: ${signal}`)}`);
    return;
  }

  if (code !== 0) {
    console.error(`${prefix()}${chalk.red(`exited with error code: ${code}`)}`);

    const newWorker = cluster.fork();
    const newPid = newWorker.process.pid;
    const data = work[pid];
    const d = new Date();

    // The old worker may have no status entry; guard so a TypeError here
    // cannot prevent the work from being handed to the replacement.
    if (status[pid]) {
      status[pid].status = 'dead';
    }
    status[newPid] = {
      started: `${d.toLocaleDateString()} ${d.toLocaleTimeString()}`,
      status: 'alive',
    };

    // Re-key the pending work from the dead worker to its replacement.
    delete work[pid];
    work[newPid] = data;
    newWorker.send({
      options: cfg.options,
      websites: data,
    });
    return;
  }

  // Clean exit: the worker is finished, so drop its bookkeeping entries.
  delete work[pid];
  delete status[pid];
  console.log(`${prefix()}${chalk.green('exited successfully')}`);
});
According to the logs, the exit event was never triggered. At the moment I only have assumptions, and I'd like your opinions. Could it be because of:
- The synchronous file logging.
- A worker disconnected on its own.
- A worker exited and the exit event was missed.
- Your opinion...