I am building a Windows service in C# that will monitor a job queue and, when it finds items available in the queue, launch jobs that 'completely' process them (including failures). I'm using Task.Factory.StartNew() and, admittedly, I'm very green in TAP (heading off to read blogs after finishing this post).
Requirements
Poll a database queue at regular intervals looking for available jobs. (Let's ignore the message queue vs. database queue argument for the purposes of this question.)
Launch jobs asynchronously so that the polling can continue to monitor the queue and launch new jobs.
Honor a 'job' threshold so that too many jobs do not get spawned.
'Delay' the shutdown of the service if jobs are processing.
Make sure failures in jobs are logged to the Event Log and do not crash the Windows Service.
My code is below, but here are my questions/concerns (if there is a better place to post this, let me know). They mostly revolve around the proper use of TAP and its 'stability'. Note that most of my questions are documented in the code as well.
Questions
In PollJobs, is the way I use Task.Factory.StartNew/ContinueWith the appropriate usage to keep throughput high in this job processing service? I'll never be blocking any threads and am hopefully using the correct pattern for the tiny bit of TAP I currently have.
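For comparison, this is roughly the async/await alternative I've been considering instead of StartNew/ContinueWith. It's only a sketch: RunJobAsync, key and token are placeholders, not code I actually have; JobProvider, runningJobs and eventLog are as in the code below.

// Sketch only: Task.Run with an async delegate, with the ContinueWith
// cleanup moved into a finally block. RunJobAsync is hypothetical.
var task = Task.Run( async () =>
{
    try
    {
        await RunJobAsync( key, token );
        JobProvider.UpdateJobStatus( key, 2, 0 );    // Success
    }
    catch ( OperationCanceledException )
    {
        JobProvider.UpdateJobStatus( key, 3, 0 );    // Cancelled
    }
    catch ( Exception ex )
    {
        eventLog.WriteEntry( $"Exception for {key}: {ex.Message}", EventLogEntryType.Error );
        JobProvider.UpdateJobStatus( key, 4, 0 );    // Failed
    }
    finally
    {
        // Replaces the ContinueWith cleanup in my current code.
        RunningJob completed;
        runningJobs.TryRemove( key, out completed );
    }
}, token );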
ConcurrentDictionary - I'm using this to monitor currently running jobs, and each job, as it finishes, removes itself from the dictionary (on a separate thread, I assume, from the Task.Factory.StartNew). Because it is a ConcurrentDictionary, I assume I don't need any locks anywhere when using it?
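To be concrete, this is the lock-free pattern I'm relying on (a simplified version of what the code below does):

// The launching thread adds the job to the workload monitor...
runningJobs.TryAdd( job.Key, job );

// ...and the continuation, on a thread pool thread, removes it when done.
RunningJob completedJob;
runningJobs.TryRemove( job.Key, out completedJob );

// Reads are also done without any lock.
var availableProcesses = Math.Max( 0, 10 - runningJobs.Count );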
Job Exceptions (the worst one being OutOfMemoryException) - Any exceptions during job processing cannot bring down the service and must be logged correctly in the Event Log and the database queue. Currently there are jobs that, unfortunately, can throw OutOfMemoryException. Is the try/catch inside the 'job processing' good enough to catch and handle all scenarios so that the Windows service will never terminate unexpectedly? Or would it be better/safer to launch an AppDomain for each job for more isolation? (overkill?)
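For that last question, this is roughly the per-job AppDomain isolation I had in mind. It's a rough sketch only; JobRunner is a hypothetical MarshalByRefObject, not something in my code.

// Hypothetical sketch: run each job in its own AppDomain.
// JobRunner would derive from MarshalByRefObject in the service assembly.
var domain = AppDomain.CreateDomain( $"Job_{job.Key}" );
try
{
    var runner = (JobRunner)domain.CreateInstanceAndUnwrap(
        typeof( JobRunner ).Assembly.FullName,
        typeof( JobRunner ).FullName );
    runner.Run( job.Key );    // cross-domain call; serializable exceptions marshal back
}
finally
{
    AppDomain.Unload( domain );    // tears down whatever the job allocated
}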
I've seen arguments debating the 'proper' Timer to use, with no decent answers. Any opinion on my setup and use of System.Threading.Timer (specifically around how I ensure PollJobs is never called again until the previous call finishes)?
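Boiled down, the timer pattern I'm asking about looks like this (OneShotPoller is just an illustration, not part of the service):

// One-shot timer: the period is Timeout.Infinite, so the timer fires once
// and is only re-armed by the callback after it has completely finished.
class OneShotPoller
{
    private System.Threading.Timer timer;

    public void Start()
    {
        timer = new System.Threading.Timer( Poll, null,
            (int)TimeSpan.FromSeconds( 5 ).TotalMilliseconds, Timeout.Infinite );
    }

    private void Poll( object state )
    {
        try
        {
            // ... one polling pass ...
        }
        finally
        {
            // The previous call has fully finished; schedule the next one.
            timer?.Change( (int)TimeSpan.FromSeconds( 1 ).TotalMilliseconds, Timeout.Infinite );
        }
    }
}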
If you've made it this far, thanks in advance.
Code
public partial class EvolutionService : ServiceBase
{
EventLog eventLog = new EventLog() {
Source = "BTREvolution",
Log = "Application"
};
System.Threading.Timer timer;
int pollingSeconds = 1;
// Dictionary of currently running jobs so I can query current workload.
// Since ConcurrentDictionary, hoping I don't need any lock() { } code.
private ConcurrentDictionary<Guid, RunningJob> runningJobs =
new ConcurrentDictionary<Guid, RunningJob>();
public EvolutionService( string[] args )
{
InitializeComponent();
if ( !EventLog.SourceExists( eventLog.Source ) )
{
EventLog.CreateEventSource(
eventLog.Source,
eventLog.Log );
}
}
protected override void OnStart( string[] args )
{
// Only run the polling code one time; PollJobs will
// initiate the next poll interval so that PollJobs is never
// called again before it finishes its processing, http://stackoverflow.com/a/1698409/166231
timer = new System.Threading.Timer(
PollJobs, null,
(int)TimeSpan.FromSeconds( 5 ).TotalMilliseconds,
Timeout.Infinite );
}
protected override void OnPause()
{
// Disable the timer polling so no more jobs are processed
timer?.Dispose();
timer = null;
// Don't allow pause if any jobs are running
if ( runningJobs.Count > 0 )
{
var searcher = new System.Management.ManagementObjectSearcher(
"SELECT UserName FROM Win32_ComputerSystem" );
var collection = searcher.Get();
var username =
(string)collection
.Cast<System.Management.ManagementBaseObject>()
.First()[ "UserName" ];
throw new InvalidOperationException( $"{username} requested pause. The service will not process incoming jobs, but it must finish the {runningJobs.Count} running job(s) before it can be paused." );
}
base.OnPause();
}
protected override void OnContinue()
{
// Tell the timer to start polling one time in 5 seconds
timer = new System.Threading.Timer(
PollJobs, null,
(int)TimeSpan.FromSeconds( 5 ).TotalMilliseconds,
Timeout.Infinite );
base.OnContinue();
}
protected override void OnStop()
{
// Disable the timer polling so no more jobs are processed
timer?.Dispose();
timer = null;
// Until all jobs successfully cancel, keep requesting more time
// http://stackoverflow.com/a/13952149/166231
var task = Task.Run( () =>
{
// If any running jobs, send the Cancel notification
if ( runningJobs.Count > 0 )
{
foreach ( var job in runningJobs )
{
eventLog.WriteEntry(
$"Cancelling job {job.Value.Key}" );
job.Value.CancellationSource.Cancel();
}
}
// When a job cancels (and thus completes) it'll
// be removed from the runningJobs workload monitor.
// While any jobs are running, just wait another second
while ( runningJobs.Count > 0 )
{
Task.Delay( TimeSpan.FromSeconds( 1 ) ).Wait();
}
} );
// While the task is not finished, every 5 seconds
// I'll request an additional 5 seconds
while ( !task.Wait( TimeSpan.FromSeconds( 5 ) ) )
{
RequestAdditionalTime(
(int)TimeSpan.FromSeconds( 5 ).TotalMilliseconds );
}
}
public void PollJobs( object state )
{
// If no jobs processed, then poll at regular interval
var timerDue =
(int)TimeSpan.FromSeconds( pollingSeconds ).TotalMilliseconds;
try
{
// Could define some sort of threshold here so it
// doesn't get too bogged down, might have to check
// Jobs by type to know whether 'batch' or 'single'
// type jobs, for now, just not allowing more than
// 10 jobs to run at once.
var availableProcesses =
Math.Max( 0, 10 - runningJobs.Count );
if ( availableProcesses == 0 ) return;
var availableJobs =
JobProvider.TakeNextAvailableJobs( availableProcesses );
foreach ( var j in availableJobs )
{
// If any jobs processed, poll immediately when finished
timerDue = 0;
var job = new RunningJob
{
Key = j.jKey,
InputPackage = j.JobData.jdInputPackage,
DateStart = j.jDateStart.Value,
CancellationSource = new CancellationTokenSource()
};
// Add running job to the workload monitor
runningJobs.AddOrUpdate(
j.jKey,
job,
( key, info ) => job );
Task.Factory
.StartNew(
i =>
{
var t = (Tuple<Guid, CancellationToken>)i;
var key = t.Item1; // Job Key
// Running process would check if cancel has been requested
var token = t.Item2;
var totalProfilesProcess = 1;
try
{
eventLog.WriteEntry( $"Running job {key}" );
// All code in here completes the job.
// Will be a separate method per JobType.
// Any exceptions in here *MUST NOT*
// take down the service. Before allowing
// the exception to propagate back up
// into *this* try/catch, the code must
// successfully clean up any resources
// and state that was being modified so
// that the client who submitted this
// job is properly notified.
// This is just a simulation of running a
// job...so each job takes 10 seconds.
for ( var d = 0; d < 10; d++ )
{
// or could do await if I called Unwrap(),
// https://blogs.msdn.microsoft.com/pfxteam/2011/10/24/task-run-vs-task-factory-startnew/
Task.Delay( 1000 ).Wait();
totalProfilesProcess++;
if ( token.IsCancellationRequested )
{
// TODO: Clean up the job
throw new OperationCanceledException( token );
}
}
// Success
JobProvider.UpdateJobStatus( key, 2, totalProfilesProcess );
}
catch ( OperationCanceledException )
{
// Cancelled
JobProvider.UpdateJobStatus( key, 3, totalProfilesProcess );
throw;
}
catch ( Exception )
{
// Failed
JobProvider.UpdateJobStatus( key, 4, totalProfilesProcess );
throw;
}
},
// Pass cancellation token to job so it can watch cancel request
Tuple.Create( j.jKey, job.CancellationSource.Token ),
// associate cancellation token with Task started via StartNew()
job.CancellationSource.Token,
TaskCreationOptions.LongRunning,
TaskScheduler.Default
).ContinueWith(
( t, k ) =>
{
// When the job is finished, log the exception if present.
// Task.Exception will always be an AggregateException,
// so examine the underlying exception rather than
// the aggregate's generic message.
if ( !t.IsCanceled && t.IsFaulted )
{
eventLog.WriteEntry( $"Exception for {k}: {t.Exception.Message}", EventLogEntryType.Error );
}
eventLog.WriteEntry( $"Completed job {k}" );
// Remove running job from the workload monitor
RunningJob completedJob;
runningJobs.TryRemove(
(Guid)k, out completedJob );
},
j.jKey
);
}
}
catch ( Exception ex )
{
// If can't even launch job, disable the polling.
// TODO: Could have an error threshold where I don't
// shut down until X errors happen
eventLog.WriteEntry(
ex.Message + "\r\n\r\n" + ex.StackTrace,
EventLogEntryType.Error );
timer?.Dispose();
timer = null;
}
finally
{
// If the timer wasn't 'terminated' in OnPause or OnStop,
// then schedule the next poll. Copy the field to a local
// and ignore ObjectDisposedException in case OnPause/OnStop
// disposes the timer while this poll is still running.
var nextPoll = timer;
if ( nextPoll != null )
{
try { nextPoll.Change( timerDue, Timeout.Infinite ); }
catch ( ObjectDisposedException ) { }
}
}
}
}
class RunningJob
{
public Guid Key { get; set; }
public DateTime DateStart { get; set; }
public XElement InputPackage { get; set; }
public CancellationTokenSource CancellationSource { get; set; }
}