Scope:
- I want to process a large file (1 GB+) by splitting it into smaller (manageable) chunks (partitions), persist them on some storage infrastructure (local disk, blob, network, etc.) and process them one by one, in memory.
- I want to achieve this by leveraging the TPL Dataflow library and I've created several processing blocks, each of them performing a specific action, on a in-memory file partition.
- Further on, I'm using a SemaphoreSlim object to limit to max number of in-memory partitions being processed at a given time, until it is loaded and fully processed.
- I'm also using the MaxDegreeOfParallelism configuration attribute at block level to limit the degree of parallelism for each block.
From a technical perspective, the scope is to limit the processing of multiple partitions in parallel, across several continuous pipeline steps, by using a Semaphore, thus avoiding overloading the memory.
Issue description: When MaxDegreeOfParallelism is set to a value greater than 1 for all Dataflow blocks except the first one, the process hangs and seems that it reaches a deadlock. When MaxDegreeOfParallelism is set to 1, everything works as expected. Code sample below...
Do you have any idea/hint/tip why this happens?
using System;
using System.Collections.Generic;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
using System.Threading.Tasks.Dataflow;
namespace DemoConsole
{
class Program
{
private static readonly SemaphoreSlim _localSemaphore = new(1);
static async Task Main(string[] args)
{
Console.WriteLine("Configuring pipeline...");
var dataflowLinkOptions = new DataflowLinkOptions() { PropagateCompletion = true };
var filter1 = new TransformManyBlock<string, PartitionInfo>(CreatePartitionsAsync, new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = 1 });
// when MaxDegreeOfParallelism on the below line is set to 1, everything works as expected; any value greater than 1 causes issues
var blockOptions = new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = 5 };
var filter2 = new TransformBlock<PartitionInfo, PartitionInfo>(ReadPartitionAsync, blockOptions);
var filter3 = new TransformBlock<PartitionInfo, PartitionInfo>(MapPartitionAsync, blockOptions);
var filter4 = new TransformBlock<PartitionInfo, PartitionInfo>(ValidatePartitionAsync, blockOptions);
var actionBlock = new ActionBlock<PartitionInfo>(async (x) => { await Task.CompletedTask; });
filter1.LinkTo(filter2, dataflowLinkOptions);
filter2.LinkTo(filter3, dataflowLinkOptions);
filter3.LinkTo(filter4, dataflowLinkOptions);
filter4.LinkTo(actionBlock, dataflowLinkOptions);
await filter1.SendAsync("my-file.csv");
filter1.Complete();
await actionBlock.Completion;
Console.WriteLine("Pipeline completed.");
Console.ReadKey();
Console.WriteLine("Done");
}
private static async Task<IEnumerable<PartitionInfo>> CreatePartitionsAsync(string input)
{
var partitions = new List<PartitionInfo>();
const int noOfPartitions = 10;
Log($"Creating {noOfPartitions} partitions from raw file on Thread [{Thread.CurrentThread.ManagedThreadId}] ...");
for (short i = 1; i <= noOfPartitions; i++)
{
partitions.Add(new PartitionInfo { FileName = $"{Path.GetFileNameWithoutExtension(input)}-p{i}-raw.json", Current = i });
}
await Task.CompletedTask;
Log($"Creating {noOfPartitions} partitions from raw file completed on Thread [{Thread.CurrentThread.ManagedThreadId}].");
return partitions;
}
private static async Task<PartitionInfo> ReadPartitionAsync(PartitionInfo input)
{
Log($"Sempahore - trying to enter for partition [{input.Current}] - Current count is [{_localSemaphore.CurrentCount}]; client thread [{Thread.CurrentThread.ManagedThreadId}]");
await _localSemaphore.WaitAsync();
Log($"Sempahore - entered for partition [{input.Current}] - Current count is [{_localSemaphore.CurrentCount}]; client thread [{Thread.CurrentThread.ManagedThreadId}]");
Log($"Reading partition [{input.Current}] on Thread [{Thread.CurrentThread.ManagedThreadId}] ...");
await Task.Delay(1000);
Log($"Reading partition [{input.Current}] completed on Thread [{Thread.CurrentThread.ManagedThreadId}].");
return input;
}
private static async Task<PartitionInfo> MapPartitionAsync(PartitionInfo input)
{
Log($"Mapping partition [{input.Current}] on Thread [{Thread.CurrentThread.ManagedThreadId}] ...");
await Task.Delay(1000);
Log($"Mapping partition [{input.Current}] completed on Thread [{Thread.CurrentThread.ManagedThreadId}].");
return input;
}
private static async Task<PartitionInfo> ValidatePartitionAsync(PartitionInfo input)
{
Log($"Validating partition [{input.Current}] on Thread [{Thread.CurrentThread.ManagedThreadId}] ...");
await Task.Delay(1000);
Log($"Validating partition [{input.Current}] completed on Thread [{Thread.CurrentThread.ManagedThreadId}].");
Log($"Sempahore - releasing - Current count is [{_localSemaphore.CurrentCount}]; client thread [{Thread.CurrentThread.ManagedThreadId}]");
_localSemaphore.Release();
Log($"Sempahore - released - Current count is [{_localSemaphore.CurrentCount}]; client thread [{Thread.CurrentThread.ManagedThreadId}]");
return input;
}
private static void Log(string message) => Console.WriteLine($"{DateTime.Now:HH:mm:ss.fff} : {message}");
}
class PartitionInfo
{
public string FileName { get; set; }
public short Current { get; set; }
}
}