I have a requirement to upload large CSV files and process each record of the CSV by sending it to a third party.
The third party can take minutes to process a record, and there is a limit on how many requests it can handle concurrently, so I used Durable Functions for this.
Here's the flow I'm using
This worked fine for a while, and smaller files are still processed correctly, but when I upload large files they are not getting processed. In the logs I observed the following:
2023-08-13T14:01:29Z [Verbose] Blob 'file_15000-3' will be skipped for function
'FileUploadBlobTrigger' because this blob with ETag '"0x8DB9C053CC8E3D8"' has already
been processed. PollId: '64b8b6d2-5d7a-4904-b9bc-8b2258f02ab9'. Source: 'ContainerScan'.
Blob trigger
public class FileUploadBlobTrigger
{
    // NOTE(review): this was declared as "IfileService" while BatchProcess below
    // uses "IFileService". C# is case-sensitive, so only one of these spellings can
    // exist — standardized on the conventional I-prefixed PascalCase name.
    private readonly IFileService _fileService;

    public FileUploadBlobTrigger(IFileService fileService)
    {
        _fileService = fileService;
    }

    /// <summary>
    /// Blob-triggered entry point: fires when a blob lands in the configured
    /// container and starts one <see cref="BatchProcess"/> orchestration for it.
    /// </summary>
    /// <param name="myBlob">Read stream over the uploaded blob's content.</param>
    /// <param name="name">The blob name, bound from the {name} path segment.</param>
    /// <param name="log">Function logger.</param>
    /// <param name="starter">Durable client used to schedule the orchestration.</param>
    [FunctionName(nameof(FileUploadBlobTrigger))]
    public async Task Run([BlobTrigger("%My:Container%/{name}", Connection = "BlobConnectionString")] Stream myBlob,
        string name, ILogger log,
        [DurableClient] IDurableOrchestrationClient starter)
    {
        // Business logic to get dataModel

        // StartNewAsync only schedules the orchestration and returns immediately;
        // capture the instance id so the run can be correlated with the blob in logs.
        var instanceId = await starter.StartNewAsync(nameof(BatchProcess), dataModel);
        log.LogInformation("Started orchestration {InstanceId} for blob {BlobName}.", instanceId, name);
    }
}
The blob trigger starts the following orchestrator:
public class BatchProcess
{
    private readonly BlobTriggerConfiguration _blobTriggerConfiguration;
    private readonly IFileService _fileService;
    private readonly ILogger<BatchProcess> _logger;

    public BatchProcess(IOptions<BlobTriggerConfiguration> blobTriggerConfiguration, IFileService fileService, ILogger<BatchProcess> logger)
    {
        _blobTriggerConfiguration = blobTriggerConfiguration.Value;
        _fileService = fileService;
        _logger = logger;
    }

    /// <summary>
    /// Orchestrator: splits the uploaded file's records into batches and sends
    /// them to the "Batch" activity one at a time — the sequential await means
    /// at most one in-flight activity per orchestration instance, which honours
    /// the third party's concurrency limit.
    /// </summary>
    [FunctionName(nameof(BatchProcess))]
    public async Task Run(
        [OrchestrationTrigger] IDurableOrchestrationContext context)
    {
        // BUG(review): the original body referenced `input` without defining it.
        // The payload passed to StartNewAsync must be read back via GetInput<T>().
        // TODO confirm the generic type here matches what the trigger actually sends.
        var input = context.GetInput<MatchBatchModel>();

        // NOTE(review): `batchList` was undefined in the posted snippet — its
        // derivation from the input was elided by the author.
        foreach (var batch in batchList)
        {
            // NOTE(review): the "Batch" activity declares a BatchModel parameter but
            // receives a MatchBatchModel here — verify these types are compatible
            // (Durable Functions serializes/deserializes activity inputs via JSON).
            var batchModel = new MatchBatchModel
            { batches = batch, Name = input.Name };

            var batchResponse =
                await context.CallActivityAsync<BatchResponses>("Batch", batchModel);
            // Combine with another model
        }
    }
}
/// <summary>
/// Activity function: sends one batch of CSV records to the third party and
/// returns its responses. This call may take minutes; the durable runtime
/// checkpoints around activities, so long-running work is safe here.
/// </summary>
/// <param name="batchModel">The deserialized batch payload for this activity.
/// Renamed from "context" — it is the activity input, not an orchestration
/// context, and activity inputs bind by serialization, not parameter name.</param>
/// <returns>The third party's responses for this batch.</returns>
[FunctionName("Batch")]
public async Task<BatchResponses> Batch(
    [ActivityTrigger] BatchModel batchModel)
{
    // Process file with the 3rd party
    return batchResponse;
}
For this I used the following settings in host.json:
"extensions": {
"durableTask": {
"maxConcurrentActivityFunctions": 1,
"maxConcurrentOrchestratorFunctions": 1
},
"blobs": {
"maxDegreeOfParallelism": 1
}
}