
I have a service that uploads a file, parses it line by line into a DataTable, and writes it to the DB via SQL bulk copy. Locally this service works fine, completing in 20 secs overall, and on the LAN dev server it takes a little longer, 46 secs. But when I run it on the test server, the page loads for almost a minute and finally gives '504 Gateway Time-out. The server didn't respond in time', although the SQL table is still updated. If I upload less than half of the file, it works fine everywhere. I am getting this error only with the heavier file (484,613 lines). Here is the code that holds the whole logic:

public int UploadCardBins(string cardBins, out string rows, out List<string> mismatchedRows, out int fileLines)
    {
        mismatchedRows = new List<string>();
        fileLines = 0;            
        rows = null;
        int resultCode = (int)ResultCode.Ok;
        bool timeParsed = int.TryParse(ConfigurationManager.AppSettings["UploadCardBinSqlTimeOut"], out int timeOut);

        try 
        {                
            DataTable table = RetrieveCardBinFromTxtFile(cardBins, out mismatchedRows, out fileLines);               
            
            rows = table.Rows.Count.ToString();              
           
            string sql = ConfigurationManager.ConnectionStrings["DbConnection"].ConnectionString;

            using (var connection = new SqlConnection(sql))
            {
                connection.Open();                    
                SqlTransaction transaction = connection.BeginTransaction();                    
                using (var bulkCopy = new SqlBulkCopy(connection, SqlBulkCopyOptions.Default, transaction))
                {
                    bulkCopy.BatchSize = table.Rows.Count;
                    bulkCopy.DestinationTableName = "dbo.Dicts_CardBin";
                    try
                    {                                                      
                        var command = connection.CreateCommand();
                        if(timeParsed)
                            command.CommandTimeout = timeOut;                            
                        command.CommandText = "delete from dbo.Dicts_CardBin";
                        command.Transaction = transaction;
                        command.ExecuteNonQuery();                            
                        bulkCopy.WriteToServer(table);
                        Session.Flush();
                        transaction.Commit();
                    }
                    catch (Exception ex)
                    {
                        transaction.Rollback();                            
                        logger.Error("[{0}] {1}", ex.GetType(), ex);
                        resultCode = (int)ResultCode.GenericError;
                    }
                    finally
                    {
                        transaction.Dispose();                            
                        connection.Close();
                    }
                }
            }                
        }
        catch (Exception ex)
        {
            logger.Error("[{0}] {1}", ex.GetType(), ex);
            resultCode = (int)ResultCode.GenericError;
        }            
        return resultCode;
    }

The private method that parses the text into a DataTable:

private DataTable RetrieveCardBinFromTxtFile(string cardBins, out List<string> mismatchedLines, out int countedLines)
    {
        countedLines = 0;
        mismatchedLines = new List<string>();
        DataTable table = new DataTable();
        string pattern = @"^(?!.*[-\\/_+&!@#$%^&.,*={}();:?""])(\d.{8})\s\s\s(\d.{8})\s.{21}(\D\D)";
        MatchCollection matches = Regex.Matches(cardBins, pattern, RegexOptions.Multiline);

        table.Columns.Add("lk", typeof(string));
        table.Columns.Add("hk", typeof(string));
        table.Columns.Add("cb", typeof(string));

        // Remove empty lines at the end of the file
        string[] lines = cardBins.Split(new[] { Environment.NewLine }, StringSplitOptions.None);
        int lastIndex = lines.Length - 1;
        while (lastIndex >= 0 && string.IsNullOrWhiteSpace(lines[lastIndex]))
        {
            lastIndex--;
        }
        Array.Resize(ref lines, lastIndex + 1);

        // Check for lines that do not match the pattern
        for (int i = 0; i < lines.Length; i++)
        {
            string line = lines[i];
            if (!Regex.IsMatch(line, pattern))
            {
                mismatchedLines.Add($"Строка {i + 1} not matching: {line}");
            }
            countedLines++;
        }

        foreach (Match match in matches)
        {
            DataRow row = table.NewRow();

            row["lk"] = match.Groups[2].Value.Trim();
            row["hk"] = match.Groups[1].Value.Trim();
            row["cb"] = match.Groups[3].Value.Trim();

            table.Rows.Add(row);
        }

        return table;
    }
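
For reference, here is a sketch of the single-pass rewrite mentioned in the update further down: one loop does both the mismatch check and the row extraction, and the Regex is compiled once instead of being re-parsed per line. It keeps the original pattern and columns; the method name is illustrative, and per-line matching makes the Multiline option unnecessary.

private DataTable RetrieveCardBinSinglePass(string cardBins, out List<string> mismatchedLines, out int countedLines)
    {
        mismatchedLines = new List<string>();
        countedLines = 0;

        DataTable table = new DataTable();
        table.Columns.Add("lk", typeof(string));
        table.Columns.Add("hk", typeof(string));
        table.Columns.Add("cb", typeof(string));

        // Compile the pattern once instead of re-parsing it for every line
        var regex = new Regex(
            @"^(?!.*[-\\/_+&!@#$%^&.,*={}();:?""])(\d.{8})\s\s\s(\d.{8})\s.{21}(\D\D)",
            RegexOptions.Compiled);

        string[] lines = cardBins.Split(new[] { Environment.NewLine }, StringSplitOptions.None);

        // Skip trailing blank lines without resizing the array
        int count = lines.Length;
        while (count > 0 && string.IsNullOrWhiteSpace(lines[count - 1]))
            count--;

        for (int i = 0; i < count; i++)
        {
            Match match = regex.Match(lines[i]);
            if (match.Success)
            {
                DataRow row = table.NewRow();
                row["lk"] = match.Groups[2].Value.Trim();
                row["hk"] = match.Groups[1].Value.Trim();
                row["cb"] = match.Groups[3].Value.Trim();
                table.Rows.Add(row);
            }
            else
            {
                mismatchedLines.Add($"Line {i + 1} does not match: {lines[i]}");
            }
            countedLines++;
        }

        return table;
    }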

What I tried (I thought it might be SQL-related):

I set the connection timeout in the app settings to 120 seconds (2 minutes), but I'm still getting the same error. Note: the .NET Framework version is 4.7.

Edited following the advice in the comments: I tried removing the try/catch, and it threw a transport exception. I tried passing a Stream, but it kept throwing read/write timeout exceptions at the controller layer, so I converted the stream to a string, since a string can get through to the implementation layer; I've updated the question accordingly. Instead of several memory allocations, I now have a single cardBins reference with no copies. Still, the issue remains.

What could be done on my side (source logic, DB config, better tools for bulk copies)? By the way, threads take the same amount of time; I tried that.

So far I have cleared out the duplicates, changed the DELETE to TRUNCATE TABLE, and replaced the two loops in the DataTable-retrieval method with one. Now it works in 13 secs instead of the previous 22 secs, but the test server still returns the 504. Thank you for all of your interest and answers.

Nara Omur
  • I'd suggest using [CsvDataReader](https://joshclose.github.io/CsvHelper/examples/csvdatareader/) to read the data through an IDataReader interface and pass that to SqlBulkCopy. Right now you're creating 2 extra copies of the entire data in memory. One copy in the string returned by `reader.ReadToEnd()` and 1 copy in the DataTable. If `cardBins` comes from a file or request stream you could pass the stream directly to CsvDataReader, although with large files it's always better to store them to a temporary file first, not keep them in memory. [A sketch of this follows after these comments.] – Panagiotis Kanavos Jul 20 '23 at 08:46
  • If you *don't* load everything in memory, you'll be able to import the data much faster, keeping only a single batch of rows in memory at any time. Unless you enable streaming or set a BatchSize, SqlBulkCopy will cache data in memory before sending it to the server – Panagiotis Kanavos Jul 20 '23 at 08:48
  • As far as the db load is concerned, I would do a couple of things differently. Firstly I would use a holding table in the database. That would enable you to issue a `TRUNCATE TABLE` instruction (considerably faster) rather than a `DELETE FROM`. You then do the bulk copy into the holding table. All of these operations can be done without the need for a transaction. Once you know that the data has been loaded, you can call a stored procedure which truncates your main table and copies the data from your holding table within a transaction. [A sketch of this staging-table approach follows after these comments.] – Jonathan Willcock Jul 20 '23 at 08:52
  • The problem here isn't the *connection* timeout but the command timeout and the *request* timeout. Web requests aren't meant to take forever so web servers terminate requests that take too long. The 504 comes from the web server itself, or a proxy in front of it – Panagiotis Kanavos Jul 20 '23 at 09:11
  • @PanagiotisKanavos So the garbage collector doesn't help much, then, if the cause is the copies in memory? If so, how could I clear memory manually? – Nara Omur Jul 20 '23 at 09:40
  • You should avoid generating duplicates in the first place. You can't clear up things you're actively using. *Allocating* space for 1.2M rows instead of 400K is bad enough already and causes delays – Panagiotis Kanavos Jul 20 '23 at 09:41
  • As for executing long jobs, you [need to take special care](https://www.hanselman.com/blog/how-to-run-background-tasks-in-aspnet) so the server doesn't timeout and doesn't kill the job. In ASP.NET Core you can use a BackgroundService. In ASP.NET you can use QueueBackgroundWorkItem. [A sketch follows after these comments.] – Panagiotis Kanavos Jul 20 '23 at 09:44
  • @JonathanWillcock Why use a temp table? It seems like you could just truncate the main table directly. As long as it's in a transaction it can be rolled back. – Charlieface Jul 20 '23 at 12:43
  • @PanagiotisKanavos My problem is that we don't have storage for files, so we need to pass them via memory. Instead of bulkCopy.WriteToServer(table); I now use bulkCopy.WriteToServer(table.CreateDataReader()); and performance is 480k rows in 10 secs. But on the test server it's the same 504 error – Nara Omur Jul 20 '23 at 12:49
  • Reading about others' issues handling millions of records in 50 secs, I doubt that 480k in 10 secs is good enough, especially since overall it takes 17 secs. – Nara Omur Jul 20 '23 at 12:56
  • @Charlieface I often use a temp table in this sort of situation, if I know that the upload takes a long time. It means that a transaction does not need to be held open for that time. In the past I had performance problems when doing a large bulk copy within a transaction, which disappeared when I removed the transaction. But that was several versions ago, and I have not tried recently. I am afraid I tend to get stuck in ways that I know work! – Jonathan Willcock Jul 20 '23 at 15:02
  • @JonathanWillcock Could possibly do the `ALTER TABLE SWITCH` trick then. So instead of a temp table, use a normal staging table in the same database, same filegroup, with the same indexes. Load the staging table, then in a transaction you can truncate the main table and switch staging into it, which will be very fast as it's just a metadata move. – Charlieface Jul 20 '23 at 15:44
  • @Charlieface Yes indeed. I often do that! – Jonathan Willcock Jul 20 '23 at 17:01
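
A minimal sketch of the IDataReader route suggested in the comments above, assuming the CsvHelper NuGet package and a delimited input; the file here is fixed-width, so in practice you would either configure CsvHelper for it or implement IDataReader over the existing line parser. The method name is illustrative:

// Requires the CsvHelper NuGet package (CsvReader, CsvDataReader)
public static void BulkLoadFromStream(Stream input, string connectionString)
{
    using (var reader = new StreamReader(input))
    using (var csv = new CsvReader(reader, CultureInfo.InvariantCulture))
    using (var dataReader = new CsvDataReader(csv))
    using (var connection = new SqlConnection(connectionString))
    {
        connection.Open();
        using (var bulkCopy = new SqlBulkCopy(connection))
        {
            bulkCopy.DestinationTableName = "dbo.Dicts_CardBin";
            bulkCopy.EnableStreaming = true; // stream rows through instead of buffering them all
            bulkCopy.BatchSize = 10000;      // rows sent to the server per batch
            bulkCopy.WriteToServer(dataReader);
        }
    }
}

With this shape, no full copy of the file ever sits in a string or a DataTable; rows flow from the stream to the server one batch at a time.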
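
And a sketch of the staging-table variant from the comments: the bulk load runs outside any transaction, and only the near-instant swap is transactional. It assumes a hypothetical dbo.Dicts_CardBin_Staging table with the same schema, indexes, and filegroup as dbo.Dicts_CardBin:

public static void LoadViaStagingTable(string connectionString, DataTable table)
{
    using (var connection = new SqlConnection(connectionString))
    {
        connection.Open();

        // Empty the staging table and bulk load into it; no transaction needed yet
        using (var clear = new SqlCommand("TRUNCATE TABLE dbo.Dicts_CardBin_Staging;", connection))
        {
            clear.ExecuteNonQuery();
        }
        using (var bulkCopy = new SqlBulkCopy(connection))
        {
            bulkCopy.DestinationTableName = "dbo.Dicts_CardBin_Staging";
            bulkCopy.WriteToServer(table);
        }

        // Only the swap runs inside a transaction; SWITCH is a metadata-only move
        using (var transaction = connection.BeginTransaction())
        using (var swap = connection.CreateCommand())
        {
            swap.Transaction = transaction;
            swap.CommandText =
                "TRUNCATE TABLE dbo.Dicts_CardBin; " +
                "ALTER TABLE dbo.Dicts_CardBin_Staging SWITCH TO dbo.Dicts_CardBin;";
            swap.ExecuteNonQuery();
            transaction.Commit();
        }
    }
}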
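
Finally, a sketch of moving the import off the request path with HostingEnvironment.QueueBackgroundWorkItem (System.Web.Hosting, ASP.NET 4.5.2+), so the controller can respond before the gateway timeout fires. StartImport is an illustrative wrapper around the existing UploadCardBins; reporting results back to the user would need a separate mechanism, such as polling a status record:

public void StartImport(string cardBins)
{
    // Runs outside the request; ASP.NET tracks the work item and delays
    // AppDomain shutdown for a grace period while it is still running
    HostingEnvironment.QueueBackgroundWorkItem(cancellationToken =>
    {
        UploadCardBins(cardBins, out string rows, out List<string> mismatched, out int fileLines);
    });
}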

1 Answer


You can significantly increase the performance in the following ways:

  • Change cardBins to be a Stream. You can get this using IFormFile if you want.
  • Change RetrieveCardBinFromTxtFile (which I can't see) to both accept a Stream, and return a DbDataReader. You may want to use either the CsvHelper or FastMember library.
  • Retrieve the mismatchedRows etc. values from this reader afterwards. You would need to make a custom property on your reader that could handle this.
  • Instead of delete from, use TRUNCATE TABLE.
  • Use async everywhere.
  • You don't need all that error handling. Just make sure to put using on your transaction and connection objects, and the rollback will happen automatically.
  • async functions cannot have out parameters. You need to change that to return a tuple instead.
public async Task<(int Result, List<string> mismatchedRows, int fileLines)> UploadCardBins(Stream cardBins)
{
    bool timeParsed = int.TryParse(ConfigurationManager.AppSettings["UploadCardBinSqlTimeOut"], out int timeOut);

    try 
    {
        using var reader = new StreamReader(cardBins, System.Text.Encoding.UTF8);
        using var table = RetrieveCardBinFromTxtFile(reader);
            
        string sql = ConfigurationManager.ConnectionStrings["DbConnection"].ConnectionString;

        await using var connection = new SqlConnection(sql);
        await connection.OpenAsync();
        await using var transaction = (SqlTransaction)await connection.BeginTransactionAsync();

        using (var command = new SqlCommand("TRUNCATE TABLE dbo.Dicts_CardBin;", connection, transaction))
        {
            if(timeParsed)
                command.CommandTimeout = timeOut;                            
            await command.ExecuteNonQueryAsync();
        }

        using var bulkCopy = new SqlBulkCopy(connection, SqlBulkCopyOptions.Default, transaction);
        bulkCopy.DestinationTableName = "dbo.Dicts_CardBin";
        await bulkCopy.WriteToServerAsync(table);
        Session.Flush();
        await transaction.CommitAsync();
        return ((int)ResultCode.Ok, table.MismatchedRows, table.FileLines);
    }
    catch (Exception ex)
    {
        logger.Error("[{0}] {1}", ex.GetType(), ex);
        return ((int)ResultCode.GenericError, null, -1);
    }            
}

Consider using CancellationToken as well.
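
For example, a minimal sketch threading a token through the same calls (this assumes Microsoft.Data.SqlClient, which the async transaction APIs above already imply; the token would typically come from the request or a CancellationTokenSource):

public async Task ImportAsync(string connectionString, DataTable table, CancellationToken token)
{
    await using var connection = new SqlConnection(connectionString);
    await connection.OpenAsync(token);
    await using var transaction = (SqlTransaction)await connection.BeginTransactionAsync(token);

    using (var command = new SqlCommand("TRUNCATE TABLE dbo.Dicts_CardBin;", connection, transaction))
    {
        await command.ExecuteNonQueryAsync(token);
    }

    using var bulkCopy = new SqlBulkCopy(connection, SqlBulkCopyOptions.Default, transaction);
    bulkCopy.DestinationTableName = "dbo.Dicts_CardBin";
    await bulkCopy.WriteToServerAsync(table, token);
    await transaction.CommitAsync(token);
}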

Charlieface
  • 'CS1988: Async methods cannot have ref or out parameters' now points to the out parameters (rows, mismatchedRows, fileLines); please see RetrieveCardBinFromTxtFile above as well: – Nara Omur Jul 21 '23 at 09:08
  • See new edits. – Charlieface Jul 21 '23 at 09:45
  • Maybe you are using a newer version, but if we open the connection and don't use try/catch, an exception thrown (if something goes wrong with the connection) won't be caught. And in finally the connection closes, so it doesn't stay open and hold memory for no reason. What do you think? – Nara Omur Jul 21 '23 at 12:25
  • The outer `try catch` will catch any error. The `using` will close the connection, and dispose all the readers, so the memory won't stay around long (obviously GC collection cannot be guaranteed, but the memory will be collectible). Either way, this shouldn't matter as it's a fully streaming solution, so memory requirements will not be very high. – Charlieface Jul 21 '23 at 12:27