2

I have a list of predefined regex pattern strings (around 7,000 patterns, each used to group similar messages).

So I have two lists: one of regex patterns and another of actual messages, which contain some variable names.

I need to group all the similar messages and show the grouped messages. Currently I have to traverse all 7,000 regex patterns to group the similar items among 1,000 messages, so it takes m × n iterations (m patterns, n messages) to find the correct groups.

To reduce the processing time, I remove matched items from the list of messages, so each iteration only scans what is left, e.g. 1,000 − (items matched in the previous iterations).

It still takes far too long to process these two lists. To reduce the time, I have grouped them by message category type and process the groups in parallel tasks.

// Selects the knowledge-base entries whose ErrorType label equals the given name.
Func<string, List<KBError>> kbErrorsOfType = errorTypeName =>
    distinctKBErrors.Where(kbErr => kbErr.ErrorType == errorTypeName).ToList();

// One pattern list per severity label, so that messages are only matched
// against the patterns of their own category.
List<KBError> warningKBErrors = kbErrorsOfType("Warning");
List<KBError> fatalKBErrors = kbErrorsOfType("Fatal");
List<KBError> severeKBErrors = kbErrorsOfType("Severe");
List<KBError> cbeccErrorKBErrors = kbErrorsOfType("Error");

// Discard every error whose type is NOT part of the requested processing set;
// only the types flagged in processingErrorType are kept.
errors.RemoveAll(candidate => processingErrorType.HasFlag(candidate.ErrorType) == false);

// Selects the remaining errors of a single severity.
Func<ErrorType, List<Error>> errorsOfType = severity =>
    errors.Where(candidate => candidate.ErrorType == severity).ToList();

// One message list per severity, mirroring the knowledge-base pattern lists.
List<Error> warningErrors = errorsOfType(ErrorType.Warning);
List<Error> fatalErrors = errorsOfType(ErrorType.Fatal);
List<Error> severeErrors = errorsOfType(ErrorType.Severe);
List<Error> cbeccErrors = errorsOfType(ErrorType.Error);

After that, the messages are partitioned into equally sized subsets, and each subset is processed in its own parallel task.

// Single gate for every mutation of the shared `errors` list. FindDistinctErrorMessages is
// invoked from several tasks at the same time, so a lock object created inside the delegate
// (one per invocation) would not stop two invocations from modifying `errors` concurrently.
object errorsGate = new object();

// Groups the messages of one partition by the knowledge-base pattern they match.
//
// filteredKBErros : knowledge-base entries; each ErrorMessage is used as a regex pattern.
// filteredErros   : the messages of this partition that are tested against every pattern.
//
// Returns one Error per pattern that matched at least one message of the partition:
// the matched message itself when there is exactly one match, otherwise a new aggregate
// Error whose Errors list holds all matches. ErrorCount, ErrorCode, ErrorMessagePattern and
// the resolution text (via AddErrorResolutionMessage) are filled in from the pattern entry.
//
// Side effect: every matched message is removed from the shared `errors` list, so that only
// unmatched messages remain in it once all partitions have been processed.
//
// NOTE(review): a message that matches more than one pattern is reported under each of them;
// when it is a single match, the pattern processed last overwrites its ErrorCode and
// ErrorMessagePattern. Confirm whether patterns are expected to be mutually exclusive.
Func<List<KBError>, List<Error>, List<Error>> FindDistinctErrorMessages = (filteredKBErros, filteredErros) =>
{
    // Results of this invocation only; written exclusively while errorsGate is held.
    List<Error> groupedErrors = new List<Error>();

    System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew();

    Parallel.For(0, filteredKBErros.Count,
        // Per-worker buffer: collected without locking, merged once in the final delegate.
        () => new List<KeyValuePair<KBError, List<Error>>>(),
        (x, loopState, matchesByPattern) =>
        {
            KBError kbError = filteredKBErros[x];

            // Build the regex once per pattern instead of handing the pattern string to the
            // static Regex.IsMatch for every single message.
            Regex pattern = new Regex(kbError.ErrorMessage,
                System.Text.RegularExpressions.RegexOptions.IgnorePatternWhitespace);

            List<Error> matches = filteredErros
                .Where(error => pattern.IsMatch(error.ErrorMessage))
                .ToList();

            // Patterns without any match are irrelevant for the result, so they are not stored.
            if (matches.Count > 0)
            {
                matchesByPattern.Add(new KeyValuePair<KBError, List<Error>>(kbError, matches));
            }

            return matchesByPattern;
        },
        matchesByPattern =>
        {
            lock (errorsGate)
            {
                foreach (KeyValuePair<KBError, List<Error>> match in matchesByPattern)
                {
                    KBError kbError = match.Key;
                    List<Error> matches = match.Value;

                    Error error;
                    if (matches.Count == 1)
                    {
                        // A single match is reported as-is, without a wrapper.
                        error = matches[0];
                    }
                    else
                    {
                        // Several matches collapse into one aggregate that carries them as children.
                        error = new Error();
                        error.ErrorMessage = matches[0].ErrorMessage;
                        error.Errors = matches;
                        error.ErrorType = matches[0].ErrorType;
                    }

                    error.ErrorCount = matches.Count;
                    error.ErrorCode = kbError.ErrorCode;
                    AddErrorResolutionMessage(error, kbError);
                    error.ErrorMessagePattern = kbError.ErrorMessage;

                    groupedErrors.Add(error);

                    // Matched messages leave the shared list; what stays behind is unmatched.
                    foreach (Error matched in matches)
                    {
                        errors.Remove(matched);
                    }
                }
            }
        });

    sw.Stop();
    System.Diagnostics.Debug.WriteLine(string.Format("Completed in {0} seconds", sw.Elapsed.TotalSeconds));

    return groupedErrors;
};


// Narrow each knowledge-base pattern list against the messages of the same severity
// (the filtering itself is done by FilterKBList).
List<KBError> filteredWarningKBList = FilterKBList(warningKBErrors, warningErrors);
List<KBError> filteredSevereKBList = FilterKBList(severeKBErrors, severeErrors);
List<KBError> filteredFatalKBList = FilterKBList(fatalKBErrors, fatalErrors);
List<KBError> filteredcbeccErrorsKBList = FilterKBList(cbeccErrorKBErrors, cbeccErrors);


List<Task<List<Error>>> tasks = new List<Task<List<Error>>>();

// Queues the grouping work for one severity: the messages are cut into subsets with Split
// and every subset is handed to FindDistinctErrorMessages on its own task.
Action<ErrorType, List<KBError>, List<Error>> queuePartitions = (severity, kbPatterns, messages) =>
{
    // Nothing to schedule when the severity has no messages.
    if (messages.Count == 0)
    {
        return;
    }

    // Skip severities that were not requested.
    if (!processingErrorType.HasFlag(severity) && !processingErrorType.Equals(ErrorType.All))
    {
        return;
    }

    // A tenth of the message count, or 1 when there are fewer than ten messages.
    // NOTE(review): presumably Split treats this as the subset size - confirm against Split.
    int equalCounts = messages.Count < 10 ? 1 : messages.Count / 10;

    foreach (IEnumerable<Error> subSet in messages.Split(equalCounts))
    {
        tasks.Add(Task.Run<List<Error>>(() => FindDistinctErrorMessages(kbPatterns, subSet.ToList()), CancellationToken.None));
    }
};

// Same scheduling order as before: warning, severe, fatal, error.
queuePartitions(ErrorType.Warning, filteredWarningKBList, warningErrors);
queuePartitions(ErrorType.Severe, filteredSevereKBList, severeErrors);
queuePartitions(ErrorType.Fatal, filteredFatalKBList, fatalErrors);
queuePartitions(ErrorType.Error, filteredcbeccErrorsKBList, cbeccErrors);

After these tasks are started, they take a long time to complete. The wait statement for the created tasks somehow leaves the application in a hung state.

// Scoped block: keeps the temporaries below out of the enclosing scope.
{
    // Blocks the calling thread until every partition task has finished.
    // NOTE(review): if this runs on a UI thread the application will appear frozen
    // for the whole duration - confirm the calling context.
    Task.WaitAll(tasks.ToArray());

    // Concatenate the per-task results in task order and drop duplicates.
    List<Error> mergedResults = tasks
        .SelectMany(finishedTask => finishedTask.Result)
        .Distinct()
        .ToList();

    // Results produced by different partitions for the same pattern are folded into the
    // first entry of their group.
    foreach (var patternGroup in mergedResults.GroupBy(res => res.ErrorMessagePattern))
    {
        Error representative = patternGroup.First();
        representative.ErrorCount = patternGroup.Sum(member => member.ErrorCount);

        if (patternGroup.Count() > 1)
        {
            foreach (Error member in patternGroup)
            {
                if (member != representative)
                {
                    if (representative.Errors == null)
                    {
                        representative.Errors = new List<Error>();
                    }

                    member.ErrorCount = 1;

                    // Move the member's children under the representative; a member without
                    // children contributes only its count.
                    bool memberHasChildren = member.Errors != null && member.Errors.Count > 0;
                    if (memberHasChildren)
                    {
                        representative.Errors.AddRange(member.Errors);
                        member.Errors = null;
                    }
                }
            }
        }

        distinctErrors.Add(representative);
    }
}

// Every entry still present in `errors` is reported as a single occurrence; null is passed
// as the knowledge-base entry when its resolution message is filled in.
foreach (Error remaining in errors)
{
    remaining.ErrorCount = 1;
    AddErrorResolutionMessage(remaining, null);
    distinctErrors.Add(remaining);

    // Messages whose resolution came back as "Not Found" are collected in the log.
    bool resolutionMissing = remaining.PossibleResolution == "Not Found";
    if (resolutionMissing)
    {
        logMessage.AppendLine(remaining.ErrorMessage);
    }
}

Is there a better approach or algorithm that reduces the time needed to process these lists, and lowers the time complexity of the process, rather than processing m × n elements?

Niranjan Singh
  • 18,017
  • 2
  • 42
  • 75
  • 7000 regex patterns? Maybe you should try optimizing this part. – dymanoid Sep 04 '17 at 10:37
  • @dymanoid: Thanks for the suggestion, but I have already tried this and made the regexes as fast as we can. See my previous [question](https://stackoverflow.com/questions/45998447/regex-for-matching-variable-parts-in-then-string?noredirect=1#comment78961739_45998447). We do not own the client database of matching patterns, so we can only try to improve the process; we cannot ask the client to reduce the number of error messages. – Niranjan Singh Sep 04 '17 at 10:39
  • I would diagnose the performance of the parallel computation. Where does the CPU spends its time? Are all tasks completing at the same time? Are there regex which catch more errors than others? – Eyal Shulman Sep 04 '17 at 15:08
  • @EyalShulman: I have tried reducing the number of tasks that run simultaneously, but processing that many elements and evaluating the regexes still takes a lot of time. – Niranjan Singh Sep 06 '17 at 06:04

0 Answers0