-3

I'm playing with Levenshtein distance for getting a C# implementation which allows not only to tell whether two strings are similar, but also find a similar string (the needle) in a larger string (the haystack).

To this end, I tried to follow the advice at the bottom of this excellent post, but I'm getting some issues.

To start with, I adopted this implementation, changing it to fit my additional requirements. I also added some diagnostic dump support to let me understand the algorithm better, inspired by this other post.

My implementation returns an object with score and (when requested) index and length, and also a reference to the calculated matrix used for diagnostic purposes:

public class LevenshteinMatch
{
    public int Score { get; }
    public int Index { get; }
    public int Length { get; }
    public int[,] Matrix { get; set; }

    public LevenshteinMatch(int score, int index = 0, int length = 0)
    {
        Score = score;
        Index = index;
        Length = length;
    }

    public override string ToString()
    {
        return $"{Score} @{Index}x{Length}";
    }
}

Here is my implementation: the Distance method works "normally" if sub is false; otherwise, it finds a similar substring. DumpMatrix is just a diagnostic helper method.

public static class Levenshtein
{
    public static string DumpMatrix(int[,] d, string a, string b)
    {
        if (d == null) throw new ArgumentNullException(nameof(d));
        if (a == null) throw new ArgumentNullException(nameof(a));
        if (b == null) throw new ArgumentNullException(nameof(b));

        //      #  k  i  t  t  e  n  
        //      00 01 02 03 04 05 06 
        // # 00 .. .. .. .. .. .. .. 
        // s 01 .. .. .. .. .. .. .. 
        // ...etc (sitting)

        StringBuilder sb = new StringBuilder();
        int n = a.Length;
        int m = b.Length;

        // b-legend
        sb.Append("     #  ");
        for (int j = 0; j < m; j++) sb.Append(b[j]).Append("  ");
        sb.AppendLine();
        sb.Append("     00 ");
        for (int j = 1; j < m; j++) sb.AppendFormat("{0:00}", j).Append(' ');
        sb.AppendFormat("{0:00} ", m).AppendLine();

        // matrix
        for (int i = 0; i <= n; i++)
        {
            // a-legend
            if (i == 0)
            {
                sb.Append("# 00 ");
            }
            else
            {
                sb.Append(a[i - 1])
                  .Append(' ')
                  .AppendFormat("{0:00}", i)
                  .Append(' ');
            }

            // row of values
            for (int j = 0; j <= m; j++)
                sb.AppendFormat("{0,2} ", d[i, j]);
            sb.AppendLine();
        }
        return sb.ToString();
    }

    private static LevenshteinMatch BuildMatch(string a, string b, int[,] d)
    {
        int n = a.Length;
        int m = b.Length;

        // take the min rightmost score instead of the bottom-right corner
        int min = 0, rightMinIndex = -1;
        for (int j = m; j > -1; j--)
        {
            if (rightMinIndex == -1 || d[n, j] < min)
            {
                min = d[n, j];
                rightMinIndex = j;
            }
        }

        // corner case: perfect match, just collect m chars from score=0
        if (min == 0)
        {
            return new LevenshteinMatch(min,
                rightMinIndex - n,
                n);
        }

        // collect all the lowest scores on the bottom row leftwards,
        // up to the length of the needle
        int count = n, leftMinIndex = rightMinIndex;
        while (leftMinIndex > -1)
        {
            if (d[n, leftMinIndex] == min && --count == 0) break;
            leftMinIndex--;
        }

        return new LevenshteinMatch(min,
            leftMinIndex - 1,
            rightMinIndex + 1 - leftMinIndex);
    }

    public static LevenshteinMatch Distance(string a, string b,
        bool sub = false, bool withMatrix = false)
    {
        if (a is null) throw new ArgumentNullException(nameof(a));
        if (b == null) throw new ArgumentNullException(nameof(b));

        int n = a.Length;
        int m = b.Length;
        int[,] d = new int[n + 1, m + 1];

        if (n == 0) return new LevenshteinMatch(m);
        if (m == 0) return new LevenshteinMatch(n);

        for (int i = 0; i <= n; i++) d[i, 0] = i;
        // if matching substring, leave the top row to 0
        if (!sub)
        {
            for (int j = 0; j <= m; j++) d[0, j] = j;
        }

        for (int j = 1; j <= m; j++)
        {
            for (int i = 1; i <= n; i++)
            {
                if (a[i - 1] == b[j - 1])
                {
                    d[i, j] = d[i - 1, j - 1];  // no operation
                }
                else
                {
                    d[i, j] = Math.Min(Math.Min(
                        d[i - 1, j] + 1,    // a deletion
                        d[i, j - 1] + 1),   // an insertion
                        d[i - 1, j - 1] + 1 // a substitution
                        );
                }
            }
        }

        LevenshteinMatch match = sub
            ? BuildMatch(a, b, d)
            : new LevenshteinMatch(d[n, m]);
        if (withMatrix) match.Matrix = d;
        return match;
    }
}

To be more complete, here is the demo console program using it. This just prompts the user for the matching mode (substring or not) and the two strings, then calls the Distance method, dumps the resulting matrix, and shows the substring if required.

internal static class Program
{
    private static string ReadLine(string defaultLine)
    {
        string s = Console.ReadLine();
        return string.IsNullOrEmpty(s) ? defaultLine ?? s : s;
    }

    private static void Main()
    {
        Console.WriteLine("Fuzzy Levenshtein Matcher");

        string a = "sitting", b = "kitten";
        bool sub = false;
        LevenshteinMatch match;

        while (true)
        {
            Console.Write("sub [y/n]? ");
            string yn = Console.ReadLine();
            if (!string.IsNullOrEmpty(yn)) sub = yn == "y" || yn == "Y";

            Console.Write(sub? $"needle ({a}): " : $"A ({a}): ");
            a = ReadLine(a);

            Console.Write(sub? $"haystack ({b}): " : $"B ({b}): ");
            b = ReadLine(b);

            match = Levenshtein.Distance(a, b, sub, true);
            Console.WriteLine($"{a} - {b}: {match}");
            Console.WriteLine(Levenshtein.DumpMatrix(match.Matrix, a, b));
            if (sub) Console.WriteLine(b.Substring(match.Index, match.Length));
        }
    }
}

Now, for substring matches this works in a case like "aba" in "c abba c". Here is the matrix:

aba - c abba c: 1 @3x3
     #  c     a  b  b  a     c
     00 01 02 03 04 05 06 07 08
# 00  0  0  0  0  0  0  0  0  0
a 01  1  1  1  0  1  1  0  1  1
b 02  2  2  2  1  0  1  1  1  2
a 03  3  3  3  2  1  1  1  2  2

Yet, in other cases, e.g. "abas" in "ego sum abbas Cucaniensis", I fail to collect the min scores from the bottom row:

abas - ego sum abbas Cucaniensis: 1 @-2x15
     #  e  g  o     s  u  m     a  b  b  a  s     C  u  c  a  n  i  e  n  s  i  s
     00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
# 00  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
a 01  1  1  1  1  1  1  1  1  1  0  1  1  0  1  1  1  1  1  0  1  1  1  1  1  1  1
b 02  2  2  2  2  2  2  2  2  2  1  0  1  1  1  2  2  2  2  1  1  2  2  2  2  2  2
a 03  3  3  3  3  3  3  3  3  3  2  1  1  1  2  2  3  3  3  2  2  2  3  3  3  3  3
s 04  4  4  4  4  4  3  4  4  4  3  2  2  2  1  2  3  4  4  3  3  3  3  4  3  4  3

Here there is just a single score=1 in the bottom row. In the case of a perfect match (score=0) my code just takes the left N-characters (where N is the length of the needle) from the rightmost lowest score; but here I have scores greater than 0. Probably I've just misinterpreted the hints in the above post, as I'm new to the interals of this algorithm. Could anyone suggest the correct way of finding the needle's index and length in the haystack?

Naftis
  • 4,393
  • 7
  • 63
  • 91
  • 1
    What you've essentially done is ask us to debug your code for you, which is not what Stack Overflow is for. – Ian Kemp Aug 22 '20 at 22:02
  • Sorry if I gave this impression, but I'm asking about the algorithm. From the algorithm perspective, does finding the needle index equate to collecting all the minimum scores in the bottom row, stopping when the length of the needle is reached? Or I misunderstood the hint? I added the implementation only for the sake of completeness and eventually to help others. – Naftis Aug 22 '20 at 22:14

1 Answers1

0

You start at the best score in the bottom row: the 1 at (13,4)

Then you find look at the predecessor states and transitions that could have got you there:

  • (12,4) - not possible, because it has a higher difference
  • (13,3) - not possible, because it has a higher difference
  • (12,3) - same difference and the characters match, so this works

From (12,3) you follow the same procedure to get to (11,2) and then (10,1)

At (10,1) the letters don't match, so you couldn't have come from (9,0). You could use either (10,0) for the similar string "bas", or you could use (9,1) then (8,0) for the similar string "abbas", both with distance 1.

Matt Timmermans
  • 53,709
  • 3
  • 46
  • 87
  • Thanks Matt, this was just what I asked for! Now I can better grasp the ratio behind the backtracking. I've modified my code to reflect this kind of logic, but I'm not posting here for brevity. Should anyone be interested, I can create a GIT repo from my test. – Naftis Aug 23 '20 at 19:35