I'm playing with Levenshtein distance for getting a C# implementation which allows not only to tell whether two strings are similar, but also find a similar string (the needle) in a larger string (the haystack).
To this end, I tried to follow the advice at the bottom of this excellent post, but I'm getting some issues.
To start with, I adopted this implementation, changing it to fit my additional requirements. I also added some diagnostic dump support to let me understand the algorithm better, inspired by this other post.
My implementation returns an object with score and (when requested) index and length, and also a reference to the calculated matrix used for diagnostic purposes:
public class LevenshteinMatch
{
public int Score { get; }
public int Index { get; }
public int Length { get; }
public int[,] Matrix { get; set; }
public LevenshteinMatch(int score, int index = 0, int length = 0)
{
Score = score;
Index = index;
Length = length;
}
public override string ToString()
{
return $"{Score} @{Index}x{Length}";
}
}
Here is my implementation: the Distance
method works "normally" if sub
is false; otherwise, it finds a similar substring. DumpMatrix
is just a diagnostic helper method.
public static class Levenshtein
{
public static string DumpMatrix(int[,] d, string a, string b)
{
if (d == null) throw new ArgumentNullException(nameof(d));
if (a == null) throw new ArgumentNullException(nameof(a));
if (b == null) throw new ArgumentNullException(nameof(b));
// # k i t t e n
// 00 01 02 03 04 05 06
// # 00 .. .. .. .. .. .. ..
// s 01 .. .. .. .. .. .. ..
// ...etc (sitting)
StringBuilder sb = new StringBuilder();
int n = a.Length;
int m = b.Length;
// b-legend
sb.Append(" # ");
for (int j = 0; j < m; j++) sb.Append(b[j]).Append(" ");
sb.AppendLine();
sb.Append(" 00 ");
for (int j = 1; j < m; j++) sb.AppendFormat("{0:00}", j).Append(' ');
sb.AppendFormat("{0:00} ", m).AppendLine();
// matrix
for (int i = 0; i <= n; i++)
{
// a-legend
if (i == 0)
{
sb.Append("# 00 ");
}
else
{
sb.Append(a[i - 1])
.Append(' ')
.AppendFormat("{0:00}", i)
.Append(' ');
}
// row of values
for (int j = 0; j <= m; j++)
sb.AppendFormat("{0,2} ", d[i, j]);
sb.AppendLine();
}
return sb.ToString();
}
private static LevenshteinMatch BuildMatch(string a, string b, int[,] d)
{
int n = a.Length;
int m = b.Length;
// take the min rightmost score instead of the bottom-right corner
int min = 0, rightMinIndex = -1;
for (int j = m; j > -1; j--)
{
if (rightMinIndex == -1 || d[n, j] < min)
{
min = d[n, j];
rightMinIndex = j;
}
}
// corner case: perfect match, just collect m chars from score=0
if (min == 0)
{
return new LevenshteinMatch(min,
rightMinIndex - n,
n);
}
// collect all the lowest scores on the bottom row leftwards,
// up to the length of the needle
int count = n, leftMinIndex = rightMinIndex;
while (leftMinIndex > -1)
{
if (d[n, leftMinIndex] == min && --count == 0) break;
leftMinIndex--;
}
return new LevenshteinMatch(min,
leftMinIndex - 1,
rightMinIndex + 1 - leftMinIndex);
}
public static LevenshteinMatch Distance(string a, string b,
bool sub = false, bool withMatrix = false)
{
if (a is null) throw new ArgumentNullException(nameof(a));
if (b == null) throw new ArgumentNullException(nameof(b));
int n = a.Length;
int m = b.Length;
int[,] d = new int[n + 1, m + 1];
if (n == 0) return new LevenshteinMatch(m);
if (m == 0) return new LevenshteinMatch(n);
for (int i = 0; i <= n; i++) d[i, 0] = i;
// if matching substring, leave the top row to 0
if (!sub)
{
for (int j = 0; j <= m; j++) d[0, j] = j;
}
for (int j = 1; j <= m; j++)
{
for (int i = 1; i <= n; i++)
{
if (a[i - 1] == b[j - 1])
{
d[i, j] = d[i - 1, j - 1]; // no operation
}
else
{
d[i, j] = Math.Min(Math.Min(
d[i - 1, j] + 1, // a deletion
d[i, j - 1] + 1), // an insertion
d[i - 1, j - 1] + 1 // a substitution
);
}
}
}
LevenshteinMatch match = sub
? BuildMatch(a, b, d)
: new LevenshteinMatch(d[n, m]);
if (withMatrix) match.Matrix = d;
return match;
}
}
To be more complete, here is the demo console program using it. This just prompts the user for the matching mode (substring or not) and the two strings, then calls the Distance
method, dumps the resulting matrix, and shows the substring if required.
internal static class Program
{
private static string ReadLine(string defaultLine)
{
string s = Console.ReadLine();
return string.IsNullOrEmpty(s) ? defaultLine ?? s : s;
}
private static void Main()
{
Console.WriteLine("Fuzzy Levenshtein Matcher");
string a = "sitting", b = "kitten";
bool sub = false;
LevenshteinMatch match;
while (true)
{
Console.Write("sub [y/n]? ");
string yn = Console.ReadLine();
if (!string.IsNullOrEmpty(yn)) sub = yn == "y" || yn == "Y";
Console.Write(sub? $"needle ({a}): " : $"A ({a}): ");
a = ReadLine(a);
Console.Write(sub? $"haystack ({b}): " : $"B ({b}): ");
b = ReadLine(b);
match = Levenshtein.Distance(a, b, sub, true);
Console.WriteLine($"{a} - {b}: {match}");
Console.WriteLine(Levenshtein.DumpMatrix(match.Matrix, a, b));
if (sub) Console.WriteLine(b.Substring(match.Index, match.Length));
}
}
}
Now, for substring matches this works in a case like "aba" in "c abba c". Here is the matrix:
aba - c abba c: 1 @3x3
# c a b b a c
00 01 02 03 04 05 06 07 08
# 00 0 0 0 0 0 0 0 0 0
a 01 1 1 1 0 1 1 0 1 1
b 02 2 2 2 1 0 1 1 1 2
a 03 3 3 3 2 1 1 1 2 2
Yet, in other cases, e.g. "abas" in "ego sum abbas Cucaniensis", I fail to collect the min scores from the bottom row:
abas - ego sum abbas Cucaniensis: 1 @-2x15
# e g o s u m a b b a s C u c a n i e n s i s
00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
# 00 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
a 01 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1
b 02 2 2 2 2 2 2 2 2 2 1 0 1 1 1 2 2 2 2 1 1 2 2 2 2 2 2
a 03 3 3 3 3 3 3 3 3 3 2 1 1 1 2 2 3 3 3 2 2 2 3 3 3 3 3
s 04 4 4 4 4 4 3 4 4 4 3 2 2 2 1 2 3 4 4 3 3 3 3 4 3 4 3
Here there is just a single score=1 in the bottom row. In the case of a perfect match (score=0) my code just takes the left N-characters (where N is the length of the needle) from the rightmost lowest score; but here I have scores greater than 0. Probably I've just misinterpreted the hints in the above post, as I'm new to the interals of this algorithm. Could anyone suggest the correct way of finding the needle's index and length in the haystack?