What i tried to do is, create a fuzzy lookup algortihm that helps us to match these unique records with our mapping table by and shows us the percentage. I am applying Levenshtein algorithm to find the matches which is well known algorithm for fuzzy lookup. It is basically find the distance between two sources by inserting, deleting and swaping unmatched characters between two strings. But Levenshtein comes its own structure problem. For small length texts, Levenshtein works not so good. For example KLM and klm royal dutch airlines are same thing but because of unsufficient length of first text, Levenshtein matches KLM with Air Moldova. So, i figured it out, i need a logic to combine Levenshtein with Syllable match and substring match. I would be very happy if you guys help me. As below, you can see my code which is currently used at my addin.
' ------------------------------------‘
'Before here, i create arrays from ranges and all arrays are two dimensional arrays. Match and unmatched values are arr1 and arr2 at below code
For m = LBound(arr1, 1) To UBound(arr1, 1)
aresult = 0
qnumber = 0
For n = LBound(arr2, 1) To UBound(arr2, 1)
qnumber = qnumber + 1
a = Fuzzy(CStr(arr1(m, 1)), CStr(arr2(n, 1)))
If a > aresult Then
aresult = a
qresult = qnumber
End If
Next n
If aresult = 0 And qresult = 0 Then
arr3(m, 1) = CVErr(xlErrNA)
arr4(m, 1) = CVErr(xlErrNA)
Else
arr3(m, 1) = arr2(qresult, qnum)
arr4(m, 1) = "%" & Round(aresult * 100, 0)
End If
Next m
Private Function Fuzzy(ByVal s1 As String, ByVal s2 As String) As Single
Dim i As Integer, j As Integer, k As Integer, d1 As Integer, d2 As Integer, p As Integer
Dim c As String, a1 As String, a2 As String, f As Single, o As Single, w As Single
' ******* INPUT STRINGS CLEANSING *******
s1 = UCase(s1) 'input strings are converted to uppercase
d1 = Len(s1)
j = 1
For i = 1 To d1
c = Mid(s1, i, 1)
Select Case c
Case "0" To "9", "A" To "Z" 'filter the allowable characters
a1 = a1 & c 'a1 is what remains from s1 after filtering
j = j + 1
End Select
Next
If j = 1 Then Exit Function 'if s1 is empty after filtering
d1 = j - 1
s2 = UCase(s2)
d2 = Len(s2)
j = 1
For i = 1 To d2
c = Mid(s2, i, 1)
Select Case c
Case "0" To "9", "A" To "Z"
a2 = a2 & c
j = j + 1
End Select
Next
If j = 1 Then Exit Function
d2 = j - 1
k = d1
If d2 < d1 Then 'to prevent doubling the code below s1 must be made the shortest string, so we swap the variables
k = d2
d2 = d1
d1 = k
s1 = a2
s2 = a1
a1 = s1
a2 = s2
Else
s1 = a1
s2 = a2
End If
If k = 1 Then 'degenerate case, where the shortest string is just one character
If InStr(1, s2, s1, vbBinaryCompare) > 0 Then
Fuzzy = 1 / d2
Else
Fuzzy = 0
End If
Else '******* MAIN LOGIC HERE *******
i = 1
f = 0
o = 0
Do 'count the identical characters in s1 and s2 ("frequency analysis")
p = InStr(1, s2, Mid(s1, i, 1), vbBinaryCompare) 'search the character at position i from s1 in s2
If p > 0 Then 'found a matching character, at position p in s2
f = f + 1 'increment the frequency counter
s2 = Left(s2, p - 1) & "~" & Mid(s2, p + 1)
Do
If i >= k Then Exit Do 'no more characters to search
If Mid(s2, p + 1, 1) = Mid(s1, i + 1, 1) Then 'test if the next character is the same in the two strings
f = f + 1 'increment the frequency counter
o = o + 1 'increment the order counter
i = i + 1
p = p + 1
Else
Exit Do
End If
Loop
End If
If i >= k Then Exit Do
i = i + 1
Loop
If o > 0 Then o = o + 1
finish:
w = 2
Fuzzy = (w * o + f) / (w + 1) / d2
End If
End Function