0

What i tried to do is, create a fuzzy lookup algortihm that helps us to match these unique records with our mapping table by and shows us the percentage. I am applying Levenshtein algorithm to find the matches which is well known algorithm for fuzzy lookup. It is basically find the distance between two sources by inserting, deleting and swaping unmatched characters between two strings. But Levenshtein comes its own structure problem. For small length texts, Levenshtein works not so good. For example KLM and klm royal dutch airlines are same thing but because of unsufficient length of first text, Levenshtein matches KLM with Air Moldova. So, i figured it out, i need a logic to combine Levenshtein with Syllable match and substring match. I would be very happy if you guys help me. As below, you can see my code which is currently used at my addin.

' ------------------------------------‘
'Before here, i create arrays from ranges and all arrays are two dimensional arrays. Match and unmatched values are arr1 and arr2 at below code
    For m = LBound(arr1, 1) To UBound(arr1, 1)
        aresult = 0
        qnumber = 0
        For n = LBound(arr2, 1) To UBound(arr2, 1)
            qnumber = qnumber + 1
            a = Fuzzy(CStr(arr1(m, 1)), CStr(arr2(n, 1)))
            If a > aresult Then
                aresult = a
                qresult = qnumber
            End If
        Next n
        If aresult = 0 And qresult = 0 Then
            arr3(m, 1) = CVErr(xlErrNA)
            arr4(m, 1) = CVErr(xlErrNA)
        Else
            arr3(m, 1) = arr2(qresult, qnum)
            arr4(m, 1) = "%" & Round(aresult * 100, 0)
        End If
    Next m
Private Function Fuzzy(ByVal s1 As String, ByVal s2 As String) As Single
    Dim i As Integer, j As Integer, k As Integer, d1 As Integer, d2 As Integer, p As Integer
    Dim c As String, a1 As String, a2 As String, f As Single, o As Single, w As Single
    ' ******* INPUT STRINGS CLEANSING *******
    s1 = UCase(s1) 'input strings are converted to uppercase
    d1 = Len(s1)
    j = 1
    For i = 1 To d1
        c = Mid(s1, i, 1)
        Select Case c
            Case "0" To "9", "A" To "Z" 'filter the allowable characters
                a1 = a1 & c 'a1 is what remains from s1 after filtering
                 j = j + 1
        End Select
    Next
    If j = 1 Then Exit Function 'if s1 is empty after filtering
        d1 = j - 1
        s2 = UCase(s2)
        d2 = Len(s2)
        j = 1
        For i = 1 To d2
            c = Mid(s2, i, 1)
            Select Case c
                Case "0" To "9", "A" To "Z"
                    a2 = a2 & c
                    j = j + 1
            End Select
        Next
        If j = 1 Then Exit Function
        d2 = j - 1
        k = d1
        If d2 < d1 Then 'to prevent doubling the code below s1 must be made the shortest string, so we swap the variables
            k = d2
            d2 = d1
            d1 = k
            s1 = a2
            s2 = a1
            a1 = s1
            a2 = s2
        Else
            s1 = a1
            s2 = a2
        End If
        If k = 1 Then 'degenerate case, where the shortest string is just one character
            If InStr(1, s2, s1, vbBinaryCompare) > 0 Then
                Fuzzy = 1 / d2
            Else
                Fuzzy = 0
            End If
        Else '******* MAIN LOGIC HERE *******
            i = 1
            f = 0
            o = 0
            Do 'count the identical characters in s1 and s2 ("frequency analysis")
                p = InStr(1, s2, Mid(s1, i, 1), vbBinaryCompare) 'search the character at position i from s1 in s2
                If p > 0 Then 'found a matching character, at position p in s2
                    f = f + 1 'increment the frequency counter
                    s2 = Left(s2, p - 1) & "~" & Mid(s2, p + 1)
                    Do
                        If i >= k Then Exit Do 'no more characters to search
                            If Mid(s2, p + 1, 1) = Mid(s1, i + 1, 1) Then 'test if the next character is the same in the two strings
                                f = f + 1 'increment the frequency counter
                                o = o + 1 'increment the order counter
                                i = i + 1
                                p = p + 1
                            Else
                                Exit Do
                            End If
                    Loop
                End If
                If i >= k Then Exit Do
                    i = i + 1
            Loop
            If o > 0 Then o = o + 1
finish:
            w = 2
            Fuzzy = (w * o + f) / (w + 1) / d2
        End If

End Function
Damian
  • 5,152
  • 1
  • 10
  • 21
  • 1
    Welcome to the site! Does the code work but need improvement, or does it have a bug that prevents it from running? Or is there some other problem? – cxw May 25 '19 at 12:03
  • The code works but not the way i want. It will calculate the Levenshtein distance but it is not enough for short texts so i need to combine the algorithm with some kind of syllable and substring lookup logic – MehmetCanbulat May 26 '19 at 13:07

0 Answers0