0

I have a static phrase that I am searching an OCR'd image for.

string KeywordToFind = "Account Number"

string OcrPageText = "
GEORGIA
POWER

A SOUTHERN COMPANY

AecountNumber

122- 493

Pagel of2

Please Pay By
Jan 29,2014

Total Due
39.11
"

How can I find the word "AecountNumber" using my keyword "Account Number"?

I have tried using variations of the Levenshtein Distance Algorithm HERE with varied success. I've also tried regexes, but the OCR often converts the text differently, thus rendering the regex useless.

Suggestions? I can provide more code if the link doesn't give enough information. Also, Thanks!

Community
  • 1
  • 1
Milne
  • 850
  • 1
  • 11
  • 28
  • Spilling over from your other question; ok now I can see why looking for 'account' and 'number' separately won't work for you. So can you post specifics about that case, ie. what your lev. algo gives are errors in that case? – Jim W Feb 07 '14 at 18:40

2 Answers2

0

Why not try something mostly arbitrary, like this? While it would certainly match a lot more than just "Account Number", the chances of the start and end characters existing elsewhere in that order are pretty slim.

A.?c.?.?nt ?N.?[mn]b.?r

http://regex101.com/r/zV1yM2

It'll match things like:

Account Number
AccntNumbr
Aecnt Nunber
brandonscript
  • 68,675
  • 32
  • 163
  • 220
  • What if the first or last character is the one that OCR recognized incorrectly? – Milne Feb 07 '14 at 17:32
  • That's the trouble with OCR. Too many unknowns. You're going to have to give it /some/ sort of expected criteria if you want it to work with a regular expression. Best thing to do is take some samples of the OCR and figure out which letters are most likely to be identified correctly (I'd guess the capital A and N?) and then the least likely should become `.?`. – brandonscript Feb 07 '14 at 17:46
0

I answered my own question by using substrings. Posting it in case others run into the same type of problem. It is a little unorthodox, but it works great for me.

// Fuzzy search: slide a window over PossibleString and score each window against
// StaticText with LevenshteinAlgorithm. The best-scoring window that stays within
// the allowance is stored in PossibleResult.

decimal PossibleStringLength = PossibleString.Length; //Length of string to search
decimal StaticTextLength = StaticText.Length; //Length of text to search for

// Declared after StaticTextLength because a local cannot be read before its declaration.
int TextLengthBuffer = (int)StaticTextLength - 1; //start looking for correct result with one less character than it should have.
int LowestLevenshteinNumber = 999999; //initialize insanely high maximum

// Find number of errors allowed with given ErrorAllowance percentage.
// Dividing by 100m forces decimal division, so an integral ErrorAllowance is not truncated to 0.
decimal NumberOfErrorsAllowed = Math.Round(StaticTextLength * (ErrorAllowance / 100m), MidpointRounding.AwayFromZero);

//Look for best match with 1 less character than it should have, then the correct amount of characters.
//And last, with 1 more character. (This is because one letter can be recognized as
//two (W -> VV) and vice versa)

for (int i = 0; i < 3; i++)
{
    // An empty StaticText yields a window length of -1 on the first pass; Substring would throw.
    if (TextLengthBuffer >= 0)
    {
        // Every window in this pass has length TextLengthBuffer, so the allowance is constant
        // for the pass: the base allowance shifted by the window's length difference.
        int lAllowance = (int)((TextLengthBuffer - StaticTextLength) + NumberOfErrorsAllowed);

        // e is the exclusive end index of the current window inside PossibleString.
        for (int e = TextLengthBuffer; e <= (int)PossibleStringLength; e++)
        {
            string possibleResult = PossibleString.Substring(e - TextLengthBuffer, TextLengthBuffer);
            int lNumber = LevenshteinAlgorithm(StaticText, possibleResult);

            // Accept a strictly better score; on a tie, prefer a window of exactly the expected length.
            bool isBetter = lNumber < LowestLevenshteinNumber;
            bool isTieAtExactLength = TextLengthBuffer == StaticText.Length && lNumber <= LowestLevenshteinNumber;

            if (lNumber <= lAllowance && (isBetter || isTieAtExactLength))
            {
                PossibleResult = new StaticTextResult { text = possibleResult, errors = lNumber };
                LowestLevenshteinNumber = lNumber;
            }
        }
    }
    TextLengthBuffer++;
}




/// <summary>
/// Computes the Levenshtein edit distance between two strings: the minimum number of
/// single-character insertions, deletions and substitutions that turn one into the other.
/// </summary>
/// <param name="s">The first string.</param>
/// <param name="t">The second string.</param>
/// <returns>
/// The edit distance. If either string is empty, this is the length of the other string.
/// </returns>
public static int LevenshteinAlgorithm(string s, string t)
{
    int sourceLength = s.Length;
    int targetLength = t.Length;

    // Transforming to or from an empty string costs one edit per character.
    if (sourceLength == 0)
    {
        return targetLength;
    }

    if (targetLength == 0)
    {
        return sourceLength;
    }

    // table[row, col] holds the distance between the first `row` chars of s
    // and the first `col` chars of t.
    int[,] table = new int[sourceLength + 1, targetLength + 1];

    // First column: delete every character of the s prefix.
    for (int row = 0; row <= sourceLength; row++)
    {
        table[row, 0] = row;
    }

    // First row: insert every character of the t prefix.
    for (int col = 0; col <= targetLength; col++)
    {
        table[0, col] = col;
    }

    for (int row = 1; row <= sourceLength; row++)
    {
        char sourceChar = s[row - 1];

        for (int col = 1; col <= targetLength; col++)
        {
            int substitution = table[row - 1, col - 1] + (sourceChar == t[col - 1] ? 0 : 1);
            int deletion = table[row - 1, col] + 1;
            int insertion = table[row, col - 1] + 1;

            int cheapest = deletion < insertion ? deletion : insertion;
            if (substitution < cheapest)
            {
                cheapest = substitution;
            }

            table[row, col] = cheapest;
        }
    }

    return table[sourceLength, targetLength];
}
Milne
  • 850
  • 1
  • 11
  • 28