8

In Arabic, a letter like "ا" (Alef) has many forms/variations:

(ا, أ, إ, آ)

also it's the same case with the letter ي, it could also be ى.

What I am trying to do is to get ALL the possible variations of a word with many أ and ي letters.

For example the word "أين" should have all these possible (non-correct in most cases) variations: أين, إين, اين, آين, أىن, إين, اىن, آىن ... etc.

Why? I am building a small text correction system that can handle syntax mistakes and replace faulty words with the correct ones.

I have been trying to do this in the most clean way possible, but I ended up with a 8 for/foreach loops just to handle the word "أ"

There must be a better more clean way to do this! Any thoughts?

Here is my code up to this point:

        $alefVariations = ['ا', 'إ', 'أ', 'آ'];
        $word = 'أيامنا';

        // Break into letters
        $wordLetters = preg_split('//u', $word, null, PREG_SPLIT_NO_EMPTY);
        $wordAlefLettersIndexes = [];

        // Get the أ letters
        for($letterIndex = 0; $letterIndex < count($wordLetters); $letterIndex++){
            if(in_array($wordLetters[$letterIndex], $alefVariations)){
                $wordAlefLettersIndexes[] = $letterIndex;
            }
        }

        $eachLetterVariations = [];
        foreach($wordAlefLettersIndexes as $alefLettersIndex){
            foreach($alefVariations as $alefVariation){
                $wordCopy = $wordLetters;
                $wordCopy[$alefLettersIndex] = $alefVariation;

                $eachLetterVariations[$alefLettersIndex][] = $wordCopy;
            }
        }

        $variations = [];
        foreach($wordAlefLettersIndexes as $alefLettersIndex){
            $alefWordVariations = $eachLetterVariations[$alefLettersIndex];

            foreach($wordAlefLettersIndexes as $alefLettersIndex_inner){
                if($alefLettersIndex == $alefLettersIndex_inner) continue;

                foreach($alefWordVariations as $alefWordVariation){
                    foreach($alefVariations as $alefVariation){
                        $alefWordVariationCopy = $alefWordVariation;
                        $alefWordVariationCopy[$alefLettersIndex_inner] = $alefVariation;

                        $variations[] = $alefWordVariationCopy;
                    }
                }
            }
        }

        $finalList = [];
        foreach($variations as $variation){
            $finalList[] = implode('', $variation);
        }

        return array_unique($finalList);
Dewan159
  • 2,984
  • 7
  • 39
  • 43
  • 2
    What your talking about is probably best achieved with machine learning/AI – Joseph_J May 07 '18 at 09:01
  • @Joseph_J ammm, really?! Could it be that complex? I am hoping that some guru will come and give me like a 10 lines solution or something :) – Dewan159 May 07 '18 at 09:07
  • Would it work to first reduce input to base characters and then build a result array by stepping through the simplified word with a recursive replacement, to generate all possibilities? The base character could be the replacement-matrix key. – Teson May 07 '18 at 09:11
  • 1
    Well, I think you answered the question yourself, it took you 8 loops for a simple word. I took your question to be that you were looking for a predictive text application. Languages are hard to code. All the worlds best are still trying to perfect it. – Joseph_J May 07 '18 at 09:12
  • @Teson I don't think this would work, If I understood you well. – Dewan159 May 07 '18 at 10:17
  • Could you remove the quotes around the letters in your OP as it makes them difficult to read, I'd do it but it seems to mess with the letters themselves. – Script47 May 08 '18 at 11:39

1 Answers1

2

I don't think this is the way to do autocorrect, but here's a generic solution for the problem you asked. It uses recursion and it's in javascript (I don't know php).

function solve(word, sameLetters, customIndices = []){
    var splitLetters = word.split('')
                .map((char, index) => { // check if the current letter is within any variation
                    if(customIndices.length == 0 || customIndices.includes(index)){
                        var variations = sameLetters.find(arr => arr.includes(char));
                        if(variations != undefined) return variations;
                    }
                    return [char];
                 });

    // up to this point splitLetters will be like this
    //  [["ا","إ","أ","آ"],["ي","ى","ي"],["ا"],["م"],["ن"],["ا"]]
    var res = [];
    recurse(splitLetters, 0, '', res); // this function will generate all the permuations
    return res;
}

function recurse(letters, index, cur, res){
    if(index == letters.length){
        res.push(cur);
    } else {
        for(var letter of letters[index]) {
            recurse(letters, index + 1, cur + letter, res );
        }
    }
}

var sameLetters = [     // represents the variations that you want to enumerate
    ['ا', 'إ', 'أ', 'آ'],
    ['ي', 'ى', 'ي']
];

var word = 'أيامنا';    
var customIndices = [0, 1]; // will make variations to the letters in these indices only. leave it empty for all indices

var ans = solve(word, sameLetters, customIndices);
console.log(ans);
Ayman El Temsahi
  • 2,600
  • 2
  • 17
  • 27