Here is another approach based on the Arabic Unicode block:
/**
 * Lookup table consulted by removeDiacritics.
 *
 * Each key is a single Arabic-script character; each value is the string that
 * character is meant to be reduced to.
 *
 * - Many letters map to themselves, which marks them as letters to keep as-is.
 * - Variant forms map to a plainer letter (for example the alef variants at
 *   the top of the table all map to bare alef).
 * - A few keys map to a two-character string that starts with a standalone
 *   hamza followed by the plain letter.
 *
 * Characters that have no entry here (combining marks, spaces, digits,
 * non-Arabic text, ...) are simply absent from the table.
 *
 * NOTE(review): which variants are folded and which are kept appears to follow
 * whether the character has its own base name in Unicode - confirm before
 * relying on any individual mapping.
 */
const map = {
'آ': 'ا',
'أ': 'ا',
'إ': 'ا',
'ا': 'ا',
'ٱ': 'ا',
'ٲ': 'ا',
'ٳ': 'ا',
'ؤ': 'و',
'ئ': 'ى',
'ؽ': 'ؽ',
'ؾ': 'ؾ',
'ؿ': 'ؿ',
'ي': 'ى',
'ب': 'ب',
'ت': 'ت',
'ؠ': 'ؠ',
'ة': 'ه',
'ث': 'ث',
'ج': 'ج',
'ح': 'ح',
'خ': 'خ',
'د': 'د',
'ذ': 'ذ',
'ر': 'ر',
'ز': 'ز',
'س': 'س',
'ش': 'ش',
'ص': 'ص',
'ض': 'ض',
'ط': 'ط',
'ظ': 'ظ',
'ع': 'ع',
'غ': 'غ',
'ػ': 'ک',
'ؼ': 'ک',
'ف': 'ف',
'ق': 'ق',
'ك': 'ك',
'ګ': 'ك',
'ڬ': 'ك',
'ڭ': 'ڭ',
'ڮ': 'ك',
'ل': 'ل',
'م': 'م',
'ن': 'ن',
'ه': 'ه',
'و': 'و',
'ى': 'ى',
'ٸ': 'ى',
'ٵ': 'ءا', // hamza alef?
'ٶ': 'ءو', // hamza waw?
'ٹ': 'ٹ',
'ٺ': 'ٺ',
'ٻ': 'ٻ',
'ټ': 'ت',
'ٽ': 'ت',
'پ': 'پ',
'ٿ': 'ٿ',
'ڀ': 'ڀ',
'ځ': 'ءح',
'ڂ': 'ح',
'ڃ': 'ڃ',
'ڄ': 'ڄ',
'څ': 'ح',
'چ': 'چ',
'ڇ': 'ڇ',
'ڈ': 'ڈ',
'ډ': 'د',
'ڊ': 'د',
'ڋ': 'د',
'ڌ': 'ڌ',
'ڍ': 'ڍ',
'ڎ': 'ڎ',
'ڏ': 'د',
'ڐ': 'د',
'ڑ': 'ڑ',
'ڒ': 'ر',
'ړ': 'ر',
'ڔ': 'ر',
'ڕ': 'ر',
'ږ': 'ر',
'ڗ': 'ر',
'ژ': 'ژ',
'ڙ': 'ڙ',
'ښ': 'س',
'ڛ': 'س',
'ڜ': 'س',
'ڝ': 'ص',
'ڞ': 'ص',
'ڟ': 'ط',
'ڠ': 'ع',
'ڡ': 'ڡ',
'ڢ': 'ڡ',
'ڣ': 'ڡ',
'ڤ': 'ڤ',
'ڥ': 'ڡ',
'ڦ': 'ڦ',
'ڧ': 'ق',
'ڨ': 'ق',
'ک': 'ک',
'ڪ': 'ڪ',
'گ': 'گ',
'ڰ': 'گ',
'ڱ': 'ڱ',
'ڲ': 'گ',
'ڳ': 'ڳ',
'ڴ': 'گ',
'ڵ': 'ل',
'ڶ': 'ل',
'ڷ': 'ل',
'ڸ': 'ل',
'ڹ': 'ن',
'ں': 'ں',
'ڻ': 'ڻ',
'ڼ': 'ن',
'ڽ': 'ن',
'ھ': 'ه',
'ڿ': 'چ',
'ۀ': 'ه',
'ہ': 'ہ',
'ۂ': 'ءہ',
'ۃ': 'ہ',
'ۄ': 'و',
'ۅ': 'ۅ',
'ۆ': 'ۆ',
'ۇ': 'ۇ',
'ۈ': 'ۈ',
'ۉ': 'ۉ',
'ۊ': 'و',
'ۋ': 'ۋ',
'ی': 'ی',
'ۍ': 'ي',
'ێ': 'ي',
'ۏ': 'و',
'ې': 'ې',
'ۑ': 'ي',
'ے': 'ے',
'ۓ': 'ے',
'ە': 'ە',
'ۺ': 'ش',
'ۻ': 'ض',
'ۼ': 'ۼ',
'ۿ': 'ه'
}
/**
 * Strip diacritics from Arabic-script text.
 *
 * The text is walked one Unicode code point at a time:
 *  - a character that has an entry in `map` is replaced by that entry's value
 *    (which may be the same letter, a plainer letter, or hamza + letter);
 *  - a character with no entry that is a Unicode combining mark (general
 *    category M, e.g. fatha, kasra, shadda, sukun) is dropped;
 *  - every other character (spaces, digits, punctuation, standalone hamza,
 *    non-Arabic text) is passed through unchanged.
 *
 * @param {string} text - The text to normalize.
 * @returns {string} The text with mapped letters folded and combining marks
 *   removed.
 */
function removeDiacritics(text) {
  // Matches any single combining mark; created once per call, not per symbol.
  const combiningMark = /\p{M}/u;
  const result = [];
  // Iterating the string directly yields whole code points, not UTF-16 units.
  for (const symbol of text) {
    const replacement = map[symbol];
    if (replacement !== undefined) {
      // Emit the table's value, not the input symbol, so variants are folded.
      result.push(replacement);
    } else if (!combiningMark.test(symbol)) {
      // Not a letter we know and not a diacritic: keep it verbatim so that
      // whitespace, punctuation and other scripts survive.
      result.push(symbol);
    }
  }
  return result.join('');
}
Some letters could still be considered to have diacritics, such as ژ "jeh", which looks like ر "reh" with extra marks. But since it is given a different fundamental name in Arabic, I did not strip its "extra markings" to turn it into "reh". The same question came up in a few other cases: for example, ڢ "dot below feh" is folded into ڡ "feh", whereas ڤ and ڦ have their own fundamental names and are kept as they are, but ڥ does not and is folded as well. I'm not sure of the best way to approach those. I don't know with 100% certainty the exact definition of what is and is not a diacritic, but this should be a good start.
Also, the "hamza + letter" ligatures were converted into hamza and the letter separately.
If you know how to improve this, please comment and add a fix if you'd like.