Getting the root of an Arabic word

Question

I have Python code that takes an Arabic word, gets its root, and also removes diacritics, but I have a problem with the output. For example, when the input is "العربيه" the output is "عرب", but when the input is "كاتب" the output is "ب", and when the input is "يخاف" the output is " خف".

This is my code:

# -*- coding=utf-8 -*-

import re
from arabic_const import *
import Tashaphyne
from Tashaphyne import *
import enum
from enum import Enum
# Search modes understood by normalize_text(). Members are compared through
# their ``.index`` attribute there (presumably the legacy PyPI ``enum``
# package rather than the stdlib one -- confirm).
search_type = Enum('unvoc_word', 'voc_word', 'root_word')

# Pre-compiled single-character classes, each built from arabic_const names.
HARAKAT_pat = re.compile(u"[%s]" % u"".join(
    (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA)))
HAMZAT_pat = re.compile(u"[%s]" % u"".join(
    (WAW_HAMZA, YEH_HAMZA)))
ALEFAT_pat = re.compile(u"[%s]" % u"".join(
    (ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, HAMZA_ABOVE, HAMZA_BELOW)))
LAMALEFAT_pat = re.compile(u"[%s]" % u"".join(
    (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE)))
#--------------------------------------
def strip_tashkeel(w):
    """Return *w* with every character matched by HARAKAT_pat removed."""
    without_marks = HARAKAT_pat.sub('', w)
    return without_marks

#strip tatweel from a word and return a result word
#--------------------------------------
def strip_tatweel(w):
    """Return *w* with every TATWEEL character removed."""
    tatweel_class = u'[%s]' % TATWEEL
    return re.sub(tatweel_class, '', w)


#--------------------------------------
def normalize_hamza(w):
    """Unify hamza spellings in *w* and return the result.

    Characters matched by ALEFAT_pat are replaced with ALEF first; then
    characters matched by HAMZAT_pat are replaced with HAMZA.
    """
    alef_unified = ALEFAT_pat.sub(ALEF, w)
    hamza_unified = HAMZAT_pat.sub(HAMZA, alef_unified)
    return hamza_unified

#--------------------------------------
def normalize_lamalef(w):
    """Replace each LAMALEFAT_pat match in *w* with LAM followed by ALEF."""
    lam_then_alef = u'%s%s' % (LAM, ALEF)
    return LAMALEFAT_pat.sub(lam_then_alef, w)

#--------------------------------------
def normalize_spellerrors(w):
    """Fold spelling variants in *w*: TEH_MARBUTA -> HEH, ALEF_MAKSURA -> YEH."""
    # Applied in this order, one substitution pass per pair.
    substitutions = (
        (TEH_MARBUTA, HEH),
        (ALEF_MAKSURA, YEH),
    )
    for variant, canonical in substitutions:
        w = re.sub(u'[%s]' % variant, canonical, w)
    return w


def normalize_text(word,searchtype):
    """Normalize *word*, optionally reduce it to its root, print and return it.

    The word is passed through strip_tashkeel, strip_tatweel,
    normalize_lamalef, normalize_hamza and normalize_spellerrors, in that
    order. When *searchtype* equals ``search_type.root_word.index`` the
    normalized word is additionally run through ArabicLightStemmer and
    replaced by the value of its ``get_root()``.

    The final value is printed to stdout before being returned.
    """
    pipeline = (
        strip_tashkeel,
        strip_tatweel,
        normalize_lamalef,
        normalize_hamza,
        normalize_spellerrors,
    )
    for step in pipeline:
        word = step(word)

    if searchtype == search_type.root_word.index:
        stemmer = ArabicLightStemmer()
        # The stem returned here is not used; the call is kept because
        # get_root() presumably relies on the stemmer having processed the
        # word first -- confirm against the Tashaphyne API.
        stemmer.lightStm(word)
        word = stemmer.get_root()

    print(word)
    return word
#---------------------------------------------

and this is the test code:

from task import normalize_text
normalize_text(u'كاتب',2)

and the output is: ب

FYI, for someone who can't read Arabic this question is not clear. You could try to explain better, or hope someone comes by who can read it :) — Thomas, Mar 01 '13 at 23:17
Maybe you'll find more help if you tag your question with "nlp" and "stemming". — Suzana, Mar 01 '13 at 23:29
Have you tried http://nltk.googlecode.com/svn/trunk/doc/api/nltk.stem.isri.ISRIStemmer-class.html? Might get you better results. — Danica, Mar 01 '13 at 23:30
I suggest you print `word` after each line in `normalize_text` to see what is destroying the input. It could be the `ArabicLightStemmer` object which we don't have the code to. — Omri Barel, Mar 01 '13 at 23:39
Why are you doing `import Tashaphyne` and then `from Tashaphyne import *`? You can simply do `from Tashaphyne import *` without the first one... It's just excessive. — Rushy Panchal, Mar 01 '13 at 23:41

Getting the root of an Arabic word

0 Answers