I am preprocessing a list of words from a file. I'm struggling to remove accents because the Unicode Normalizer works on strings only. I am getting the following error :
TypeError: normalize() argument 2 must be str, not list
Any way to remove accents from the entire list ?
Many thanks
import string
import nltk
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer
from nltk.corpus import stopwords
stopwords = stopwords.words('french')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
french_stopwords = nltk.corpus.stopwords.words('french')
from unicodedata import normalize
lemmatizer = FrenchLefffLemmatizer()
def preprocessing(affaires):
preprocess_list = []
for sentence in affaires :
sentence_w_punct = "".join([i.lower() for i in sentence if i not in string.punctuation])
tokenize_sentence = nltk.tokenize.word_tokenize(sentence_w_punct)
words_w_stopwords = [i for i in tokenize_sentence if i not in french_stopwords]
no_accent = ''.join(c for c in unicodedata.normalize('NFD', words_w_stopwords)
if unicodedata.category(c) != 'Mn')
remove_parasites = [j for j in no_accent if j not in parasites]
words_lemmatize = (lemmatizer.lemmatize(w) for w in remove_parasites)
sentence_clean = ' '.join(words_lemmatize)
preprocess_list.append(sentence_clean)
return preprocess_list
df["nom_affaire_clean"] = preprocessing(df["nom_affaire"])
cln = df.pop("nom_affaire_clean")
df.insert(1, 'nom_affaire_clean', cln )
df