I have a large datasets with tweets in different languages. I want to apply a preprocessing function just to the sentences that are in german.
import time
import re
import sys
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
def preprocess(sentence):
sentence=str(sentence)
sentence = sentence.lower()
sentence=sentence.replace('{html}',"")
cleanr = re.compile('<.*?>')
cleantext = re.sub(cleanr, '', sentence)
rem_url=re.sub(r'http\S+', '',cleantext)
rem_num = re.sub('[0-9]+', '', rem_url)
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(rem_num)
filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('german')]
return " ".join(filtered_words)
#run previous function
test['cleanText']=test.apply(lambda s:preprocess(s['text']) if s['lang'] == "de" else None)
When I try to run the code, I get the following error
KeyError: 'lang'