I'm trying to analyse a corpus and was wondering whether NLTK's part-of-speech (PoS) tagging supports all languages.
For example, if I download the `averaged_perceptron_tagger` model
and use the tagger as shown below, can it be used to tag Turkish text?
Does it still count verbs correctly even if the corpus is Turkish? I have set the stop words to Turkish, but I'm not sure whether this also influences the PoS tagging.
from newspaper import Article, fulltext # pip install newspaper3k
import requests, string
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK data packages used below: the stop-word lists, the Punkt
# tokenizer models and the averaged-perceptron PoS tagger model.
for _resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

# Turkish stop words, held in a set for fast membership tests.
my_stopwords = set(stopwords.words('turkish'))
from collections import Counter
# Download and parse the news article whose text will be analysed.
url = 'https://www.haber7.com/guncel/haber/3319295-erdoganin-dogal-gaz-mujdesinden-rahatsiz-oldular'
article = Article(url)
# newspaper3k's download() and parse() work in place and return None, so the
# calls cannot be chained; invoke them separately and then read `.text`.
article.download()
article.parse()
text = article.text

# Delete ASCII punctuation and lower-case the text. str.lower() is
# locale-independent: it turns 'I' into 'i' and U+0130 (dotted capital I)
# into 'i' followed by a combining dot, neither of which is right for
# Turkish. Map the two Turkish capitals explicitly (U+0130 -> 'i',
# 'I' -> U+0131 dotless i) in the same translate pass so the lower-cased
# tokens can match the Turkish stop words.
turkish_fold = str.maketrans('\u0130I', 'i\u0131', string.punctuation)
clean_text = text.translate(turkish_fold).lower()
print(clean_text)

tokenized = nltk.word_tokenize(clean_text)

# Keep only the tokens that are not Turkish stop words.
words = [token for token in tokenized if token not in my_stopwords]
def pos_taggin(tokens):
    """Return a Counter mapping each part-of-speech tag to its frequency.

    `tokens` is a sequence of word strings; it is handed to `nltk.pos_tag`
    with that function's default settings, and only the resulting tags
    (not the words) are counted.

    NOTE(review): the default `nltk.pos_tag` model appears to be trained on
    English, so tags assigned to Turkish tokens are probably not
    meaningful -- confirm against the NLTK documentation.
    """
    tagged = nltk.pos_tag(tokens)
    return Counter(label for _, label in tagged)
# Tag `words` (stop words removed) rather than `tokenized`; otherwise the
# stop-word filtering above has no effect on the tag counts.
pos = pos_taggin(words)
print(pos)

# List of pos-tags
nltk.download('tagsets')
nltk.help.upenn_tagset()