I am working on an LDA topic model in Python (gensim) which gives output of the following topics:
(0, u'0.559*"delivery" + 0.124*"area" + 0.018*"mile" + 0.016*"option" + 0.012*"partner" + 0.011*"traffic" + 0.011*"hub" + 0.011*"thanks" + 0.010*"city" + 0.009*"way"')
(1, u'0.397*"package" + 0.073*"address" + 0.055*"time" + 0.047*"customer" + 0.045*"apartment" + 0.037*"delivery" + 0.031*"number" + 0.026*"item" + 0.021*"support" + 0.018*"door"')
(2, u'0.190*"time" + 0.127*"order" + 0.113*"minute" + 0.075*"pickup" + 0.074*"restaurant" + 0.031*"food" + 0.027*"support" + 0.027*"delivery" + 0.026*"pick" + 0.018*"min"')
(3, u'0.072*"code" + 0.067*"gps" + 0.053*"map" + 0.050*"street" + 0.047*"building" + 0.043*"address" + 0.042*"navigation" + 0.039*"access" + 0.035*"point" + 0.028*"gate"')
(4, u'0.434*"hour" + 0.068*"time" + 0.034*"min" + 0.032*"amount" + 0.024*"pay" + 0.019*"gas" + 0.018*"road" + 0.017*"today" + 0.016*"traffic" + 0.014*"load"')
(5, u'0.245*"route" + 0.154*"warehouse" + 0.043*"minute" + 0.039*"need" + 0.039*"today" + 0.026*"box" + 0.025*"facility" + 0.025*"bag" + 0.022*"end" + 0.020*"manager"')
(6, u'0.371*"location" + 0.110*"pick" + 0.097*"system" + 0.040*"im" + 0.038*"employee" + 0.022*"evening" + 0.018*"issue" + 0.015*"request" + 0.014*"while" + 0.013*"delivers"')
(7, u'0.182*"schedule" + 0.181*"please" + 0.059*"morning" + 0.050*"application" + 0.040*"payment" + 0.026*"change" + 0.025*"advance" + 0.025*"slot" + 0.020*"date" + 0.020*"tomorrow"')
(8, u'0.138*"stop" + 0.110*"work" + 0.062*"name" + 0.055*"account" + 0.046*"home" + 0.043*"guy" + 0.030*"address" + 0.026*"city" + 0.025*"everything" + 0.025*"feature"')
Is there a way for the model to automatically create human-readable topic names (instead of topic numbers) based on the features/words in each topic? I don't want to name every topic manually; the closest I've managed is the naive top-words label sketched at the end of this post.
I'm creating the LDA model as follows:
import gensim
import json
import re
import pandas as pd
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import pyLDAvis
import pyLDAvis.gensim as gensimvis
from gensim import corpora, models, similarities
import xlrd
from collections import OrderedDict
wb = xlrd.open_workbook("excel.xlsx")
sh = wb.sheet_by_index(0)
feedback_list = []
# Skip the header row, then build one ordered record per spreadsheet row
for rownum in range(1, sh.nrows):
    feedback = OrderedDict()
    row_values = sh.row_values(rownum)
    feedback['timestamp'] = row_values[0]
    feedback['text'] = row_values[1]
    feedback['header'] = row_values[2]
    feedback['transporter'] = row_values[3]
    feedback['device-type'] = row_values[4]
    feedback['app-version'] = row_values[5]
    feedback['locale'] = row_values[6]
    feedback['company-type'] = row_values[7]
    feedback['detected-language'] = row_values[8]
    feedback_list.append(feedback)
j = json.dumps({'feedback': feedback_list})
with open('data.json', 'w') as f:
    f.write(j)
data_file = "data.json"
# Reading the data back into a dataframe (the file holds one JSON
# object per line, so wrap the lines in a list before parsing)
with open(data_file, "r") as f:
    data = f.readlines()
data_json_str = "[" + ",".join(data) + "]"
data_df = pd.read_json(data_json_str)
num_reviews_tpadv = len(data_df["feedback"][0])
all_reviews = []
# Adding all the reviews to the all_reviews list
for i in range(num_reviews_tpadv):
    all_reviews.append(data_df["feedback"][0][i]["text"])
stopwords = {}
with open('stopwords.txt', 'r') as f:
    for line in f:
        stopwords[line.strip()] = 1
def clean_review(text):
    words = []
    nouns = []
    if not isinstance(text, int):  # skip non-text cells
        new_text = text.lower()
        sentences = nltk.sent_tokenize(new_text)
        for sentence in sentences:
            tokens = nltk.word_tokenize(sentence)
            text1 = [word for word in tokens if word not in stopwords]
            tagged_text = nltk.pos_tag(text1)
            for word, tag in tagged_text:
                words.append({"word": word, "pos": tag})
    # Keep only nouns (NN/NNS) and lemmatize them
    lem = WordNetLemmatizer()
    words = [word for word in words if word["pos"] in ["NN", "NNS"]]
    for word in words:
        nouns.append(lem.lemmatize(word["word"]))
    return nouns
clean_reviews = []
for i in range(num_reviews_tpadv):
    clean_reviews.append(clean_review(data_df["feedback"][0][i]["text"]))
#----------------------------------------------------
# Creating Dictionary and Corpus to train LDA model
#----------------------------------------------------
dictionary = corpora.Dictionary(clean_reviews)
dictionary.filter_extremes(keep_n=11000)  # keep at most the 11,000 most frequent tokens (tune as needed)
dictionary.compactify()
dictionary_path = "dictionary.dict"
dictionary.save(dictionary_path)
corpus = [dictionary.doc2bow(doc) for doc in clean_reviews]
# Training LDA with num_topics = 20 (which can be changed)
lda = gensim.models.LdaModel(corpus, id2word=dictionary,
                             num_topics=20,
                             passes=20,
                             random_state=1,
                             alpha="auto")
lda_model_path = "lda_model.lda"
lda.save(lda_model_path)
for topic in lda.show_topics(num_topics=20):
    print(topic)
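For reference, the closest I've gotten to automatic names is just joining each topic's top words into a label. A minimal sketch (assuming a recent gensim where show_topic() returns (word, probability) pairs; topic_names and top_words are my own throwaway names):
# Naive automatic "names": join each topic's top-3 words.
# Assumes show_topic() yields (word, probability) pairs,
# which holds for recent gensim versions.
topic_names = {}
for topic_id in range(lda.num_topics):
    top_words = [word for word, prob in lda.show_topic(topic_id, topn=3)]
    topic_names[topic_id] = "_".join(top_words)
print(topic_names)
# e.g. {0: 'delivery_area_mile', 1: 'package_address_time', ...}
But that just restates the keywords. What I'm hoping for is a name that summarises the topic (e.g. topic 3, dominated by gps/map/street/navigation, getting a label like "navigation"), without hand-labelling every topic.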