
I am working on an LDA topic model in Python which outputs the following topics:

(0, u'0.559*"delivery" + 0.124*"area" + 0.018*"mile" + 0.016*"option" + 0.012*"partner" + 0.011*"traffic" + 0.011*"hub" + 0.011*"thanks" + 0.010*"city" + 0.009*"way"')
(1, u'0.397*"package" + 0.073*"address" + 0.055*"time" + 0.047*"customer" + 0.045*"apartment" + 0.037*"delivery" + 0.031*"number" + 0.026*"item" + 0.021*"support" + 0.018*"door"')
(2, u'0.190*"time" + 0.127*"order" + 0.113*"minute" + 0.075*"pickup" + 0.074*"restaurant" + 0.031*"food" + 0.027*"support" + 0.027*"delivery" + 0.026*"pick" + 0.018*"min"')
(3, u'0.072*"code" + 0.067*"gps" + 0.053*"map" + 0.050*"street" + 0.047*"building" + 0.043*"address" + 0.042*"navigation" + 0.039*"access" + 0.035*"point" + 0.028*"gate"')
(4, u'0.434*"hour" + 0.068*"time" + 0.034*"min" + 0.032*"amount" + 0.024*"pay" + 0.019*"gas" + 0.018*"road" + 0.017*"today" + 0.016*"traffic" + 0.014*"load"')
(5, u'0.245*"route" + 0.154*"warehouse" + 0.043*"minute" + 0.039*"need" + 0.039*"today" + 0.026*"box" + 0.025*"facility" + 0.025*"bag" + 0.022*"end" + 0.020*"manager"')
(6, u'0.371*"location" + 0.110*"pick" + 0.097*"system" + 0.040*"im" + 0.038*"employee" + 0.022*"evening" + 0.018*"issue" + 0.015*"request" + 0.014*"while" + 0.013*"delivers"')
(7, u'0.182*"schedule" + 0.181*"please" + 0.059*"morning" + 0.050*"application" + 0.040*"payment" + 0.026*"change" + 0.025*"advance" + 0.025*"slot" + 0.020*"date" + 0.020*"tomorrow"')
(8, u'0.138*"stop" + 0.110*"work" + 0.062*"name" + 0.055*"account" + 0.046*"home" + 0.043*"guy" + 0.030*"address" + 0.026*"city" + 0.025*"everything" + 0.025*"feature"') 

Is there a way for the model to automatically create human-readable topic names (instead of topic numbers) for the above topic list, based on the features/words in each topic? I don't want to create a topic name manually for each topic.
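The closest I've come on my own is a crude label concatenated from each topic's top-weighted words, as in the rough sketch below (it uses the lda model trained by the code further down; the helper name naive_topic_label is mine), but that isn't the kind of readable name I'm after:

def naive_topic_label(lda, topic_id, topn=2):
    # show_topic() returns [(word, probability), ...] for one topic
    top_words = lda.show_topic(topic_id, topn=topn)
    return "-".join(word for word, prob in top_words)

for topic_id in range(lda.num_topics):
    print topic_id, naive_topic_label(lda, topic_id)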

I'm creating the LDA model as follows:

import gensim
import json
import pandas as pd
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import re
from nltk.tokenize import RegexpTokenizer
import pyLDAvis.gensim as gensimvis
import pyLDAvis
from gensim import corpora, models, similarities
import xlrd
from collections import OrderedDict

# Read the feedback spreadsheet; each row becomes an ordered dict
wb = xlrd.open_workbook("excel.xlsx")
sh = wb.sheet_by_index(0)

feedback_list = []

for rownum in range(1, sh.nrows):
    feedback = OrderedDict()
    row_values = sh.row_values(rownum)
    feedback['timestamp'] = row_values[0]
    feedback['text'] = row_values[1]
    feedback['header'] = row_values[2]
    feedback['transporter'] = row_values[3]
    feedback['device-type'] = row_values[4]
    feedback['app-version'] = row_values[5]
    feedback['locale'] = row_values[6]
    feedback['company-type'] = row_values[7]
    feedback['detected-language'] = row_values[8]  

    feedback_list.append(feedback)

# Serialize the feedback list to JSON so pandas can consume it
j = json.dumps({'feedback': feedback_list})

with open('data.json', 'w') as f:
    f.write(j)  

data_file = "data.json"

with open(data_file, "rb") as f:
    data = f.readlines()

# Reading the data into a dataframe: wrapping the JSON object in
# brackets makes pandas parse it as a one-row frame whose 'feedback'
# column holds the entire feedback list

data_json_str = "[" + ','.join(data) + "]"
data_df = pd.read_json(data_json_str)

num_reviews_tpadv = len(data_df["feedback"][0])
all_reviews = []

# Adding all the reviews to all_reviews list 

for i in range(num_reviews_tpadv):
    all_reviews.append(data_df["feedback"][0][i]["text"])

# Load the custom stopword list (one word per line)
stopwords = {}
with open('stopwords.txt', 'rU') as f:
    for line in f:
        stopwords[line.strip()] = 1

def clean_review(text):
    # Tokenize, remove stopwords, keep only nouns, and lemmatize
    words = []
    nouns = []
    if type(text) != int:  # skip numeric cells coming from the spreadsheet
        new_text = text.lower()
        sentences = nltk.sent_tokenize(new_text)

        for sentence in sentences:
            tokens = nltk.word_tokenize(sentence)
            text1 = [word for word in tokens if word not in stopwords]
            tagged_text = nltk.pos_tag(text1)

            for word, tag in tagged_text:
                words.append({"word": word, "pos": tag})

        lem = WordNetLemmatizer()

        # Keep singular (NN) and plural (NNS) nouns only
        words = [word for word in words if word["pos"] in ["NN", "NNS"]]

        for word in words:
            nouns.append(lem.lemmatize(word["word"]))

    return nouns

clean_reviews = []

for i in range(num_reviews_tpadv):
    clean_reviews.append(clean_review(data_df["feedback"][0][i]["text"]))

#----------------------------------------------------
# Creating Dictionary and Corpus to train LDA model
#----------------------------------------------------

dictionary = corpora.Dictionary(clean_reviews)
dictionary.filter_extremes(keep_n=11000)  # adjust filters as needed
dictionary.compactify()
dictionary_path = "dictionary.dict"
corpora.Dictionary.save(dictionary, dictionary_path)

corpus = [dictionary.doc2bow(doc) for doc in clean_reviews]

# Training LDA with the number of topics set to 20 (which can be changed)

lda = gensim.models.LdaModel(corpus, id2word=dictionary,
                             num_topics=20,
                             passes=20,
                             random_state=1,
                             alpha="auto")
lda_model_path = "lda_model.lda"
lda.save(lda_model_path)

for topic in lda.show_topics(num_topics=20):
    print topic
  • Easiest way, IMHO, is to identify the clusters from the topic models (might not be so trivial), then pick the centroid. If you're having trouble getting clusters from LDA, fall back to KNN or Gaussian models; then the centroid would be easier to get. It's a good question, but it might be too broad for SO; try https://datascience.stackexchange.com instead =) – alvas May 15 '17 at 03:15 (see the sketch after these comments)
  • Thanks, I will try posting it there as well. Hoping to get some direction. :) – Arman May 15 '17 at 05:43
  • I found the following on GitHub for automatic labeling of topic models: https://github.com/xiaohan2012/chowmein/tree/master/chowmein But I'm not able to understand it clearly with respect to my dataset. Could someone explain it to me? – Arman May 15 '17 at 06:18
  • @Arman did you find any way for automatic labeling? – user3778289 Aug 09 '18 at 08:40
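My rough, untested sketch of the centroid idea from the first comment (the KMeans setup, topn=2, and the variable names are my own assumptions, not something alvas specified): cluster the documents by their topic distributions, then label each cluster with the top words of its centroid's dominant topic:

import numpy as np
from sklearn.cluster import KMeans
from gensim import matutils

# One dense topic-distribution vector per document
doc_vectors = matutils.corpus2dense(lda[corpus], num_terms=lda.num_topics).T

kmeans = KMeans(n_clusters=lda.num_topics, random_state=1).fit(doc_vectors)

for cluster_id, centroid in enumerate(kmeans.cluster_centers_):
    # Use the centroid's strongest topic to supply candidate label words
    dominant_topic = int(np.argmax(centroid))
    top_words = [word for word, prob in lda.show_topic(dominant_topic, topn=2)]
    print cluster_id, "-".join(top_words)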

0 Answers