1
library(dplyr)
library(ggplot2)
library(stm)
library(janeaustenr)
library(tidytext)

library(quanteda)
# Tokenise the open-ended responses (dropping punctuation, numbers and
# symbols) and build a document-feature matrix from the tokens.
testDfm <- dfm(
  tokens(
    gadarian$open.ended.response,
    remove_punct = TRUE,
    remove_numbers = TRUE,
    remove_symbols = TRUE
  )
)

# Convert the dfm into the list layout that stm() takes as input.
out <- convert(testDfm, to = "stm")
documents <- out[["documents"]]
vocab <- out[["vocab"]]
meta <- out[["meta"]]

# Fit a topic model with K = 5 topics on the converted documents.
topic_model <- stm(documents = documents, vocab = vocab, K = 5)

Using the lines above, a topic model can be fitted.

How can I use tidytext to find, for every row of the input data gadarian, which topic that row is linked to, and add the topic to the input data?

Example of expected output

"MetaID" "treatment" "pid_rep"  "open.ended.response" "topic_number"

Update: code that attempts to produce the expected output:

library(stm)
library(tidyr)
library(quanteda)
# Goal: add a `topic` column to `gadarian` holding, for every response, the
# number of the topic with the largest estimated proportion in that response.

# Tokenise the responses (dropping punctuation, numbers and symbols) and
# build a document-feature matrix.
testDfm <- gadarian$open.ended.response %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  dfm()

# convert() omits documents that are left without any tokens, so the fitted
# model can have fewer documents than `gadarian` has rows. Carry the original
# row number along as a document variable so results can be mapped back.
out <- convert(
  testDfm,
  to = "stm",
  docvars = data.frame(row_id = seq_len(nrow(gadarian)))
)
documents <- out$documents
vocab <- out$vocab
meta <- out$meta

fittedModel <- stm(documents = documents, vocab = vocab, K = 5)

# findThoughts(n = 1) yields only one exemplar document per topic, so it can
# label at most K rows. The document-topic proportions in `theta` (one row
# per modelled document, one column per topic) cover every document instead;
# max.col() picks the column, i.e. the topic number, with the largest value.
docTopics <- rep(NA_integer_, nrow(gadarian))
docTopics[meta$row_id] <- max.col(fittedModel$theta, ties.method = "first")

# Rows whose response was dropped before modelling keep NA.
gadarian$topic <- docTopics
rek
  • 177
  • 7

1 Answer

2
# Install reshape2 only when it is missing rather than on every run.
# NOTE(review): presumably required by tidy() for stm models in the tidytext
# version this was written against -- confirm it is still needed.
if (!requireNamespace("reshape2", quietly = TRUE)) {
  install.packages("reshape2")
}
library(reshape2)

# Per-topic term probabilities (the default "beta" matrix) in tidy form.
td_beta <- tidy(fittedModel)
td_beta

# Plot the ten highest-beta terms of each topic, one facet per topic.
td_beta %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  ggplot(aes(term, beta)) +
  geom_col() +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

# The row names of `gadarian` are used as document labels below, which is
# only correct when the model was fitted on every row (no response dropped
# as empty during conversion). Fail loudly instead of mislabelling.
stopifnot(nrow(fittedModel$theta) == nrow(gadarian))

# Per-document topic proportions (gamma) in tidy form.
td_gamma <- tidy(fittedModel, matrix = "gamma",
                 document_names = rownames(gadarian))
td_gamma

# One row per document: the topic with the highest gamma. This is the
# document-to-topic link that can be joined back onto `gadarian`.
doc_topics <- td_gamma %>%
  arrange(desc(gamma)) %>%
  group_by(document) %>%
  slice(1) %>%
  ungroup()
doc_topics
Paolo Lorenzini
  • 579
  • 2
  • 15