I wrote the script shown below (adapted from the infamous Coursera course, plus some additions of my own) and it has served me well. It no longer works, even though I haven't changed anything in it, so I suspect something changed underneath it.
The first thing that doesn't seem to work is the for loop to remove the special characters.
Next, after I convert the corpus with PlainTextDocument, the word cloud no longer works.
Finally, the tokenizer functions all produce essentially the same chart: the most frequently used single words, instead of the 2-, 3-, 4-word (and so on) n-grams they are supposed to count.
Not sure if package updates or R updates are causing this.
Any thoughts?
#Set working directory and read file
cname <- file.path("c:/texts")
cname
dir(cname)
setwd("c:/texts")
library("RColorBrewer")
library("tm")
library("knitr")
library("devtools")
library("plyr")
library("ggplot2")
library("wordcloud")
library("rJava")
library("RWeka")
library("stringi")
library("XLConnect")
library("XLConnectJars")
df<- readWorksheetFromFile("uars.xlsx", sheet=1, startRow=1)
df1 <- df[df$Business %in% "FRAUD", ]
#Load the R package for text mining and then load your texts into R.
library(tm)
docs <- Corpus(VectorSource(df1))
summary(docs)
#Read your documents in the R terminal using inspect()
inspect(docs)
#Preprocessing
#Removing punctuation
docs <- tm_map(docs, removePunctuation)
# remove special characters.
for (j in seq(docs)) {
  docs[[j]] <- gsub("/", " ", docs[[j]])
  docs[[j]] <- gsub("@", " ", docs[[j]])
  docs[[j]] <- gsub("\\|", " ", docs[[j]])
}
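For what it's worth, here is the alternative I've been experimenting with for this step, since newer tm versions seem to want transformations wrapped in content_transformer() (the toSpace helper name is just mine; I'm not sure this is the right fix):
#Possible replacement for the loop above, keeping each element a PlainTextDocument
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")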
#Removing numbers:
docs <- tm_map(docs, removeNumbers)
#Converting to lowercase:
docs <- tm_map(docs, tolower)
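Along the same lines, I tried wrapping tolower the way newer tm versions seem to require:
#Possible replacement for the lowercase step above
docs <- tm_map(docs, content_transformer(tolower))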
#Removing "stopwords" (common words) that usually have no analytic value
docs <- tm_map(docs, removeWords, c(stopwords("english"), "bank", "account", "customer", "transactions", "sent", "received", "company",
"wire", "wires", "payment", "payments", "wells", "fargo", "transaction", "fraud", "wholesale", "wholesal", "uar", "email"))
#Removing common word endings (e.g., "ing", "es", "s")
library(SnowballC)
docs <- tm_map(docs, stemDocument)
#Stripping unnecessary whitespace from your documents:
docs <- tm_map(docs, stripWhitespace)
#treat your preprocessed documents as text documents.
docs <- tm_map(docs, PlainTextDocument)
#Stage the Data
#To proceed, create a document term matrix
dtm <- DocumentTermMatrix(docs)
dtm
inspect(dtm)
#transpose of this matrix
tdm <- TermDocumentMatrix(docs)
tdm
wordcloud(docs, scale=c(3,0.5), min.freq=5, max.words=100, random.order=TRUE,
rot.per=0.5, colors=brewer.pal(8, "Set1"), use.r.layout=FALSE)
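In case passing the corpus straight into wordcloud() is what breaks, I also tried feeding it an explicit frequency vector computed from the term-document matrix (variable names here are mine):
#Alternative word cloud built from explicit word/frequency vectors instead of the corpus
freq <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
set.seed(42)  # reproducible layout
wordcloud(words=names(freq), freq=freq, scale=c(3,0.5), min.freq=5, max.words=100,
          random.order=TRUE, rot.per=0.5, colors=brewer.pal(8, "Set1"),
          use.r.layout=FALSE)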
#Tokenizer functions
bigram <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
quadgram <- function(x) NGramTokenizer(x, Weka_control(min=4, max=4))
fivegram <- function(x) NGramTokenizer (x, Weka_control(min=5, max=5))
sixgram <- function(x) NGramTokenizer (x, Weka_control(min=6, max=6))
#Word/phrase count function
freq_df <- function(tdm) {
  # Helper function to tabulate frequency
  freq <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
  freq_df <- data.frame(word=names(freq), freq=freq)
  return(freq_df)
}
#Creating the n-grams
corpus.unigram <- TermDocumentMatrix(docs)
corpus.unigram <- removeSparseTerms(corpus.unigram, 0.99)
corpus.unigram.freq <- freq_df(corpus.unigram)
corpus.bigram <- TermDocumentMatrix(docs, control=list(tokenize=bigram))
corpus.bigram <- removeSparseTerms(corpus.bigram, 0.999)
corpus.bigram.freq <- freq_df(corpus.bigram)
corpus.trigram <- TermDocumentMatrix(docs, control=list(tokenize=trigram))
corpus.trigram <- removeSparseTerms(corpus.trigram, 0.99)
corpus.trigram.freq <- freq_df(corpus.trigram)
corpus.quadgram <- TermDocumentMatrix(docs, control=list(tokenize=quadgram))
corpus.quadgram <- removeSparseTerms(corpus.quadgram, 0.9999)
corpus.quadgram.freq <- freq_df(corpus.quadgram)
corpus.fivegram <- TermDocumentMatrix(docs, control=list(tokenize=fivegram))
corpus.fivegram <- removeSparseTerms(corpus.fivegram, 0.9999)
corpus.fivegram.freq <- freq_df(corpus.fivegram)
corpus.sixgram <- TermDocumentMatrix(docs, control=list(tokenize=sixgram))
corpus.sixgram <- removeSparseTerms(corpus.sixgram, 0.9999)
corpus.sixgram.freq <- freq_df(corpus.sixgram)
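One thing I ran into while digging: in newer tm versions, Corpus(VectorSource(...)) can return a SimpleCorpus, and as far as I can tell a SimpleCorpus silently ignores custom tokenize functions in the control list, which would explain why every n-gram matrix looks like the unigram one. This is a sketch of what I was going to try next (docs_v is just my placeholder name, and the same preprocessing steps would need to be repeated on it):
#Possible fix for the n-grams: build a VCorpus so custom tokenizers are honoured
docs_v <- VCorpus(VectorSource(df1))
#...repeat the preprocessing steps above on docs_v...
corpus.bigram <- TermDocumentMatrix(docs_v, control=list(tokenize=bigram))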
top_50 <- function(df1, title, color) {
  # Plot the 50 most frequent terms from a frequency table built by freq_df()
  ggplot(df1[1:50, ], aes(x = 1:50, y = freq)) +
    geom_bar(stat = "identity", fill = color, colour = "black", width = 0.80) +
    coord_cartesian(xlim = c(0, 51)) +
    labs(title = title) +
    xlab("Words") +
    ylab("Count") +
    scale_x_continuous(breaks = seq(1, 50, by = 1), labels = df1$word[1:50]) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
top_50(corpus.unigram.freq,"Top 50 words","green")
top_50(corpus.bigram.freq,"Top 2 word combos","yellow")
top_50(corpus.trigram.freq,"Top 3 word combos","orange")
top_50(corpus.quadgram.freq,"Top 4 word combos","red")
top_50(corpus.fivegram.freq,"Top 5 word combos","blue")
top_50(corpus.sixgram.freq,"Top 6 word combos","purple")
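Finally, since I suspect a package or R update, this is how I plan to capture my environment when following up (just base R utilities):
#Capture the environment in case this turns out to be version-related
sessionInfo()
packageVersion("tm")
packageVersion("RWeka")
packageVersion("wordcloud")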