I am trying to build a document term matrix using preidentifed terms. The corpus is identified in the variable cname and the file with the preidentified terms are read into the terms variable which is then converted into a list. When I run the code below I get an empty DTM. The code I am using below. Any ideas on what I did wrong? Thank you!!!
tom
library(tm)
library(Rmpfr)
library(stm)
#Loading Documents
cname <- file.path("", "corpus", "goodsmoklss")
library(tm)
corp <- VCorpus(DirSource(cname))
#Transformations
docs<-tm_map(corp,tolower) #AllLowerCase
docs<-tm_map(corp,removeNumbers) #RemoveNumbers
#Remove Stopwords like is, was, the etc
docs<-tm_map(corp, removeWords, stopwords("english"))
#make Sure it is a PLainTextDocument
documents<-tm_map(docs,PlainTextDocument)
#read in list of preidentified terms
terms=read.delim("C:/corpus/TermList.csv", header=F, stringsAsFactor=F)
tokenizing.phrases <- c(terms)
library("RWeka")
phraseTokenizer <- function(x) {
require(stringr)
x <- as.character(x) # extract the plain text from TextDocument object
x <- str_trim(x)
if (is.na(x)) return("")
phrase.hits <- str_detect(x, coll(tokenizing.phrases))
if (any(phrase.hits)) {
# only split once on the first hit, so we don't have to worry about #multiple occurences of the same phrase
split.phrase <- tokenizing.phrases[which(phrase.hits)[1]]
#warning(paste("split phrase:", split.phrase))
temp <- unlist(str_split(x, coll(split.phrase), 2))
out <- c(phraseTokenizer(temp[1]), split.phrase, phraseTokenizer(temp[2]))
} else {
#out <- MC_tokenizer(x)
out <- " "
}
# get rid of any extraneous empty strings, which can happen if a phrase occurs just before a punctuation
out[out != ""]
}
dtm <- DocumentTermMatrix(documents, control = list(tokenize = phraseTokenizer))