0

I have been trying to follow this example by Norbert Ryciak, whom I haven't been able to get in touch with.

Since the article was written in 2014, some things in R have changed. I have been able to update parts of the code accordingly, but I got stuck on the last part.

Here is my working code so far:

 # Fetch a set of Wikipedia articles, reduce the HTML to plain lower-case
 # text, and compute cosine dissimilarities from the resulting term counts.
 library(tm)
 library(stringi)
 library(proxy)

 wiki <- "https://en.wikipedia.org/wiki/"

 titles <- c("Integral", "Riemann_integral", "Riemann-Stieltjes_integral",  "Derivative",
  "Limit_of_a_sequence", "Edvard_Munch", "Vincent_van_Gogh", "Jan_Matejko",
  "Lev_Tolstoj", "Franz_Kafka", "J._R._R._Tolkien")

 # One slot per title; each slot receives the whole page as a single string.
 articles <- character(length(titles))

 # Read every page line by line and flatten the lines into one
 # space-separated string.
 for (i in 1:length(titles)) {
   articles[i] <- stri_flatten(readLines(stri_paste(wiki, titles[i])), col = " ")
  }

 docs <- Corpus(VectorSource(articles))

 # Print the first document before cleaning (interactive inspection).
 docs[[1]]
 # Replace HTML tags (non-greedy "<...>" match) and tab characters by spaces.
 docs2 <- tm_map(docs, function(x) stri_replace_all_regex(x, "<.+?>", " "))
 docs3 <- tm_map(docs2, function(x) stri_replace_all_fixed(x, "\t", " "))
 # NOTE(review): presumably re-wraps the output of the plain functions above
 # as tm documents; confirm this step is still needed with the installed tm.
 docs4 <- tm_map(docs3, PlainTextDocument)
 # Collapse whitespace, drop English stop words, strip punctuation, and
 # finally lower-case the text.
 docs5 <- tm_map(docs4, stripWhitespace)
 docs6 <- tm_map(docs5, removeWords, stopwords("english"))
 docs7 <- tm_map(docs6, removePunctuation)
 docs8 <- tm_map(docs7, content_transformer(tolower))
 # Print the first document after cleaning (interactive inspection).
 docs8[[1]]

 # A term-document matrix has terms as rows and documents as columns.
 docsTDM <- TermDocumentMatrix(docs8)
 docsTDM2 <- as.matrix(docsTDM)
 # dist() compares the rows of its input, so here the dissimilarities are
 # computed between terms, not between the articles.
 docsdissim <- dist(docsTDM2, method = "cosine")

But I haven't been able to get past this part:

 # Expand the dist object into a full square matrix and label both
 # dimensions with the article titles, then print it.
 docsdissim2 <- as.matrix(docsdissim)
 rownames(docsdissim2) <- titles
 colnames(docsdissim2) <- titles
 docsdissim2
 # Hierarchical clustering on the dissimilarities; draw the dendrogram with
 # the titles as leaf labels and an empty subtitle.
 h <- hclust(docsdissim, method = "ward.D")
 plot(h, labels = titles, sub = "")

I tried running "hclust" directly, and then I was able to plot, but nothing readable came out of it.

These are the errors I'm getting:

 rownames(docsdissim2) <- titles
 Error in `rownames<-`(`*tmp*`, value = c("Integral", "Riemann_integral",  : 
   length of 'dimnames' [1] not equal to array extent

Another:

 plot(h, labels = titles, sub = "")
 Error in graphics:::plotHclust(n1, merge, height, order(x$order), hang,  : 
   invalid dendrogram input

Is there anyone that could give me a hand to finish this example?

Best Regards,

tomcontr
  • 98
  • 9

1 Answer

1

I was able to solve this problem thanks to Norbert Ryciak (the author of the tutorial).

Since he used an older version of "tm" (which was probably the latest at the time) it was not compatible with the one I used.

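The solution was to replace "docsTDM <- TermDocumentMatrix(docs8)" with "docsTDM <- DocumentTermMatrix(docs8)". dist() compares the rows of the matrix: a term-document matrix has one row per term, so the distances were being computed between terms rather than between the 11 articles, which is why the titles did not fit as dimnames. A document-term matrix has one row per document.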

So the final code:

 # Cluster a set of Wikipedia articles by the cosine dissimilarity of their
 # term counts and draw the result as a dendrogram.
 library(tm)
 library(stringi)
 library(proxy)  # supplies a dist() that supports method = "cosine"

 wiki <- "https://en.wikipedia.org/wiki/"

 titles <- c("Integral", "Riemann_integral", "Riemann-Stieltjes_integral",
             "Derivative", "Limit_of_a_sequence", "Edvard_Munch",
             "Vincent_van_Gogh", "Jan_Matejko", "Lev_Tolstoj", "Franz_Kafka",
             "J._R._R._Tolkien")

 # One slot per title; each slot receives the whole page as a single string.
 articles <- character(length(titles))

 # Download every page and flatten its lines into one space-separated string.
 for (i in seq_along(titles)) {
   page <- readLines(stri_paste(wiki, titles[i]))
   articles[i] <- stri_flatten(page, collapse = " ")
 }

 docs <- Corpus(VectorSource(articles))

 # Print the first document before cleaning (interactive inspection).
 docs[[1]]
 # Replace HTML tags (non-greedy "<...>" match) and tab characters by spaces.
 docs2 <- tm_map(docs, function(x) stri_replace_all_regex(x, "<.+?>", " "))
 docs3 <- tm_map(docs2, function(x) stri_replace_all_fixed(x, "\t", " "))
 # NOTE(review): presumably re-wraps the output of the plain functions above
 # as tm documents; confirm this step is still needed with the installed tm.
 docs4 <- tm_map(docs3, PlainTextDocument)
 # Collapse whitespace, drop English stop words, strip punctuation, and
 # finally lower-case the text.
 # NOTE(review): stop words are removed before lower-casing, so capitalised
 # stop words may survive; confirm whether tolower should run first.
 docs5 <- tm_map(docs4, stripWhitespace)
 docs6 <- tm_map(docs5, removeWords, stopwords("english"))
 docs7 <- tm_map(docs6, removePunctuation)
 docs8 <- tm_map(docs7, content_transformer(tolower))
 # Print the first document after cleaning (interactive inspection).
 docs8[[1]]

 # Documents must be the rows: dist() compares rows, and the clustering
 # below is over the articles, not over the terms.
 docsTDM <- DocumentTermMatrix(docs8)
 docsTDM2 <- as.matrix(docsTDM)
 docsdissim <- dist(docsTDM2, method = "cosine")

 # Expand the dist object into a full square matrix and label both
 # dimensions with the article titles, then print it.
 docsdissim2 <- as.matrix(docsdissim)
 rownames(docsdissim2) <- titles
 colnames(docsdissim2) <- titles
 docsdissim2
 # "ward" was renamed "ward.D" in R 3.1.0; the old name is still accepted
 # but emits a message on every call.
 h <- hclust(docsdissim, method = "ward.D")
 plot(h, labels = titles, sub = "")
tomcontr
  • 98
  • 9