0

I have several data frames structured like this:

set.seed(123)
dat1 <- data.frame(Region = rep(c("r1","r2"), each = 100),
                   State = rep(c("NY","MA","FL","GA"), each = 10),
                   Loc = rep(c("a","b","c","d","e","f","g","h"),each = 5),
                   ID = rep(c(1:10), each = 2),
                   var1 = rnorm(200),
                   var2 = rnorm(200),
                   var3 = rnorm(200),
                   var4 = rnorm(200),
                   var5 = rnorm(200))
dat1$ID <- factor(dat1$ID)

I am using the mclust package to fit mixture models. There are several things I need to do iteratively on each of these data frames, and I find myself feeling very unorganized (and with a full environment) when trying to do them multiple times. So I have been writing the function below to make things easier, and produce the output I need to see. Basically, I need to be able to specify the data (dat) the columns I want it to use (cols), the number of mixture components I want it to use (g), and the model name I want it to use (modname). The steps of what I am trying to accomplish with the function are as follows: - fit the model, make density and classification plots, - get bootstraped parameter estimates using non-paremetric and weighted likelihood resampling - compare the two bootstrap methods in the figure *use the non-parametric bootstrap for the remainder - make a data frame that combines the original data with a new column showing which cluster the observation was assigned to (return via knitr::kable) - collect the mean, bootstrapped standard error, and number of observations in each cluster; use them to make a plot that shows the average +/- SE for each variable connected by cluster, also return these values via kable - Return kables showing how many observations are in each cluster that correspond with each grouping variable

library(tidyverse)
library(mclust)
library(knitr)
fitclust <- function(dat, cols, g, modname){
  set.seed(123)
  mod <- Mclust(dat[,cols], G=g, modelNames = modname, initialization = list(hcPairs = randomPairs(dat[,cols], seed = 123)))
  plot(mod, what = "classification")
  plot(mod, what = "density")
  rand <- kable(data.frame(GV = c("Region", "State", "Loc"),RandIdx = rbind(adjustedRandIndex(dat$Region, mod$classification), adjustedRandIndex(dat$State, mod$classification),adjustedRandIndex(dat$Loc, mod$classification))))
  boot <- MclustBootstrap(mod, nboot = 2, type = "bs")#2 for now to speed it up
  wlboot <- MclustBootstrap(mod, nboot = 2, type = "wlbs")

  modinfo<- kable(lapply(c(mod$parameters[1:2], mod$parameters$variance[4:7]), data.frame))
  probs<-data.frame(mod$z)
  colnames(probs) <- paste0("C", 1:ncol(probs))
  probs <- kable(probs)

  boot.ci <- summary(boot, what = "ci") 
  wlboot.ci <- summary(wlboot, what = "ci")
  par(mfrow = c(1,2), mar = c(4,4,1,1)) 
for(j in 1:mod$G) { 
  print(plot(1:mod$G, mod$parameters$mean[j,], col = 1:mod$G, pch = 15, 
       ylab = colnames(dat[,cols])[j], 
       xlab = "Mixture component", 
       ylim = range(boot.ci$mean,wlboot.ci$mean), 
       xlim = c(.5,mod$G+.5), xaxt = "n")+ 
  points(1:mod$G+0.2, mod$parameters$mean[j,], col = 1:mod$G, pch = 15)+ 
  axis(side = 1, at = 1:mod$G)+ 
  with(boot.ci, errorBars(1:G, mean[1,j,], mean[2,j,], col = 1:G))+ 
  with(wlboot.ci, errorBars(1:G+0.2, mean[1,j,], mean[2,j,], col = 1:G, lty = 2))) } 
  par(mfrow = c(3,2)) 
  plot(boot, what = "pro") 
par(mfrow = c(1,1))

newdat <- cbind(dat, cluster = mod$classification)
a <-
  newdat%>%
  dplyr::select_if(is.numeric)%>%
  dplyr::group_by(cluster)%>%
  summarise_all(mean)%>%
  pivot_longer(-c("cluster"), names_to = "Vars", values_to = "mean")
b<-
as.data.frame(cbind(cluster=1:mod$G, t(summary(boot, what = "se")$mean)))%>%
   pivot_longer(., -c("cluster"), names_to = "Vars", values_to = "SE")
newdat<-dplyr::mutate(a, "SE" = b$SE)

ClustComp<-
ggplot(a, aes(x=Vars, y=mean, group=factor(cluster))) + 
  geom_line(aes(colour=factor(cluster)))+
  geom_point()+
  geom_errorbar(aes(ymin=mean-SE, ymax=mean+SE), width = .1)+
  ggtitle("Mean by cluster") +
    expand_limits(y=0) +                        # Expand y range
    scale_y_continuous(breaks=0:20*4) +         # Set tick every 4
    labs(x = "Var", y = "Cluster Average")+
    theme_bw() +
    theme(legend.justification=c(1,0),
          legend.position=c(1,0))               # Position legend in bottom right
RegCt<-
kable(newdat%>%
  dplyr::select(Region,cluster)%>%
  group_by(cluster,Region)%>%
  count())
StateCt<-
kable(newdat%>%
  dplyr::select(State,cluster)%>%
  group_by(cluster,State)%>%
  count())
LocCt<-
kable(newdat%>%
  dplyr::select(Loc,cluster)%>%
  group_by(cluster,Loc)%>%
  count())
newdat <- kable(newdat)
a <- kable(a)
output <- list(rand, modinfo, probs, newdat, a, RegCt, StateCt, LocCt, ClustComp)
return(output)
}

fitclust(dat= dat1, cols = 5:9, g=3, modname="EEV")

When running this, you will get an error that says Error: ColumnSEmust be length 35 (the number of rows) or one, not 30. Strangely, when trying this function with my real data, it works just fine with some combinations of columns(cols) and mixture components (g), although I haven't found a combination between that works in this reproduceable example. For instance, I have a data frame with 13 columns and 289 rows (columns 1:6 are factors, and columns 7:13 are numeric). Assuming that data frame is named dat1, the function works when I use fitclust(dat=dat1, cols = 7:13, g=5, modname="VEI"), but not when I use fitclust(dat=dat1, cols = 7:12, g=5, modname="VEI"). I believe that this issue has to do with the way I am transposing the matrices when the function creates a and b. How could I have written this differently to fix this bug? Also, when I extract the information for modinfo in the function, how could I retain the names associated with those list components (mod$parameters; pro, mean, sigma, scale, shape, and orientation), so that they are labeled in the output?

Ryan
  • 1,048
  • 7
  • 14
  • Your working and not working lines are exactly the same. Please advise and post actual error messages as *not working* is not specific. – Parfait Jun 15 '20 at 00:17
  • @Parfait apologies I fixed my typo and added the error message. – Ryan Jun 15 '20 at 12:08

0 Answers0