0

Consider this data frame:

set.seed(123)
dat1 <- data.frame(Region = rep(c("r1","r2"), each = 100),
                   State = rep(c("NY","MA","FL","GA"), each = 10),
                   Loc = rep(c("a","b","c","d","e","f","g","h"),each = 5),
                   ID = rep(c(1:10), each = 2),
                   var1 = rnorm(200),
                   var2 = rnorm(200),
                   var3 = rnorm(200),
                   var4 = rnorm(200),
                   var5 = rnorm(200),
                   var6 = rnorm(200))
dat1$ID <- factor(dat1$ID)

I am using the mclust package to fit a mixture model and get bootstrapped standard errors as shown here:

library(tidyverse)
library(mclust)
set.seed(123)
mod <- Mclust(dat1[,5:10], G=5, modelNames = "VEI", initialization = list(hcPairs = randomPairs(dat1[,5:10], seed = 123)))
plot(mod, what = "classification")
plot(mod, what = "density")
boot <- MclustBootstrap(mod, nboot = 25, type = "bs")#25 for now to speed up

Next I want to create a new data frame that shows the mixture probabilities for each observation, so I combine the original dat1 with mod$z.

probs <- mod$z
colnames(probs) <- paste0("Prob", 1:mod$G)
probs <- cbind(dat1, probs)
probs <- cbind(probs, cluster = mod$classification)

Now I want collect: 1) the average value of each variable within each cluster, and 2) the standard error for each of those averages (which comes from summary(boot, what = "SE")$mean). I will use these to create the plot below, and return a table that shows the average and SE values.

newdat <- cbind(dat1, cluster = mod$classification)
a <-
  newdat%>%
  dplyr::select_if(is.numeric)%>%
  dplyr::group_by(cluster)%>%
  summarise_all(mean)%>%
  pivot_longer(-c("cluster"), names_to = "Vars", values_to = "mean")
b<-
as.data.frame(cbind(cluster=1:mod$G, t(summary(boot, what = "se")$mean)))%>%
   pivot_longer(., -c("cluster"), names_to = "Vars", values_to = "SE")
a<-dplyr::mutate(a, "SE" = b$SE)

ggplot(a, aes(x=Vars, y=mean, group=factor(cluster))) + 
  geom_line(aes(colour=factor(cluster)))+
  geom_point()+
  geom_errorbar(aes(ymin=mean-SE, ymax=mean+SE), width = .1)+
  ggtitle("Mean by cluster") +
    expand_limits(y=0) +                        
    scale_y_continuous(breaks=0:20*4) +         
    labs(x = "Var", y = "Cluster Average")+
    theme_bw() +
    theme(legend.justification=c(1,0),
          legend.position=c(1,0))
library(knitr)
kable(a)             

Ultimately I want to write a function that will perform each of these steps and return the output at once. It will be applied to data frames structured just like dat1, except they will have different numbers of var columns. I will also need to specify how many clusters to use and which model to use when mod is created. What is a better way for me to collect the information that is stored in a so that the same process can be applied to any combination of variables (var. columns) and mixture components (G specified in mod) from within a function?

Ryan
  • 1,048
  • 7
  • 14
  • 1
    Have you looked at the broom and tidymodels packages? I'm not totally clear on what you want to do / what part of the code needs to change, but those might help – camille Jun 15 '20 at 14:12
  • @camille I tried to provide the background of what I was doing and why, but maybe this makes my specific question less obvious. I will attempt to clarify. My question here concerns methods of extracting the means and standard errors from the objects `mod` and `boot` (the creation of object `a`). I essentially need a different way to create `a` without creating `b` because the matrix transposition used to create `b` is problematic with some combinations of variables and mixture components. – Ryan Jun 15 '20 at 14:37
  • FYI the tidymodels package may be a good route. I used `a<- data.frame(cbind(tidy(mod), t(summary(boot, what = "se")$mean)))` to get the information I need, now I just need to get it in the same format as `a` above – Ryan Jun 15 '20 at 14:53

1 Answers1

0

Maybe you can do something along these lines:

set.seed(123)
dat1 <- data.frame(Region = rep(c("r1","r2"), each = 100),
                   State = rep(c("NY","MA","FL","GA"), each = 10),
                   Loc = rep(c("a","b","c","d","e","f","g","h"),each = 5),
                   ID = rep(c(1:10), each = 2),
                   var1 = rnorm(200),
                   var2 = rnorm(200),
                   var3 = rnorm(200),
                   var4 = rnorm(200),
                   var5 = rnorm(200),
                   var6 = rnorm(200))
dat1$ID <- factor(dat1$ID)
library(ggplot2)
library(data.table)
library(mclust)
#> Package 'mclust' version 5.4.6
#> Type 'citation("mclust")' for citing this R package in publications.

clusterVars <- function(dat, col_idx, G=5, seed=123){
    mod <- Mclust(dat[, col_idx], G=G, modelNames = "VEI", 
        initialization = list(hcPairs = randomPairs(dat[, col_idx], seed = seed)))
    plot(mod, what = "classification")
    plot(mod, what = "density")
    boot <- MclustBootstrap(mod, nboot = 25, type = "bs")#25 for now to speed up
    mydat <- data.table(dat, mod$z, cluster = mod$classification)
    setnames(mydat, old= paste0("V", seq_len(G)), new=paste0("Prob", seq_len(G)))
    a <- mydat[,(mean = lapply(.SD, mean)), by = c("cluster"), 
                .SDcols=colnames(dat)[col_idx]]
    a <- setkey(melt(
        a, id.vars = "cluster", variable.name = "Vars", value.name = "mean"), 
        cluster, Vars)
    b <- setkey(melt(
        data.table(cluster = seq_len(G), t(summary(boot, what = "se")$mean)), 
        id.vars = "cluster", variable.name = "Vars", value.name = "SE"),
        cluster, Vars)
    out <- a[b]
    p <- ggplot(out, aes(x=Vars, y=mean, group=factor(cluster))) + 
        geom_line(aes(colour=factor(cluster)))+
        geom_point()+
        geom_errorbar(aes(ymin=mean-SE, ymax=mean+SE), width = .1)+
        ggtitle("Mean by cluster") +
        expand_limits(y=0) +                        
        scale_y_continuous(breaks=0:20*4) +         
        labs(x = "Var", y = "Cluster Average")+
        theme_bw() +
        theme(legend.justification=c(1,0),
              legend.position=c(1,0))
    list(plot = p, table = out)       
}

out <- clusterVars(dat1, col_idx = 5:10, G = 5)

plot(out$plot)

out$table
#>     cluster Vars         mean        SE
#>  1:       1 var1 -0.239846636 0.2286331
#>  2:       1 var2  0.496303497 0.4237945
#>  3:       1 var3  0.045489269 0.2392820
#>  4:       1 var4  0.035899698 0.3157938
#>  5:       1 var5  0.852212286 0.4683157
#>  6:       1 var6 -0.652884807 0.3000536
#>  7:       2 var1 -0.177353942 0.4626206
#>  8:       2 var2 -0.155370426 0.4879140
#>  9:       2 var3  0.760466010 0.4773900
#> 10:       2 var4  0.000245426 0.5210745
#> 11:       2 var5 -0.534355806 0.3960204
#> 12:       2 var6  0.023759375 0.3364228
#> 13:       3 var1  2.308773892 0.7389746
#> 14:       3 var2 -0.095760375 0.6652415
#> 15:       3 var3  1.538477219 0.6248455
#> 16:       3 var4  0.423748712 0.9034914
#> 17:       3 var5  0.526404192 0.4475889
#> 18:       3 var6  0.147645599 0.4308516
#> 19:       4 var1  0.223028905 0.5078972
#> 20:       4 var2 -0.028866278 0.2597171
#> 21:       4 var3 -1.005246422 0.5143203
#> 22:       4 var4 -0.084080497 0.5511872
#> 23:       4 var5 -0.194968718 0.3312450
#> 24:       4 var6  0.255504356 0.3616244
#> 25:       5 var1  0.023540106 0.4559344
#> 26:       5 var2 -0.572302120 0.5058384
#> 27:       5 var3  0.782263291 0.3809646
#> 28:       5 var4 -0.202702860 0.3913615
#> 29:       5 var5  0.061927990 1.0652221
#> 30:       5 var6  2.019626078 0.5999117
#>     cluster Vars         mean        SE

Created on 2020-06-15 by the reprex package (v0.3.0)

user12728748
  • 8,106
  • 2
  • 9
  • 14