1

I am creating a summary statistics table about health survey data from a sample across 4 different regions (West, Midwest, Northeast, and South), coded as Regions 1-4. Right now I am able to create a table which shows the statistics in each region, but I would also like to add an additional column which shows what the overall average or median is for all individuals in the sample. How can I do that? I have included the code I've completed thus far. Thank you!

#the data frame 
structure(list(AGE = c(40L, 23L, 24L, 18L, 30L, 33L, 32L, 63L, 
22L, 24L), FAMSIZE = c(2L, 2L, 2L, 3L, 2L, 6L, 2L, 1L, 2L, 1L
), HYPERTEN = c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0), ALC = c(0, 2, 
3, 0, 2, 0, 3, 0, 2, 2), region_group = c("Region 4", "Region 3", 
"Region 4", "Region 3", "Region 1", "Region 2", "Region 1", "Region 2", 
"Region 4", "Region 4"), PSU = c(2L, 1L, 2L, 2L, 2L, 1L, 2L, 
2L, 1L, 2L), PERWEIGHT_MERGE = c(31.2, 615.2, 37.6, 1626, 44, 
149.8, 745.2, 984.2, 1512, 399.6), SAMPWEIGHT_MERGE = c(65, 860.4, 
94.4, 9146, 170.8, 310.4, 755.2, 1053.4, 3964.4, 706.2), STRATA = c(6296L, 
6165L, 6296L, 6224L, 6045L, 6083L, 6029L, 6073L, 6287L, 6247L
)), row.names = c(NA, 10L), class = "data.frame") 

#Code for the table 
out1<-sample_survey %>% 
  group_by(region_group) %>% 
  summarise("Number of drinks (mean)"=survey_mean(ALC),
            "Number of drinks (median)"=survey_median(ALC),"Hypertension"=survey_mean(HYPERTEN), "Family 
            Size"=survey_mean(FAMSIZE), "Age"=survey_median(AGE))

out1=t(out1)
out1 

                             [,1]         [,2]         [,3]         [,4]        
region_group                 "Region 1"   "Region 2"   "Region 3"   "Region 4"  
Number of drinks (mean)      "1.663778"   "2.131566"   "1.744107"   "2.009594"  
Number of drinks (mean)_se   "0.1375124"  "0.1245772"  "0.0957500"  "0.1199982" 
Number of drinks (median)    "1"          "2"          "1"          "2"         
Number of drinks (median)_se "0.0000000"  "0.2531528"  "0.0000000"  "0.2533324" 
Hypertension                 "0.1340147"  "0.1685102"  "0.1834528"  "0.1225418" 
Hypertension_se              "0.01623974" "0.01529678" "0.01463019" "0.01475651"
Family \n            Size    "3.121062"   "2.883905"   "3.107202"   "3.265012"  
Family \n            Size_se "0.11668906" "0.07435704" "0.08004129" "0.11138869"
Age                          "30"         "27"         "30"         "28"        
Age_se                       "1.3615690"  "1.0126110"  "0.7616152"  "0.7599972"  
juliah0494
  • 175
  • 11

2 Answers2

2

While I'm quite sure that there are some packages to achieve this out of the box one approach to achieve this would be to repeat the summary for the whole dataset and bind the tables together before transposing. To this end and to reduce code duplication I put the summarise code in a helper function. Try this:

library(dplyr)
library(srvyr)

# Helper function
mysum <- function(d) {
  d %>%
    summarise(
      "Number of drinks (mean)" = survey_mean(ALC),
      "Number of drinks (median)" = survey_median(ALC),
      "Hypertension" = survey_mean(HYPERTEN), 
      "Family Size" = survey_mean(FAMSIZE), 
      "Age" = survey_median(AGE)
    )
}

sample_survey <- sample_survey %>%
  as_survey_design(strata = region_group)

#Code for the table 
out1<-sample_survey %>% 
  group_by(region_group) %>% 
  mysum()

out2<-sample_survey %>% 
  mysum() %>% 
  mutate(region_group = "Total")

bind_rows(out1, out2) %>% 
  t()
#>                              [,1]         [,2]         [,3]        
#> region_group                 "Region 1"   "Region 2"   "Region 3"  
#> Number of drinks (mean)      "2.50"       "0.00"       "1.00"      
#> Number of drinks (mean)_se   "0.5000000"  "0.0000000"  "1.0000000" 
#> Number of drinks (median)    "2"          "0"          "0"         
#> Number of drinks (median)_se "0.03935085" "0.00000000" "0.07870171"
#> Hypertension                 "0.0"        "0.5"        "0.0"       
#> Hypertension_se              "0.0"        "0.5"        "0.0"       
#> Family Size                  "2.00"       "3.50"       "2.50"      
#> Family Size_se               "0.0000000"  "2.5000000"  "0.5000000" 
#> Age                          "30"         "33"         "18"        
#> Age_se                       "0.07870171" "1.18052560" "0.19675427"
#>                              [,4]         [,5]        
#> region_group                 "Region 4"   "Total"     
#> Number of drinks (mean)      "1.75"       "1.40"      
#> Number of drinks (mean)_se   "0.6291529"  "0.3366502" 
#> Number of drinks (median)    "2"          "2"         
#> Number of drinks (median)_se "0.47133552" "0.50276759"
#> Hypertension                 "0.0"        "0.1"       
#> Hypertension_se              "0.0"        "0.1"       
#> Family Size                  "1.75"       "2.30"      
#> Family Size_se               "0.2500000"  "0.5196152" 
#> Age                          "24"         "24"        
#> Age_se                       "2.82801315" "2.02169598"
stefan
  • 90,330
  • 6
  • 25
  • 51
1

You could also run the same calculations without including the group_by and then binding the new resulting column to the first table you made. (I had to go back and change survey_mean and survey_median to mean and median respectively, it was throwing me an error otherwise).

library(dplyr)
library(srvyr)
#the data frame 
sample_survey <- structure(list(AGE = c(40L, 23L, 24L, 18L, 30L, 33L, 32L, 63L, 22L, 24L), 
                       FAMSIZE = c(2L, 2L, 2L, 3L, 2L, 6L, 2L, 1L, 2L, 1L), 
                       HYPERTEN = c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0), 
                       ALC = c(0, 2, 3, 0, 2, 0, 3, 0, 2, 2), 
                       region_group = c("Region 4", "Region 3", "Region 4", "Region 3", 
                                        "Region 1", "Region 2", "Region 1", "Region 2", 
                                        "Region 4", "Region 4"), 
                       PSU = c(2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L), 
                       PERWEIGHT_MERGE = c(31.2, 615.2, 37.6, 1626, 44, 149.8, 745.2, 984.2, 1512, 399.6), 
                       SAMPWEIGHT_MERGE = c(65, 860.4, 94.4, 9146, 170.8, 310.4, 755.2, 1053.4, 3964.4, 706.2),
                       STRATA = c(6296L, 6165L, 6296L, 6224L, 6045L, 6083L, 6029L, 6073L, 6287L, 6247L )), row.names = c(NA, 10L), class = "data.frame") 

#Code for the table 
out1<-sample_survey %>% 
  group_by(region_group) %>% 
  summarise("Number of drinks (mean)"=mean(ALC),
            "Number of drinks (median)"=median(ALC),"Hypertension"=mean(HYPERTEN), "Family 
            Size"=mean(FAMSIZE), "Age"=median(AGE))

out1=t(out1)

#new code
out2 <- sample_survey %>%
  summarise("All Regions" = "All Regions",
            "Number of drinks (mean)"=mean(ALC),
            "Number of drinks (median)"=median(ALC),"Hypertension"=mean(HYPERTEN), "Family 
            Size"=mean(FAMSIZE), "Age"=median(AGE))
out2 = t(out2)
cbind(out1, out2)

> cbind(out1, out2)
                          [,1]       [,2]       [,3]       [,4]       [,5]         
region_group              "Region 1" "Region 2" "Region 3" "Region 4" "All Regions"
Number of drinks (mean)   "2.50"     "0.00"     "1.00"     "1.75"     "1.4"        
Number of drinks (median) "2.5"      "0.0"      "1.0"      "2.0"      "2"          
Hypertension              "0.0"      "0.5"      "0.0"      "0.0"      "0.1"        
Family \n            Size "2.00"     "3.50"     "2.50"     "1.75"     "2.3"        
Age                       "31.0"     "48.0"     "20.5"     "24.0"     "27"  
NovaEthos
  • 500
  • 2
  • 10