I have a data set of two courses in 2 different semesters that takes the following form:
set.seed(200)
sem <- sample(c("1", "2"), 200, replace = T)
course <- sample(c("1", "2"), 200, replace = T)
d.gender = sample(c(0, 1), 200, replace = T, prob = c(0.6, 0.4))
d.pass = sample(c(0, 1), 200, replace = T, prob = c(0.7, 0.3))
df <- data.frame(sem, course, d.gender, d.pass)
I'm trying to efficiently create a tbl of the 4 different sem,course combinations along with their total pass rate, the percentage of d.gender = 1, and finally the pass rates within those the 2 gender categories. I can make a table that provides all the values I need to calculate, but I know there's a more efficient way to calculate what I need without nesting a bunch of different group_by and summary functions, or making a whole bunch of different tbls and left_joining the columns I want. I can get what I need grinding away with indices and subset functions, but I'm hopeful that there is a better way to get a 4-row matrix with everything I need but it's ugly and takes forever, and it's easy to make mistakes in the code:
df1 <- df %>% group_by(sem, course, d.gender, d.pass) %>% summarize(total = n())
df1$total_pass <- rep(NA, dim(df1)[1])
df1$total_pass[1:4] <- sum(subset(df1, sem == "1" & course == "1" & d.pass == "1",
select = total))
df1$total_pass[5:8] <- sum(subset(df1, sem == "1" & course == "2" & d.pass == "1",
select = total))
df1$total_pass[9:12] <- sum(subset(df1, sem == "2" & course == "1" & d.pass == "1",
select = total))
df1$total_pass[13:16] <- sum(subset(df1, sem == "2" & course == "2" & d.pass == "1",
select = total))
df1$n_male <- rep(NA, dim(df1)[1])
df1$n_male[1:4] <- sum(subset(df1, sem == "1" & course == "1" & d.gender == "1",
select = total))
df1$n_male[5:8] <- sum(subset(df1, sem == "1" & course == "2" & d.gender == "1",
select = total))
df1$n_male[9:12] <- sum(subset(df1, sem == "2" & course == "1" & d.gender == "1",
select = total))
df1$n_male[13:16] <- sum(subset(df1, sem == "2" & course == "2" & d.gender == "1",
select = total))
df1$n_fem <- rep(NA, dim(df1)[1])
df1$n_fem[1:4] <- sum(subset(df1, sem == "1" & course == "1" & d.gender == "0", select = total))
df1$n_fem[5:8] <- sum(subset(df1, sem == "1" & course == "2" & d.gender == "0", select = total))
df1$n_fem[9:12] <- sum(subset(df1, sem == "2" & course == "1" & d.gender == "0",
select = total))
df1$n_fem[13:16] <- sum(subset(df1, sem == "2" & course == "2" & d.gender == "0",
select = total))
df1$pct_male <- rep(NA, dim(df1)[1])
df1$pct_male[1:4] <- df1$n_male[1:4]/sum(subset(df1, sem == "1" & course == "1",
select = total))
df1$pct_male[5:8] <- df1$n_male[5:8]/sum(subset(df1, sem == "1" & course == "2",
select = total))
df1$pct_male[9:12] <- df1$n_male[9:12]/sum(subset(df1, sem == "2" & course == "1",
select = total))
df1$pct_male[13:16] <- df1$n_male[13:16]/sum(subset(df1, sem == "2" & course == "2",
select = total))
df1$pct_fem <- rep(NA, dim(df1)[1])
df1$pct_fem <- 1 - df1$pct_male
df1$pct_pass <- rep(NA, dim(df1)[1])
df1$pct_pass[1:4] <- df1$total_pass[1:4]/sum(subset(df1, sem == "1" & course == "1",
select = total))
df1$pct_pass[5:8] <- df1$total_pass[5:8]/sum(subset(df1, sem == "1" & course == "2",
select = total))
df1$pct_pass[9:12] <- df1$total_pass[9:12]/sum(subset(df1, sem == "2" & course ==
"1", select = total))
df1$pct_pass[13:16] <- df1$total_pass[13:16]/sum(subset(df1, sem == "2" & course ==
"2", select = total))
df1$male_pass_pct <- rep(NA, dim(df1)[1])
df1$male_pass_pct[1:4] <- subset(df1, sem == "1" & course == "1" & d.gender == "1" &
d.pass == "1", select = total)/df1$n_male[1:4]
df1$male_pass_pct[5:8] <- subset(df1, sem == "1" & course == "2" & d.gender == "1" &
d.pass == "1", select = total)/df1$n_male[5:8]
df1$male_pass_pct[9:12] <- subset(df1, sem == "2" & course == "1" & d.gender == "1" &
d.pass == "1", select = total)/df1$n_male[9:12]
df1$male_pass_pct[13:16] <- subset(df1, sem == "2" & course == "2" & d.gender ==
"1" & d.pass == "1", select = total)/df1$n_male[13:16]
df1$fem_pass_pct <- rep(NA, dim(df1)[1])
df1$fem_pass_pct[1:4] <- subset(df1, sem == "1" & course == "1" & d.gender == "0" &
d.pass == "1", select = total)/df1$n_fem[1:4]
df1$fem_pass_pct[5:8] <- subset(df1, sem == "1" & course == "2" & d.gender == "0" &
d.pass == "1", select = total)/df1$n_fem[5:8]
df1$fem_pass_pct[9:12] <- subset(df1, sem == "2" & course == "1" & d.gender == "0" &
d.pass == "1", select = total)/df1$n_fem[9:12]
df1$fem_pass_pct[13:16] <- subset(df1, sem == "2" & course == "2" & d.gender == "0" &
d.pass == "1", select = total)/df1$n_fem[13:16]
df2 <- df1 %>%
group_by(sem, course) %>%
summarize(total_pass = first(total_pass),
pct_pass = first(pct_pass),
n_male = first(n_male),
n_fem = first(n_fem),
pct_male = first(pct_male),
pct_fem = first(pct_fem),
male_pass_pct = first(male_pass_pct),
fem_pass_pct = first(fem_pass_pct))
df2 <- unique(df1[, c(1, 2, 6, 11, 7:10, 12, 13)])
df2[, c(9, 10)] <- lapply(df2[, c(9, 10)], as.numeric)
that's really laborious for only needing measures for 4 rows, but I can't get it to work otherwise for this aggregation... Any help would be awesome