I am trying to write a function that groups the columns of an existing data frame, runs a chi-square test on each column's associated contingency table (matrix), and then reports the p-value for each test. I have been trying to mimic the approach used here, but my M values are not being formatted into matrices the way I think they need to be. I am not sure whether that is because I have more columns than the example in the above link or because I am simply missing something. Here is a sample of the data structure I have so far:
require(lubridate)
# Sample of the input data in dput() form: a six-row data.frame with one row
# per survey response. Columns: ResponseID, the region / quartile factors,
# two tenure columns of S4 class "Period" (all NA in this sample), the
# VOA tenure fields, the character segment codes (Tenure_Code_*, InCalls_*,
# InAHT_*, InHold_*), and the survey Question / Answer together with the
# derived Answer_* flags.
# NOTE(review): new("Period", ...) presumably relies on the lubridate package
# being attached first - confirm.
structure(list(ResponseID = c("R_2fpKxLYlxAoplxP", "R_enci4Hwwee9XLSp",
"R_332X6CmsgY6RE5s", "R_3GAI7CSx4a74LVp", "R_2QXRRBh4UCFoHDl",
"R_3gSKU8piHOKWf9E"), region = structure(c(1L, 1L, 1L, 1L, 1L,
1L), .Label = c("Lakeland", "Macon SE", "Other"), class = "factor"),
InCalls_Qrtl = structure(c(7L, 7L, 7L, 7L, 7L, 7L), .Label = c("NA",
"No EDGE Calls", "Bottom Quartile", "Second Quartile", "Third Quartile",
"Top Quartile", "Missing"), class = "factor"), InAHT_Qrtl = structure(c(7L,
7L, 7L, 7L, 7L, 7L), .Label = c("NA", "No EDGE Calls", "Bottom Quartile",
"Second Quartile", "Third Quartile", "Top Quartile", "Missing"
), class = "factor"), InHold_Qrtl = structure(c(7L, 7L, 7L,
7L, 7L, 7L), .Label = c("NA", "No EDGE Calls", "Bottom Quartile",
"Second Quartile", "Third Quartile", "Top Quartile", "Missing"
), class = "factor"), Overall_Tenure_Period = new("Period",
.Data = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), year = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), month = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), day = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), hour = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), minute = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), Recent_Tenure_Period = new("Period",
.Data = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), year = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), month = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), day = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), hour = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), minute = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), VOA_Overall_Tenure_Orig = structure(c(9L,
9L, 9L, 9L, 9L, 9L), .Label = c("< 1 Year", "1 Year - <2 Years",
"2-5 Years", "6-10 Years", "11-15 Years", "16-20 Years",
"21-25 Years", "26 Years or Longer", "Missing"), class = "factor"),
VOA_Overall_Tenure_Mod = structure(c(5L, 5L, 5L, 5L, 5L,
5L), .Label = c("<2 Years", "2-5 Years", "6-10 Years", ">10 Years",
"Missing"), class = "factor"), VOA_Recent_Tenure_Orig = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), VOA_Recent_Tenure_Mod = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), Region = c("Lakeland", "Lakeland", "Lakeland",
"Lakeland", "Lakeland", "Lakeland"), Tenure_Code_Least = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Tenure_Code_Most = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Tenure_Code_Split = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Least = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Top = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Top2 = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Least = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Top = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Top2 = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Least = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Top = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Top2 = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Question = c("Overall, how satisfied are you with using EDGE?",
"Overall, how satisfied are you with using EDGE?", "Overall, how satisfied are you with using EDGE?",
"Overall, how satisfied are you with using EDGE?", "Overall, how satisfied are you with using EDGE?",
"Overall, how satisfied are you with using EDGE?"), Answer = c("Slightly Satisfied",
"Slightly Satisfied", "Slightly Satisfied", "Dissatisfied",
"Completely Dissatisfied", "Slightly Dissatisfied"), Answer_TopBox = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Answer_Top2 = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Answer_GenSat = c("GenSat",
"GenSat", "GenSat", "Rest", "Rest", "Rest"), Answer_Bottom = c("Rest",
"Rest", "Rest", "Rest", "Bottom", "Rest"), Answer_Bottom2 = c("Rest",
"Rest", "Rest", "Bottom2", "Rest", "Rest"), Answer_GenDissat = c("Rest",
"Rest", "Rest", "GenDissat", "GenDissat", "GenDissat")), row.names = c(NA,
6L), class = "data.frame")
I then attempted to recreate the example by doing the following:
# Build one contingency matrix per Question within each combination of
# segment codes: rows are the Answer_TopBox levels, columns are the Regions,
# and cells hold respondent counts. The matrices are stored in the
# list-column `M`, next to the nested `data` they were built from.
#
# NOTE(review): the nesting step groups by every segment column at once, so
# each nested data frame only contains the rows for one exact combination of
# segment values. With sparse data this can yield very small (even 1 x 1)
# matrices - confirm this is the intended grouping.
top_score_tests_agent <- as.data.frame(
  agent_data_clean_coded %>%
    group_by(
      Region, Tenure_Code_Least, Tenure_Code_Most, Tenure_Code_Split,
      InCalls_Least, InCalls_Top, InCalls_Top2, InAHT_Least,
      InAHT_Top, InAHT_Top2, InHold_Least, InHold_Top, InHold_Top2,
      Question, Answer_TopBox
    ) %>%
    # One row per Region x segment codes x Question x Answer_TopBox,
    # carrying the number of responses in `freq`.
    summarise(freq = n())
) %>%
  group_by(
    Tenure_Code_Least, Tenure_Code_Most, Tenure_Code_Split,
    InCalls_Least, InCalls_Top, InCalls_Top2, InAHT_Least,
    InAHT_Top, InAHT_Top2, InHold_Least, InHold_Top, InHold_Top2,
    Question
  ) %>%
  nest() %>%
  mutate(M = map(data, function(dat) {
    # Regions become columns. A Region/answer combination that never occurs
    # is a genuine zero count, so fill with 0 instead of the default NA
    # (chisq.test() refuses matrices that contain NA).
    dat2 <- dat %>% spread(Region, freq, fill = 0)
    # Drop the label column by name rather than by position so the matrix
    # holds only the count columns regardless of column order.
    count_cols <- setdiff(names(dat2), "Answer_TopBox")
    M <- as.matrix(dat2[, count_cols, drop = FALSE])
    row.names(M) <- dat2$Answer_TopBox
    M
  }))
But when I check the matrix creation by inspecting top_score_tests_agent$M[[1]], I get the following output:
structure(4L, .Dim = c(1L, 1L), .Dimnames = list(NULL, "Top Box"))
Does anyone have any insight into what I am doing wrong that prevents my matrices from being created, or know of another approach to do this?
Edit
I was able to use most of the code @Wietze314 wrote; for any future users interested, here is the final code:
# For every Question and every column whose name starts with "Answer", run a
# chi-square test (without continuity correction) of Region against that
# column's values, extract the p-value, and label it with a significance
# band. The nested data and the test objects are dropped at the end.
result2 <- df %>%
  select(Region, Question, starts_with("Answer")) %>%
  # Long format: the selected Answer* columns are stacked into
  # `segment` (column name) / `answer` (value) pairs.
  gather(segment, answer, -Region, -Question) %>%
  group_by(Question, segment) %>%
  nest() %>%
  mutate(
    test = map(data, function(d) {
      chisq.test(d$Region, d$answer, correct = FALSE)
    })
  ) %>%
  mutate(
    p = map_dbl(test, function(tst) pluck(tst, "p.value")),
    # Each inner ifelse() is only consulted when the outer test is FALSE,
    # so the lower bound of each band is already implied.
    Status = ifelse(
      p <= 0.01,
      "99% Sig Difference",
      ifelse(
        p <= 0.05,
        "95% Sig Difference",
        ifelse(p <= 0.1, "90% Sig Difference", "Not Significant")
      )
    )
  ) %>%
  select(-data, -test)
Which gives me an output that looks like:
# Example result in dput() form: a 10-row tibble with one row per
# Question / Segment pair, holding the test's p-value (`pvalue`) and its
# significance label (`Status`).
# NOTE(review): the pipeline shown before this output names these columns
# `segment` and `p`; presumably they were renamed before this dput - confirm.
structure(list(Question = c("I feel comfortable 'trusting the system' with EDGE",
"EDGE allows me to be more efficient", "The E-learning training (GU Courses)",
"When I have questions about EDGE, I feel confident they will be answered",
"Overall, the training I received prepared me to use EDGE", "Overall, how satisfied are you with using EDGE?",
"The in-person, instructor-led training", "The formal training you received in EDGE",
"EDGE allows me to be more efficient", "I feel comfortable 'trusting the system' with EDGE"
), Segment = c("Answer_GenDissat", "Answer_Bottom", "Answer_GenSat",
"Answer_TopBox", "Answer_GenDissat", "Answer_TopBox", "Answer_Top2",
"Answer_Top2", "Answer_Bottom2", "Answer_GenSat"), pvalue = c(0.231403084430793,
0.299890413606335, 0.00108798852510237, 0.487810952072342, 0.131641662666334,
0.31818165042123, 0.501077891603077, 0.634730681199174, 0.389259022098406,
0.274277276570632), Status = c("Not Significant", "Not Significant",
"99% Sig Difference", "Not Significant", "Not Significant", "Not Significant",
"Not Significant", "Not Significant", "Not Significant", "Not Significant"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-10L))