Perform Chi Square Tests on Multiple Columns from the Same Data Frame

Question

I am trying to write a function that will group the columns I have in an existing data frame and conduct chi square tests on each column's associated matrix/contingency table and then report the p-values for each test. I have been trying to mimic the approach used here, but I am finding that my M values are not being formatted into matrices like I think I need them to. I am not sure if it is because I have more columns than the example in the above link had or if I am just missing something, but here is a sample of the data structure I have so far:

require(lubridate)

structure(list(ResponseID = c("R_2fpKxLYlxAoplxP", "R_enci4Hwwee9XLSp", 
"R_332X6CmsgY6RE5s", "R_3GAI7CSx4a74LVp", "R_2QXRRBh4UCFoHDl", 
"R_3gSKU8piHOKWf9E"), region = structure(c(1L, 1L, 1L, 1L, 1L, 
1L), .Label = c("Lakeland", "Macon SE", "Other"), class = "factor"), 
    InCalls_Qrtl = structure(c(7L, 7L, 7L, 7L, 7L, 7L), .Label = c("NA", 
    "No EDGE Calls", "Bottom Quartile", "Second Quartile", "Third Quartile", 
    "Top Quartile", "Missing"), class = "factor"), InAHT_Qrtl = structure(c(7L, 
    7L, 7L, 7L, 7L, 7L), .Label = c("NA", "No EDGE Calls", "Bottom Quartile", 
    "Second Quartile", "Third Quartile", "Top Quartile", "Missing"
    ), class = "factor"), InHold_Qrtl = structure(c(7L, 7L, 7L, 
    7L, 7L, 7L), .Label = c("NA", "No EDGE Calls", "Bottom Quartile", 
    "Second Quartile", "Third Quartile", "Top Quartile", "Missing"
    ), class = "factor"), Overall_Tenure_Period = new("Period", 
        .Data = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
        NA_real_), year = c(NA_real_, NA_real_, NA_real_, NA_real_, 
        NA_real_, NA_real_), month = c(NA_real_, NA_real_, NA_real_, 
        NA_real_, NA_real_, NA_real_), day = c(NA_real_, NA_real_, 
        NA_real_, NA_real_, NA_real_, NA_real_), hour = c(NA_real_, 
        NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), minute = c(NA_real_, 
        NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), Recent_Tenure_Period = new("Period", 
        .Data = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
        NA_real_), year = c(NA_real_, NA_real_, NA_real_, NA_real_, 
        NA_real_, NA_real_), month = c(NA_real_, NA_real_, NA_real_, 
        NA_real_, NA_real_, NA_real_), day = c(NA_real_, NA_real_, 
        NA_real_, NA_real_, NA_real_, NA_real_), hour = c(NA_real_, 
        NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), minute = c(NA_real_, 
        NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), VOA_Overall_Tenure_Orig = structure(c(9L, 
    9L, 9L, 9L, 9L, 9L), .Label = c("< 1 Year", "1 Year - <2 Years", 
    "2-5 Years", "6-10 Years", "11-15 Years", "16-20 Years", 
    "21-25 Years", "26 Years or Longer", "Missing"), class = "factor"), 
    VOA_Overall_Tenure_Mod = structure(c(5L, 5L, 5L, 5L, 5L, 
    5L), .Label = c("<2 Years", "2-5 Years", "6-10 Years", ">10 Years", 
    "Missing"), class = "factor"), VOA_Recent_Tenure_Orig = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_), VOA_Recent_Tenure_Mod = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_), Region = c("Lakeland", "Lakeland", "Lakeland", 
    "Lakeland", "Lakeland", "Lakeland"), Tenure_Code_Least = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), Tenure_Code_Most = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), Tenure_Code_Split = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Least = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Top = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Top2 = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Least = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Top = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Top2 = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Least = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Top = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Top2 = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), Question = c("Overall, how satisfied are you with using EDGE?", 
    "Overall, how satisfied are you with using EDGE?", "Overall, how satisfied are you with using EDGE?", 
    "Overall, how satisfied are you with using EDGE?", "Overall, how satisfied are you with using EDGE?", 
    "Overall, how satisfied are you with using EDGE?"), Answer = c("Slightly Satisfied", 
    "Slightly Satisfied", "Slightly Satisfied", "Dissatisfied", 
    "Completely Dissatisfied", "Slightly Dissatisfied"), Answer_TopBox = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), Answer_Top2 = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), Answer_GenSat = c("GenSat", 
    "GenSat", "GenSat", "Rest", "Rest", "Rest"), Answer_Bottom = c("Rest", 
    "Rest", "Rest", "Rest", "Bottom", "Rest"), Answer_Bottom2 = c("Rest", 
    "Rest", "Rest", "Bottom2", "Rest", "Rest"), Answer_GenDissat = c("Rest", 
    "Rest", "Rest", "GenDissat", "GenDissat", "GenDissat")), row.names = c(NA, 
6L), class = "data.frame")

I then attempted to recreate the example by doing the following:

# Attempt at building one contingency matrix per group: count rows for every
# combination of Region, the segment columns, Question and Answer_TopBox, then
# nest the counts and reshape each nested piece into a matrix stored in `M`.
top_score_tests_agent <- as.data.frame(agent_data_clean_coded %>%
  group_by(Region, Tenure_Code_Least, Tenure_Code_Most, Tenure_Code_Split,InCalls_Least, InCalls_Top, InCalls_Top2, InAHT_Least, 
           InAHT_Top, InAHT_Top2, InHold_Least, InHold_Top, InHold_Top2, Question, Answer_TopBox) %>%
  # `freq` is the row count per combination. The second group_by() keeps every
  # key except Region and Answer_TopBox, so nest() leaves only those two
  # columns (plus freq) inside each `data` element.
  # NOTE(review): because all segment columns remain grouping keys, each nested
  # group can shrink to a single cell -- presumably why M[[1]] is 1x1; confirm.
  summarise(freq = n())) %>% group_by(Tenure_Code_Least, Tenure_Code_Most, Tenure_Code_Split,InCalls_Least, InCalls_Top, InCalls_Top2, InAHT_Least, 
                                      InAHT_Top, InAHT_Top2, InHold_Least, InHold_Top, InHold_Top2, Question) %>%
  nest() %>%
  mutate(M = map(data, function(dat){
    # Widen: one column per Region value, filled with the counts.
    dat2 <- dat %>% spread(Region, freq)
    # Drop the first column (presumably Answer_TopBox -- TODO confirm) and keep
    # the remaining columns as the matrix body.
    M <- as.matrix(dat2[, -1])
    row.names(M) <- dat2$Answer_TopBox
    return(M)
  }))

But I find that if I try to check the matrix-creation by checking the top_score_tests_agent$M[[1]] result, I get the following output:

structure(4L, .Dim = c(1L, 1L), .Dimnames = list(NULL, "Top Box"))

I am just wondering if anyone had any insights on what I am doing wrong that is preventing my matrices from being created or if anyone had any other approaches that they've used to do this?

Edit

I was able to use most of the code @Wietze314 wrote. For any future users who are interested, here is the final code:

# Final version: one chi-square test of Region against answer for every
# Question / answer-segment pair, with the p-value bucketed into a label.
result2 <- df %>%
  select(Region, Question, starts_with("Answer")) %>%
  # Long format: one row per original row and selected Answer* column.
  gather(segment, answer, -Region, -Question) %>%
  group_by(Question, segment) %>%
  nest() %>%
  mutate(
    # correct = FALSE switches off the continuity correction.
    test = map(data, function(seg) {
      chisq.test(seg$Region, seg$answer, correct = FALSE)
    }),
    p = map_dbl(test, "p.value"),
    # Conditions are checked top to bottom, so each band only needs its upper
    # bound. A missing p matches nothing and stays NA.
    Status = case_when(
      p <= 0.01 ~ "99% Sig Difference",
      p <= 0.05 ~ "95% Sig Difference",
      p <= 0.1 ~ "90% Sig Difference",
      p > 0.1 ~ "Not Significant"
    )
  ) %>%
  select(-data, -test)

Which gives me an output that looks like:

structure(list(Question = c("I feel comfortable 'trusting the system' with EDGE", 
"EDGE allows me to be more efficient", "The E-learning training (GU Courses)", 
"When I have questions about EDGE, I feel confident they will be answered", 
"Overall, the training I received prepared me to use EDGE", "Overall, how satisfied are you with using EDGE?", 
"The in-person, instructor-led training", "The formal training you received in EDGE", 
"EDGE allows me to be more efficient", "I feel comfortable 'trusting the system' with EDGE"
), Segment = c("Answer_GenDissat", "Answer_Bottom", "Answer_GenSat", 
"Answer_TopBox", "Answer_GenDissat", "Answer_TopBox", "Answer_Top2", 
"Answer_Top2", "Answer_Bottom2", "Answer_GenSat"), pvalue = c(0.231403084430793, 
0.299890413606335, 0.00108798852510237, 0.487810952072342, 0.131641662666334, 
0.31818165042123, 0.501077891603077, 0.634730681199174, 0.389259022098406, 
0.274277276570632), Status = c("Not Significant", "Not Significant", 
"99% Sig Difference", "Not Significant", "Not Significant", "Not Significant", 
"Not Significant", "Not Significant", "Not Significant", "Not Significant"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-10L))

From your data and question it is not really clear which variables you are trying to compare with a chisq.test and in which subset(s) of the data. The example you linked uses a long data format, and compares variables `allele` and `sex` for each `ecotype` and `contigID` combination. — Wietze314, Sep 17 '19 at 18:33
I don't understand the question. That structure object _is_ a matrix. `is.matrix( structure(4L, .Dim = c(1L, 1L), .Dimnames = list(NULL, "Top Box")) ) #[1] TRUE` — IRTFM, Sep 17 '19 at 19:01
Furthermore, an effort to assign that larger structure to an object name fails with: `Error in getClass(Class, where = topenv(parent.frame())) : “Period” is not a defined class` — IRTFM, Sep 17 '19 at 19:05
Sorry, so the variables I am trying to compare are the Region/Tenure/InCall/InAHT/InHold variables against the different Answer variables. For example, the Region field should be grouped into 2x2 matrices for the Answer_TopBox, Answer_Top2, Answer_GenSat, etc. so that I would end up with 6 matrices in total that I can use to conduct the chi sq test. — CPG, Sep 17 '19 at 19:08
@Wietze314 Perhaps you should edit the question to make it reproducible. — IRTFM, Sep 17 '19 at 21:21

score 0 · Accepted Answer · answered Sep 17 '19 at 19:33

I think I understand what the goal is. I duplicated the dataset, since there is only one value in Region.

require(tidyverse)

# Stack a copy of the data relabelled as Region "other" underneath the
# original rows, so that Region has more than one value to compare.
df <- bind_rows(
  agent_data_clean_coded,
  mutate(agent_data_clean_coded, Region = "other")
)

# For each Answer* column, build a matrix of counts with one row per answer
# value and one column per Region, stored in the list-column `M`.
result <- df %>%
  select(Region, starts_with("Answer")) %>%
  gather(question, answer, -Region) %>%
  group_by(question) %>%
  nest() %>%
  mutate(M = map(data, function(rows) {
    # Count rows per Region / answer combination.
    counts <- rows %>%
      group_by(Region, answer) %>%
      summarise(freq = n())
    # Widen so each Region becomes its own column of counts.
    wide <- spread(counts, Region, freq)
    # Everything except the first column forms the matrix body; the answer
    # values become the row names.
    tab <- as.matrix(wide[, -1])
    row.names(tab) <- wide$answer
    tab
  }))

I am used to tackling this in a different way. For this option I also excluded both Answer_Top variables, since they contain only one level; otherwise chisq.test will give an error. In this case I use the raw data with chisq.test instead of a contingency table.

# Alternative: run chisq.test() directly on the raw Region / answer vectors
# of each Answer* column instead of building a contingency matrix first.
result2 <- df %>%
  # Answer columns whose name contains "Top" are excluded up front.
  select(Region, starts_with("Answer"), -contains("Top")) %>%
  gather(question, answer, -Region) %>%
  group_by(question) %>%
  nest() %>%
  mutate(
    test = map(data, function(rows) chisq.test(rows$Region, rows$answer)),
    # Pull the p-value out of each test result as a plain numeric column.
    p = map_dbl(test, "p.value")
  )

Thanks @Wietze314! This got me about 90% of the way to where I wanted it; I'll add my final code snippet/output to the original question. And thanks for also showing me the second way; definitely looks cleaner than the way I was first trying to do it. — CPG, Sep 18 '19 at 12:15

Perform Chi Square Tests on Multiple Columns from the Same Data Frame

Edit

1 Answer