Purrr map over splitted training-dataframe to get auroc for each model

Question

So I have this, kind of nonsense, example. I split up the iris-dataframe into their species and try to predict if a observation is of species setosa.

library(proc)

iris[["setosa"]] = ifelse(iris$Species == "setosa", TRUE, FALSE)
sample <- sample(c(TRUE, FALSE), nrow(iris), replace = T, prob = c(0.7, 0.3))
train = iris[sample, ]
test = iris[!sample, ]

I would then like to get the auroc for each of the models. I wanted to use a purrr:map-approach, but I am a little confused about how I could do this. I have this, nonworking pseudo-like, code:

iris %>% split(., .$Species) %>% 
  map(~glm(setosa ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data =.x, family="binomial")) %>% {
    tibble(
      prediction = map_dbl(~predict(., newdata = test, type="response")),
      referece = test$setosa
    )
  } %>% 
  map(~roc(.x$setosa ~ .x$prediction))

I think I am making multiple errors and maybe it would be better to do it all manually. But I think there must be a neat way to get this done with map and its friends.

EDIT (better data)

Here is a little bit better example: I want to predict the trigger-column using the variables p3 and p15. I want to fit a model for each class of geology and get the auroc for the test dataset:

# the training data
structure(list(p3 = c(5, 0, 0, 2, 0, 0, 0, 2.8999999165535, 0, 
0, 0, 0, 17.5999999046326, 0, 0, 0.899999976158142, 10.0000000149012, 
1.79999995231628, 0, 8.80000019073486, 0, 0, 11.8999999761581, 
0, 9.60000026226044, 5.19999980926514, 11.7000002861023, 20.0999999046326, 
34.6000008583069, 0, 8.70000028610229, 33.8000000119209, 2.40000009536743, 
17, 27, 36.7999992370605, 0, 15.8999997973442, 0.300000011920929, 
7.69999980926514, 0, 0.899999976158142, 1.5, 0.300000011920929, 
3.50000002980232, 51.3999991416931, 6.09999990463257, 0.400000005960464, 
22, 65.3000020980835), p15 = c(34.2999999374151, 10.4999997392297, 
8.30000010877848, 69.4999992623925, 1.30000001192093, 0, 62.3999992161989, 
71.8999995738268, 32.6999994888902, 4, 0, 0.400000005960464, 
35.699999935925, 24.1000001206994, 0, 53.8999998271465, 41.8999992236495, 
37.8999994322658, 24.0999998524785, 65.5999999046326, 0, 0, 20.5000002905726, 
75.4000002145767, 68.1999989748001, 45.9000007808208, 180.999998480082, 
70.5000009462237, 112.099999666214, 11.3000001907349, 88.0999987274408, 
103.499998867512, 100.399998664856, 59.9999995827675, 200.699998855591, 
21.2999993562698, 47.1999997496605, 42.1999989748001, 58.6000000834465, 
161.299998879433, 43.3999999314547, 110.899999141693, 73.9000004529953, 
46.7999998703599, 25.7999995350838, 21.1000004559755, 86.100000500679, 
15.8999998569489, 5.3999999538064, 143.399998903275), trigger = c(FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE), 
    geology = c("Sedimentary rocks", "Crystalline basement", 
    "Sedimentary rocks", "Sedimentary rocks", "Porphyry", "Porphyry", 
    "Porphyry", "Sedimentary rocks", "Sedimentary rocks", "Crystalline basement", 
    "Calcschists with ophiolites", "Crystalline basement", "Crystalline basement", 
    "Sedimentary rocks", "Porphyry", "Porphyry", "Crystalline basement", 
    "Crystalline basement", "Sedimentary rocks", "Crystalline basement", 
    "Sedimentary rocks", "Porphyry", "Crystalline basement", 
    "Crystalline basement", "Crystalline basement", "Crystalline basement", 
    "Sedimentary rocks", "Porphyry", "Sedimentary rocks", "Crystalline basement", 
    "Crystalline basement", "Crystalline basement", "Porphyry", 
    "Calcschists with ophiolites", "Crystalline basement", "Crystalline basement", 
    "Sedimentary rocks", "Porphyry", "Crystalline basement", 
    "Porphyry", "Crystalline basement", "Crystalline basement", 
    "Crystalline basement", "Crystalline basement", "Calcschists with ophiolites", 
    "Plutonite", "Crystalline basement", "Crystalline basement", 
    "Porphyry", "Sedimentary rocks")), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame"))

and the test data

# test data

# and the test data

structure(list(p3 = c(6.40000009536743, 0, 0, 16.3000003397465, 
0, 0, 0, 0, 1, 1.5, 29.1000003814697, 7.49999982118607, 3.5, 
18.3999996185303, 2.69999990612268, 11.3000001907349, 0, 2, 9.5, 
0, 10.1999998092651, 0, 3.60000005364418, 0, 5.29999995231628, 
112.599998474121, 118.099997758865, 54.8999996185303, 72.8000011444092, 
79.9000015258789, 88.7000015377998, 0, 54.6000022888184, 144.599998474121, 
111.200000762939, 7.10000009834766, 32.0999999046326, 0.5, 5.3999999165535, 
0.300000011920929, 0, 36.7999982833862, 101.599998474121, 121.699998855591, 
31.0999994277954, 66.8000020980835, 139.200000762939, 9.50000011920929, 
135.300003051758, 110.900001525879), p15 = c(12.3999996185303, 
63.8000009655952, 20.7000007629395, 121.299998179078, 10.4000001549721, 
27.1999999880791, 49.5000003874302, 13.3000001907349, 31.3999998569489, 
15.4000002890825, 64.3999997377396, 25.1000001430511, 43.6999994516373, 
50.799999833107, 35.1999998092651, 35.1999998837709, 67.1000003442168, 
19.400000333786, 49.300000667572, 21.3999996706843, 75.600000411272, 
38.700000859797, 30.2999994754791, 14.9000003933907, 53.2000011727214, 
137.900000333786, 0.100000001490116, 119.300001859665, 139.700000107288, 
147.799997329712, 45.3000004068017, 56.5000000670552, 47.7999995946884, 
2.90000009536743, 139.499999403954, 26.6999999284744, 6.5, 149.700001835823, 
210.299998342991, 114.499999642372, 3.60000002384186, 60.099999524653, 
97.5999984890223, 153.100000120699, 245.299996376038, 123.49999922514, 
3.70000004768372, 90.5999985486269, 49.1000001132488, 138.599999785423
), trigger = c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, 
TRUE, TRUE, TRUE)), row.names = c(NA, -50L), groups = structure(list(
    trigger = c(FALSE, TRUE), .rows = structure(list(1:25, 26:50), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

I then would do something like the above:


train %>% split(., .$geology) %>%
  map(~glm(trigger ~ p3 + p15, data = .x)) %>% {
    tibble(
      prediction = map_dfr(~predict(.x, newdata=test, type="response")),
      ref = test$trigger,
      auroc = as.numeric(roc(ref ~ prediction, data = test)[[1]])
    )
}

But this does not work at all...

would `group_map` be an option for you? Instead of splitting and mapping? — c0bra, Jun 25 '21 at 15:29
What keeps me wondering about this question: you want to calculate a model over each group in `Species` with `setosa` as outcome variable. The model won't work in neither group because there is no variation in your outcome variable. Maybe you can provide a better example, because at the moment this does not make sense. — TimTeaFan, Jun 26 '21 at 11:12
thats totally true.... Sorry about that:/ I will come up with a better example. Would it then be better to edit the question or to make a new question? — Lenn, Jun 27 '21 at 11:37

score 1 · Answer 1 · answered Jun 28 '21 at 16:11

I would suggest the following code:

train %>% group_by(geology) %>%
  group_modify( function(x, y) {
    model <- glm(trigger ~ p3 + p15, data=x)
    tibble(
      prediction = predict(model, newdata=test, type="response"),
      ref = test$trigger,
      auroc = as.numeric(pROC::roc(ref ~ prediction, data = test)[[1]])
    )
  })

I think it would make more sense to not combine prediction data with KPIs for prediction quality. Something like this:

train %>% group_by(geology) %>% 
  group_modify( function(x, y) {
    model <- glm(trigger ~ p3 + p15, data=x)
    prediction = predict(model, newdata=test, type="response")
    tibble(
      auroc = as.numeric(pROC::roc(test$trigger ~ prediction, data = test)[[1]])
    )
  })

Purrr map over splitted training-dataframe to get auroc for each model

1 Answers1

Linked