So I have this, kind of nonsense, example. I split up the iris
-dataframe into their species and try to predict if a observation is of species setosa
.
library(proc)
iris[["setosa"]] = ifelse(iris$Species == "setosa", TRUE, FALSE)
sample <- sample(c(TRUE, FALSE), nrow(iris), replace = T, prob = c(0.7, 0.3))
train = iris[sample, ]
test = iris[!sample, ]
I would then like to get the auroc for each of the models. I wanted to use a purrr:map
-approach, but I am a little confused about how I could do this. I have this, nonworking pseudo-like, code:
iris %>% split(., .$Species) %>%
map(~glm(setosa ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data =.x, family="binomial")) %>% {
tibble(
prediction = map_dbl(~predict(., newdata = test, type="response")),
referece = test$setosa
)
} %>%
map(~roc(.x$setosa ~ .x$prediction))
I think I am making multiple errors and maybe it would be better to do it all manually. But I think there must be a neat way to get this done with map
and its friends.
EDIT (better data)
Here is a little bit better example: I want to predict the trigger
-column using the variables p3
and p15
. I want to fit a model for each class of geology and get the auroc for the test dataset:
# the training data
structure(list(p3 = c(5, 0, 0, 2, 0, 0, 0, 2.8999999165535, 0,
0, 0, 0, 17.5999999046326, 0, 0, 0.899999976158142, 10.0000000149012,
1.79999995231628, 0, 8.80000019073486, 0, 0, 11.8999999761581,
0, 9.60000026226044, 5.19999980926514, 11.7000002861023, 20.0999999046326,
34.6000008583069, 0, 8.70000028610229, 33.8000000119209, 2.40000009536743,
17, 27, 36.7999992370605, 0, 15.8999997973442, 0.300000011920929,
7.69999980926514, 0, 0.899999976158142, 1.5, 0.300000011920929,
3.50000002980232, 51.3999991416931, 6.09999990463257, 0.400000005960464,
22, 65.3000020980835), p15 = c(34.2999999374151, 10.4999997392297,
8.30000010877848, 69.4999992623925, 1.30000001192093, 0, 62.3999992161989,
71.8999995738268, 32.6999994888902, 4, 0, 0.400000005960464,
35.699999935925, 24.1000001206994, 0, 53.8999998271465, 41.8999992236495,
37.8999994322658, 24.0999998524785, 65.5999999046326, 0, 0, 20.5000002905726,
75.4000002145767, 68.1999989748001, 45.9000007808208, 180.999998480082,
70.5000009462237, 112.099999666214, 11.3000001907349, 88.0999987274408,
103.499998867512, 100.399998664856, 59.9999995827675, 200.699998855591,
21.2999993562698, 47.1999997496605, 42.1999989748001, 58.6000000834465,
161.299998879433, 43.3999999314547, 110.899999141693, 73.9000004529953,
46.7999998703599, 25.7999995350838, 21.1000004559755, 86.100000500679,
15.8999998569489, 5.3999999538064, 143.399998903275), trigger = c(FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE),
geology = c("Sedimentary rocks", "Crystalline basement",
"Sedimentary rocks", "Sedimentary rocks", "Porphyry", "Porphyry",
"Porphyry", "Sedimentary rocks", "Sedimentary rocks", "Crystalline basement",
"Calcschists with ophiolites", "Crystalline basement", "Crystalline basement",
"Sedimentary rocks", "Porphyry", "Porphyry", "Crystalline basement",
"Crystalline basement", "Sedimentary rocks", "Crystalline basement",
"Sedimentary rocks", "Porphyry", "Crystalline basement",
"Crystalline basement", "Crystalline basement", "Crystalline basement",
"Sedimentary rocks", "Porphyry", "Sedimentary rocks", "Crystalline basement",
"Crystalline basement", "Crystalline basement", "Porphyry",
"Calcschists with ophiolites", "Crystalline basement", "Crystalline basement",
"Sedimentary rocks", "Porphyry", "Crystalline basement",
"Porphyry", "Crystalline basement", "Crystalline basement",
"Crystalline basement", "Crystalline basement", "Calcschists with ophiolites",
"Plutonite", "Crystalline basement", "Crystalline basement",
"Porphyry", "Sedimentary rocks")), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame"))
and the test data
# test data
# and the test data
structure(list(p3 = c(6.40000009536743, 0, 0, 16.3000003397465,
0, 0, 0, 0, 1, 1.5, 29.1000003814697, 7.49999982118607, 3.5,
18.3999996185303, 2.69999990612268, 11.3000001907349, 0, 2, 9.5,
0, 10.1999998092651, 0, 3.60000005364418, 0, 5.29999995231628,
112.599998474121, 118.099997758865, 54.8999996185303, 72.8000011444092,
79.9000015258789, 88.7000015377998, 0, 54.6000022888184, 144.599998474121,
111.200000762939, 7.10000009834766, 32.0999999046326, 0.5, 5.3999999165535,
0.300000011920929, 0, 36.7999982833862, 101.599998474121, 121.699998855591,
31.0999994277954, 66.8000020980835, 139.200000762939, 9.50000011920929,
135.300003051758, 110.900001525879), p15 = c(12.3999996185303,
63.8000009655952, 20.7000007629395, 121.299998179078, 10.4000001549721,
27.1999999880791, 49.5000003874302, 13.3000001907349, 31.3999998569489,
15.4000002890825, 64.3999997377396, 25.1000001430511, 43.6999994516373,
50.799999833107, 35.1999998092651, 35.1999998837709, 67.1000003442168,
19.400000333786, 49.300000667572, 21.3999996706843, 75.600000411272,
38.700000859797, 30.2999994754791, 14.9000003933907, 53.2000011727214,
137.900000333786, 0.100000001490116, 119.300001859665, 139.700000107288,
147.799997329712, 45.3000004068017, 56.5000000670552, 47.7999995946884,
2.90000009536743, 139.499999403954, 26.6999999284744, 6.5, 149.700001835823,
210.299998342991, 114.499999642372, 3.60000002384186, 60.099999524653,
97.5999984890223, 153.100000120699, 245.299996376038, 123.49999922514,
3.70000004768372, 90.5999985486269, 49.1000001132488, 138.599999785423
), trigger = c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE)), row.names = c(NA, -50L), groups = structure(list(
trigger = c(FALSE, TRUE), .rows = structure(list(1:25, 26:50), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
I then would do something like the above:
train %>% split(., .$geology) %>%
map(~glm(trigger ~ p3 + p15, data = .x)) %>% {
tibble(
prediction = map_dfr(~predict(.x, newdata=test, type="response")),
ref = test$trigger,
auroc = as.numeric(roc(ref ~ prediction, data = test)[[1]])
)
}
But this does not work at all...