I asked this question already here (Purrr map over splitted training-dataframe to get auroc for each model), but the data was really bad and the question maybe a bit confusing. In addition I found a way to resolve this, but it does not feel like a good way.
So the issue is the following. I split a dataframe based on its geology
-column. I then want to predict the column trigger
using two other columns in the training dataframe. And I want to do this for each geology (each element - dataframe - of the list).
I then want to predict this model on a new testdataframe and compute the AUROC for each geology.
Here is the training data:
# the training data
structure(list(p3 = c(5, 0, 0, 2, 0, 0, 0, 2.8999999165535, 0,
0, 0, 0, 17.5999999046326, 0, 0, 0.899999976158142, 10.0000000149012,
1.79999995231628, 0, 8.80000019073486, 0, 0, 11.8999999761581,
0, 9.60000026226044, 5.19999980926514, 11.7000002861023, 20.0999999046326,
34.6000008583069, 0, 8.70000028610229, 33.8000000119209, 2.40000009536743,
17, 27, 36.7999992370605, 0, 15.8999997973442, 0.300000011920929,
7.69999980926514, 0, 0.899999976158142, 1.5, 0.300000011920929,
3.50000002980232, 51.3999991416931, 6.09999990463257, 0.400000005960464,
22, 65.3000020980835), p15 = c(34.2999999374151, 10.4999997392297,
8.30000010877848, 69.4999992623925, 1.30000001192093, 0, 62.3999992161989,
71.8999995738268, 32.6999994888902, 4, 0, 0.400000005960464,
35.699999935925, 24.1000001206994, 0, 53.8999998271465, 41.8999992236495,
37.8999994322658, 24.0999998524785, 65.5999999046326, 0, 0, 20.5000002905726,
75.4000002145767, 68.1999989748001, 45.9000007808208, 180.999998480082,
70.5000009462237, 112.099999666214, 11.3000001907349, 88.0999987274408,
103.499998867512, 100.399998664856, 59.9999995827675, 200.699998855591,
21.2999993562698, 47.1999997496605, 42.1999989748001, 58.6000000834465,
161.299998879433, 43.3999999314547, 110.899999141693, 73.9000004529953,
46.7999998703599, 25.7999995350838, 21.1000004559755, 86.100000500679,
15.8999998569489, 5.3999999538064, 143.399998903275), trigger = c(FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE),
geology = c("Sedimentary rocks", "Crystalline basement",
"Sedimentary rocks", "Sedimentary rocks", "Porphyry", "Porphyry",
"Porphyry", "Sedimentary rocks", "Sedimentary rocks", "Crystalline basement",
"Calcschists with ophiolites", "Crystalline basement", "Crystalline basement",
"Sedimentary rocks", "Porphyry", "Porphyry", "Crystalline basement",
"Crystalline basement", "Sedimentary rocks", "Crystalline basement",
"Sedimentary rocks", "Porphyry", "Crystalline basement",
"Crystalline basement", "Crystalline basement", "Crystalline basement",
"Sedimentary rocks", "Porphyry", "Sedimentary rocks", "Crystalline basement",
"Crystalline basement", "Crystalline basement", "Porphyry",
"Calcschists with ophiolites", "Crystalline basement", "Crystalline basement",
"Sedimentary rocks", "Porphyry", "Crystalline basement",
"Porphyry", "Crystalline basement", "Crystalline basement",
"Crystalline basement", "Crystalline basement", "Calcschists with ophiolites",
"Plutonite", "Crystalline basement", "Crystalline basement",
"Porphyry", "Sedimentary rocks")), row.names = c(NA, -50L
), class = c("tbl_df", "tbl", "data.frame"))
and here the test data
structure(list(p3 = c(6.40000009536743, 0, 0, 16.3000003397465,
0, 0, 0, 0, 1, 1.5, 29.1000003814697, 7.49999982118607, 3.5,
18.3999996185303, 2.69999990612268, 11.3000001907349, 0, 2, 9.5,
0, 10.1999998092651, 0, 3.60000005364418, 0, 5.29999995231628,
112.599998474121, 118.099997758865, 54.8999996185303, 72.8000011444092,
79.9000015258789, 88.7000015377998, 0, 54.6000022888184, 144.599998474121,
111.200000762939, 7.10000009834766, 32.0999999046326, 0.5, 5.3999999165535,
0.300000011920929, 0, 36.7999982833862, 101.599998474121, 121.699998855591,
31.0999994277954, 66.8000020980835, 139.200000762939, 9.50000011920929,
135.300003051758, 110.900001525879), p15 = c(12.3999996185303,
63.8000009655952, 20.7000007629395, 121.299998179078, 10.4000001549721,
27.1999999880791, 49.5000003874302, 13.3000001907349, 31.3999998569489,
15.4000002890825, 64.3999997377396, 25.1000001430511, 43.6999994516373,
50.799999833107, 35.1999998092651, 35.1999998837709, 67.1000003442168,
19.400000333786, 49.300000667572, 21.3999996706843, 75.600000411272,
38.700000859797, 30.2999994754791, 14.9000003933907, 53.2000011727214,
137.900000333786, 0.100000001490116, 119.300001859665, 139.700000107288,
147.799997329712, 45.3000004068017, 56.5000000670552, 47.7999995946884,
2.90000009536743, 139.499999403954, 26.6999999284744, 6.5, 149.700001835823,
210.299998342991, 114.499999642372, 3.60000002384186, 60.099999524653,
97.5999984890223, 153.100000120699, 245.299996376038, 123.49999922514,
3.70000004768372, 90.5999985486269, 49.1000001132488, 138.599999785423
), trigger = c(FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE)), row.names = c(NA, -50L), groups = structure(list(
trigger = c(FALSE, TRUE), .rows = structure(list(1:25, 26:50), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
What I now did to fit the logistic regression model trigger ~ p3 + 15
for each geology and get the auroc for each class is the following:
res = train %>% split(., .$geology) %>%
map( ~ glm(trigger ~ p3 + p15, data = .x, family = "binomial")) %>%
map( ~ predict(.x, newdata = test, type = "response")) %>%
map(function(x) {
df = data.frame(ref = test$trigger)
df[["pred"]] = x
df
}) %>% map_dfr(function(x) {
auc = as.numeric(roc(ref ~ pred, data = x)$auc)
}) %>% pivot_longer(cols = everything(), names_to = "geology", values_to="auc")
But there are some parts (the function(x){...}
) that I would like to replace with a more concise purrr
-style syntax. I am having some troubles to figure my head arround the .x
, the .
and when to use the {}
to prevent the result passed into a tibble (maybe necessary at some point).
So how could I achieve the same result, however omitting the function(x)
syntax?