This is probably a common question, but I wasn't able to find an answer that fits my problem.
I have a data frame with a column (`ligandId`) in which some rows have a value assigned and others are NA.
An NA row may be assigned a value if it belongs to a group that has at least one assigned item.
The data has other columns to group by, and these columns form a hierarchy: we should group by one column at a time, in a specific order. Conveniently, the grouping columns have numeric names running from `1` down to `0.2`.
I think this is a kind of depth-first algorithm.
structure(list(ID = c("WP_012391491.1/58-334", "WP_045025307.1/57-335",
"WP_065911868.1/57-334", "WP_094130548.1/57-334", "WP_041093274.1/57-335",
"WP_087741863.1/58-335", "WP_048735837.1/58-335", "WP_024526760.1/58-335",
"YP_006375059.1/60-339", "4RK1_A", "WP_081134210.1/58-337", "WP_067481377.1/58-337",
"WP_023519081.1/58-337", "WP_005918334.1/58-332", "WP_011673819.1/58-332",
"WP_101874263.1/58-332", "YP_004891129.1/58-332", "WP_021730312.1/58-332",
"WP_105451130.1/58-332", "WP_105448628.1/58-332", "4RK0_A", "NP_816580.1/58-331",
"WP_014215863.1/58-333", "WP_014074009.1/57-334", "WP_014939645.1/57-334",
"WP_057909529.1/57-334", "WP_035168530.1/57-335", "YP_001328142.1/20-321",
"NP_386672.1/20-321", "NP_437689.1/21-322", "WP_064322056.1/23-326",
"WP_095444766.1/23-326", "WP_022561933.1/19-321", "WP_060691636.1/22-324",
"WP_062000852.1/16-317", "4RY9_A", "WP_011809835.1/29-330", "WP_067873970.1/47-338",
"WP_067195222.1/48-334", "WP_108390182.1/48-337", "WP_088455092.1/48-337",
"WP_013585042.1/48-336", "WP_094181221.1/110-393", "WP_013118380.1/59-348",
"WP_078845346.1/48-338", "WP_058920843.1/26-338", "WP_020937919.1/34-340",
"WP_029381425.1/47-337", "WP_013152136.1/51-349", "WP_079255911.1/49-343",
"WP_078638163.1/49-341", "WP_064731434.1/47-343", "WP_031033051.1/49-344",
"WP_095681865.1/49-343", "WP_079255907.1/54-335", "WP_077275989.1/55-331",
"WP_071455952.1/54-331", "WP_068799469.1/44-336", "WP_108392182.1/49-339",
"WP_012865347.1/56-345", "WP_014104198.1/48-350", "WP_007397977.1/51-337",
"WP_087607280.1/49-337", "WP_028700814.1/24-338", "WP_028700389.1/48-340",
"NP_350138.1/55-335", "3G85_A"), PDB = c(FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
TRUE), ligandId = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, "CL MSE RIB",
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "RIB", NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "GOL TLZ", NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "GOL MSE"), `1` = c(138240,
50687, 1378, 126023, 237469, 124229, 41453, 217657, 91126, 183150,
107179, 6274, 262561, 176386, 148744, 74195, 93951, 264634, 95933,
96095, 183149, 188356, 250907, 252918, 243804, 66122, 207410,
87437, 181530, 187508, 16752, 81323, 265938, 58828, 13021, 638,
638, 28566, 4878, 89140, 122445, 155228, 125955, 164812, 120428,
55374, 257779, 219988, 170251, 116415, 120125, 14694, 208562,
78746, 116429, 115371, 21301, 27608, 88975, 166418, 252468, 197645,
123797, 220728, 220653, 182317, 183297), `0.9` = c(115828, 171070,
47923, 71525, 156529, 98825, 165289, 189247, 31455, 31455, 88210,
61492, 196793, 131804, 143158, 10327, 10327, 10327, 26521, 26521,
31058, 31058, 123582, 121348, 186492, 34988, 162176, 25258, 25258,
138442, 15902, 15902, 196359, 62175, 46251, 147381, 147381, 56135,
59451, 76614, 102076, 130255, 71453, 114643, 96504, 58676, 197451,
192775, 114280, 23991, 23991, 53944, 190660, 69970, 97189, 94703,
43808, 33373, 76685, 111928, 125148, 137147, 98782, 193109, 193143,
26538, 26538), `0.8` = c(110007, 126908, 41980, 81956, 146463,
7523, 7523, 7523, 13169, 13169, 13169, 54211, 144263, 30337,
144486, 68026, 68026, 68026, 69168, 69168, 67041, 67041, 93235,
93313, 91652, 132429, 139245, 64638, 64638, 62222, 81172, 81172,
141549, 44105, 47435, 96620, 96620, 54519, 55356, 70400, 84802,
119491, 82179, 108083, 39098, 131355, 142185, 137368, 106528,
8338, 8338, 8338, 8338, 83518, 39373, 26484, 28621, 51215, 70429,
105638, 23426, 23426, 85745, 137611, 137684, 57248, 57248), `0.7` = c(67870,
102232, 35151, 42345, 92357, 99040, 99040, 99040, 6533, 6533,
6533, 6533, 6533, 20017, 20017, 47416, 47416, 47416, 48130, 48130,
82070, 82070, 73319, 73626, 7056, 7056, 7056, 46979, 46979, 81437,
13586, 13586, 13586, 27032, 26586, 85055, 85055, 13402, 13402,
15242, 15242, 77617, 42498, 67680, 6951, 6951, 6951, 112867,
67415, 12983, 12983, 12983, 12983, 12983, 54691, 23945, 25470,
33887, 49979, 65311, 19335, 19335, 19335, 113208, 113217, 81718,
81718), `0.6` = c(8366, 8366, 23618, 32642, 70323, 78974, 78974,
78974, 1269, 1269, 1269, 1269, 1269, 1269, 1269, 1269, 1269,
1269, 1269, 1269, 1269, 1269, 56028, 56694, 83944, 83944, 83944,
11860, 11860, 11860, 15130, 15130, 15130, 15130, 25412, 64954,
64954, 24654, 24654, 38268, 38268, 59264, 32603, 51690, 10955,
10955, 10955, 10955, 50901, 18836, 18836, 18836, 18836, 18836,
42361, 6461, 6461, 28216, 38284, 50584, 61467, 61467, 61467,
11132, 11132, 62398, 62398), `0.5` = c(568, 568, 568, 568, 568,
568, 568, 568, 28283, 28283, 28283, 28283, 28283, 28283, 28283,
28283, 28283, 28283, 28283, 28283, 28283, 28283, 43941, 44445,
66203, 66203, 66203, 1080, 1080, 1080, 1080, 1080, 1080, 1080,
1080, 1080, 1080, 3884, 3884, 3884, 3884, 3884, 4264, 4264, 8047,
8047, 8047, 8047, 8047, 14712, 14712, 14712, 14712, 14712, 15093,
15671, 15671, 21991, 29850, 39515, 48308, 48308, 48308, 68304,
68304, 48996, 48996), `0.4` = c(2884, 2884, 2884, 2884, 2884,
2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884,
2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884,
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809,
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809,
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 13941,
13941), `0.3` = c(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1561, 1561, 1561, 1561, 1561,
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561,
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561,
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561,
1962, 1962), `0.2` = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -67L))
or
To handle this problem I wrote a function and call it many times.
I'd like to do this without calling it so many times and, if possible, without needing to write a function.
My solution works, but it takes a long time to run.
# Fill the missing ligand ids of a single group.
#
# ligand_id: character vector of space-separated ligand codes (NA = unassigned).
# is_pdb:    logical vector of the same length, TRUE for PDB rows.
#
# Returns `ligand_id` with the NA entries of non-PDB rows replaced by the
# de-duplicated ligand codes found in the group, but only when the group
# contains at least one PDB row. Everything else is returned unchanged.
.fill_group_ligands <- function(ligand_id, is_pdb) {
  known <- ligand_id[!is.na(ligand_id)]
  ligands <- unique(unlist(stringr::str_split(known, " ")))

  # With nothing assigned in the group, paste() would produce "" and the row
  # would wrongly count as assigned from then on; keep it NA instead.
  if (length(ligands) == 0) {
    return(ligand_id)
  }
  pooled <- paste(ligands, collapse = " ")

  # `%in% TRUE` treats an NA in PDB as "not a PDB row" without propagating NA.
  fillable <- is.na(ligand_id) &
    !(is_pdb %in% TRUE) &
    any(is_pdb, na.rm = TRUE)
  ligand_id[fillable] <- pooled
  ligand_id
}

# Propagate `ligandId` to unassigned rows within each group of `group_col`.
#
# data:      data frame with a logical `PDB` column and a character
#            `ligandId` column.
# group_col: grouping column, given as a bare (backticked) name or a string.
#
# Returns `data`, ungrouped, with `ligandId` filled in where possible. PDB rows
# and rows that already have a `ligandId` are never modified.
assingn_lig <- function(data, group_col) {
  data %>%
    dplyr::group_by(!!rlang::ensym(group_col)) %>%
    dplyr::mutate(ligandId = .fill_group_ligands(ligandId, PDB)) %>%
    dplyr::ungroup()
}
# Propagate ligands one clustering level at a time, from the finest level
# (`1`) to the coarsest (`0.2`). The order matters: values filled at one level
# feed the next, so the levels are folded over sequentially with Reduce().
Reduce(
  function(filled, level) {
    # `level` holds the column name as a string; unquote it as a symbol so
    # assingn_lig() receives the column itself rather than the name `level`.
    assingn_lig(filled, group_col = !!rlang::sym(level))
  },
  c("1", "0.9", "0.8", "0.7", "0.6", "0.5", "0.4", "0.3", "0.2"),
  tmp2
)
So, does anyone know a better strategy?
This is the output:
# A tibble: 67 x 12
ID PDB ligandId `1` `0.9` `0.8` `0.7` `0.6` `0.5` `0.4` `0.3` `0.2`
<chr> <lgl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 WP_012391491.1/58-334 FALSE CL MSE RIB 138240 115828 110007 67870 8366 568 2884 9 1
2 WP_045025307.1/57-335 FALSE CL MSE RIB 50687 171070 126908 102232 8366 568 2884 9 1
3 WP_065911868.1/57-334 FALSE CL MSE RIB 1378 47923 41980 35151 23618 568 2884 9 1
4 WP_094130548.1/57-334 FALSE CL MSE RIB 126023 71525 81956 42345 32642 568 2884 9 1
5 WP_041093274.1/57-335 FALSE CL MSE RIB 237469 156529 146463 92357 70323 568 2884 9 1
6 WP_087741863.1/58-335 FALSE CL MSE RIB 124229 98825 7523 99040 78974 568 2884 9 1
7 WP_048735837.1/58-335 FALSE CL MSE RIB 41453 165289 7523 99040 78974 568 2884 9 1
8 WP_024526760.1/58-335 FALSE CL MSE RIB 217657 189247 7523 99040 78974 568 2884 9 1
9 YP_006375059.1/60-339 FALSE CL MSE RIB 91126 31455 13169 6533 1269 28283 2884 9 1
10 4RK1_A TRUE CL MSE RIB 183150 31455 13169 6533 1269 28283 2884 9 1
# ? with 57 more rows
> dput(tmp1)
structure(list(ID = c("WP_012391491.1/58-334", "WP_045025307.1/57-335",
"WP_065911868.1/57-334", "WP_094130548.1/57-334", "WP_041093274.1/57-335",
"WP_087741863.1/58-335", "WP_048735837.1/58-335", "WP_024526760.1/58-335",
"YP_006375059.1/60-339", "4RK1_A", "WP_081134210.1/58-337", "WP_067481377.1/58-337",
"WP_023519081.1/58-337", "WP_005918334.1/58-332", "WP_011673819.1/58-332",
"WP_101874263.1/58-332", "YP_004891129.1/58-332", "WP_021730312.1/58-332",
"WP_105451130.1/58-332", "WP_105448628.1/58-332", "4RK0_A", "NP_816580.1/58-331",
"WP_014215863.1/58-333", "WP_014074009.1/57-334", "WP_014939645.1/57-334",
"WP_057909529.1/57-334", "WP_035168530.1/57-335", "YP_001328142.1/20-321",
"NP_386672.1/20-321", "NP_437689.1/21-322", "WP_064322056.1/23-326",
"WP_095444766.1/23-326", "WP_022561933.1/19-321", "WP_060691636.1/22-324",
"WP_062000852.1/16-317", "4RY9_A", "WP_011809835.1/29-330", "WP_067873970.1/47-338",
"WP_067195222.1/48-334", "WP_108390182.1/48-337", "WP_088455092.1/48-337",
"WP_013585042.1/48-336", "WP_094181221.1/110-393", "WP_013118380.1/59-348",
"WP_078845346.1/48-338", "WP_058920843.1/26-338", "WP_020937919.1/34-340",
"WP_029381425.1/47-337", "WP_013152136.1/51-349", "WP_079255911.1/49-343",
"WP_078638163.1/49-341", "WP_064731434.1/47-343", "WP_031033051.1/49-344",
"WP_095681865.1/49-343", "WP_079255907.1/54-335", "WP_077275989.1/55-331",
"WP_071455952.1/54-331", "WP_068799469.1/44-336", "WP_108392182.1/49-339",
"WP_012865347.1/56-345", "WP_014104198.1/48-350", "WP_007397977.1/51-337",
"WP_087607280.1/49-337", "WP_028700814.1/24-338", "WP_028700389.1/48-340",
"NP_350138.1/55-335", "3G85_A"), PDB = c(FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
TRUE), ligandId = c("CL MSE RIB", "CL MSE RIB", "CL MSE RIB",
"CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB",
"CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB",
"CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "CL MSE RIB",
"CL MSE RIB", "CL MSE RIB", "RIB", "RIB", "CL MSE RIB", "CL MSE RIB",
"CL MSE RIB", "CL MSE RIB", "CL MSE RIB", "GOL TLZ", "GOL TLZ",
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ",
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ",
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ",
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ",
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ",
"GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ", "GOL TLZ",
"GOL MSE", "GOL MSE"), `1` = c(138240, 50687, 1378, 126023, 237469,
124229, 41453, 217657, 91126, 183150, 107179, 6274, 262561, 176386,
148744, 74195, 93951, 264634, 95933, 96095, 183149, 188356, 250907,
252918, 243804, 66122, 207410, 87437, 181530, 187508, 16752,
81323, 265938, 58828, 13021, 638, 638, 28566, 4878, 89140, 122445,
155228, 125955, 164812, 120428, 55374, 257779, 219988, 170251,
116415, 120125, 14694, 208562, 78746, 116429, 115371, 21301,
27608, 88975, 166418, 252468, 197645, 123797, 220728, 220653,
182317, 183297), `0.9` = c(115828, 171070, 47923, 71525, 156529,
98825, 165289, 189247, 31455, 31455, 88210, 61492, 196793, 131804,
143158, 10327, 10327, 10327, 26521, 26521, 31058, 31058, 123582,
121348, 186492, 34988, 162176, 25258, 25258, 138442, 15902, 15902,
196359, 62175, 46251, 147381, 147381, 56135, 59451, 76614, 102076,
130255, 71453, 114643, 96504, 58676, 197451, 192775, 114280,
23991, 23991, 53944, 190660, 69970, 97189, 94703, 43808, 33373,
76685, 111928, 125148, 137147, 98782, 193109, 193143, 26538,
26538), `0.8` = c(110007, 126908, 41980, 81956, 146463, 7523,
7523, 7523, 13169, 13169, 13169, 54211, 144263, 30337, 144486,
68026, 68026, 68026, 69168, 69168, 67041, 67041, 93235, 93313,
91652, 132429, 139245, 64638, 64638, 62222, 81172, 81172, 141549,
44105, 47435, 96620, 96620, 54519, 55356, 70400, 84802, 119491,
82179, 108083, 39098, 131355, 142185, 137368, 106528, 8338, 8338,
8338, 8338, 83518, 39373, 26484, 28621, 51215, 70429, 105638,
23426, 23426, 85745, 137611, 137684, 57248, 57248), `0.7` = c(67870,
102232, 35151, 42345, 92357, 99040, 99040, 99040, 6533, 6533,
6533, 6533, 6533, 20017, 20017, 47416, 47416, 47416, 48130, 48130,
82070, 82070, 73319, 73626, 7056, 7056, 7056, 46979, 46979, 81437,
13586, 13586, 13586, 27032, 26586, 85055, 85055, 13402, 13402,
15242, 15242, 77617, 42498, 67680, 6951, 6951, 6951, 112867,
67415, 12983, 12983, 12983, 12983, 12983, 54691, 23945, 25470,
33887, 49979, 65311, 19335, 19335, 19335, 113208, 113217, 81718,
81718), `0.6` = c(8366, 8366, 23618, 32642, 70323, 78974, 78974,
78974, 1269, 1269, 1269, 1269, 1269, 1269, 1269, 1269, 1269,
1269, 1269, 1269, 1269, 1269, 56028, 56694, 83944, 83944, 83944,
11860, 11860, 11860, 15130, 15130, 15130, 15130, 25412, 64954,
64954, 24654, 24654, 38268, 38268, 59264, 32603, 51690, 10955,
10955, 10955, 10955, 50901, 18836, 18836, 18836, 18836, 18836,
42361, 6461, 6461, 28216, 38284, 50584, 61467, 61467, 61467,
11132, 11132, 62398, 62398), `0.5` = c(568, 568, 568, 568, 568,
568, 568, 568, 28283, 28283, 28283, 28283, 28283, 28283, 28283,
28283, 28283, 28283, 28283, 28283, 28283, 28283, 43941, 44445,
66203, 66203, 66203, 1080, 1080, 1080, 1080, 1080, 1080, 1080,
1080, 1080, 1080, 3884, 3884, 3884, 3884, 3884, 4264, 4264, 8047,
8047, 8047, 8047, 8047, 14712, 14712, 14712, 14712, 14712, 15093,
15671, 15671, 21991, 29850, 39515, 48308, 48308, 48308, 68304,
68304, 48996, 48996), `0.4` = c(2884, 2884, 2884, 2884, 2884,
2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884,
2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884, 2884,
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809,
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809,
809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 809, 13941,
13941), `0.3` = c(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1561, 1561, 1561, 1561, 1561,
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561,
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561,
1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561, 1561,
1962, 1962), `0.2` = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -67L))
Any help is appreciated. Thanks in advance.