6

I am sure I am not the only person who has asked this but after hours of searching with no luck I need to ask the question myself.

I have a df (rp) like so:

rp <- structure(list(agec1 = c(7, 16, 11, 11, 17, 17), 
               agec2 = c(6, 12, 9, 9, 16, 15), 
               agec3 = c(2, 9, 9, 9, 14, NA), 
               agec4 = c(NA, 7, 9, 9, 13, NA), 
               agec5 = c(NA, 4, 7, 7, 10, NA), 
               agec6 = c(NA, NA, 6, 6, 9, NA), 
               agec7 = c(NA, NA, NA, NA, 7, NA), 
               agec8 = c(NA, NA, NA, NA, 5, NA), 
          row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))

Where each obs in agecX refers to the age of a parent's children up to 8 children. I would like to create a new column "agec5_12" that contains the age of the oldest child aged 5-12. So my df would look like this:

rpage <- structure(list(agec1 = c(7, 16, 11, 11, 17, 17), 
               agec2 = c(6, 12, 9, 9, 16, 15), 
               agec3 = c(2, 9, 9, 9, 14, NA), 
               agec4 = c(NA, 7, 9, 9, 13, NA), 
               agec5 = c(NA, 4, 7, 7, 10, NA), 
               agec6 = c(NA, NA, 6, 6, 9, NA), 
               agec7 = c(NA, NA, NA, NA, 7, NA), 
               agec8 = c(NA, NA, NA, NA, 5, NA), 
               agec5_12 = c(7, 12, 11, 11, 10, NA))
          row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))

Notes about my data:

  • Ages are not always in the same chronological order i.e. youngest to oldest or oldest to youngest
  • It is possible for a row to have no children aged within this range (in which case I would like NA to be returned)

I have tried writing a function and applying it using rowwise and mutate:

fun.age5_12 <- function(x){
                 x[which(x == max(x[(x > 4) & (x < 13)], na.rm = TRUE))]
                }
rpage <- rp %>%
         select(-c(20:21, 199:200)) %>%
         rowwise() %>% 
         mutate(agec5_12 = fun.age5_12(c(1:8)))

However, this returns all obs as "12". Ideally I would like to do this using dplyr. Any suggestions using mutate or ifelse and not necessarily with functions are fine.

Thank you

  • 1
    first question! – hrbrmstr Nov 16 '18 at 02:59
  • I think your data might be simpler to generate like this: rp <- data.frame(agec1 = c(7, 16, 11, 11, 17, 17), agec2 = c(6, 12, 9, 9, 16, 15), agec3 = c(2, 9, 9, 9, 14, NA), agec4 = c(NA, 7, 9, 9, 13, NA), agec5 = c(NA, 4, 7, 7, 10, NA), agec6 = c(NA, NA, 6, 6, 9, NA), agec7 = c(NA, NA, NA, NA, 7, NA), agec8 = c(NA, NA, NA, NA, 5, NA)) – dmca Nov 16 '18 at 03:01
  • 1
    @dmca I used `dput` as per instructions from another question. Should I edit my question? – Charlotte Jelleyman Nov 16 '18 at 03:07
  • Your code creates a list. You need a data frame. Take everything inside the `list()` and paste it into `data.frame()` as I did above and you'll be good. – dmca Nov 16 '18 at 04:20

5 Answers5

2

I know you wanted tidyverse but here's one base R way:

data.frame(
  agec1 = c(7, 16, 11, 11, 17, 17), 
  agec2 = c(6, 12, 9, 9, 16, 15), 
  agec3 = c(2, 9, 9, 9, 14, NA), 
  agec4 = c(NA, 7, 9, 9, 13, NA), 
  agec5 = c(NA, 4, 7, 7, 10, NA), 
  agec6 = c(NA, NA, 6, 6, 9, NA), 
  agec7 = c(NA, NA, NA, NA, 7, NA), 
  agec8 = c(NA, NA, NA, NA, 5, NA), 
  stringsAsFactors = FALSE
) -> rp

for (i in 1:nrow(rp)) {
  agec5_12 <- unlist(rp[i,], use.names = FALSE)
  agec5_12 <- agec5_12[agec5_12 >= 5 & agec5_12 <= 12 & !is.na(agec5_12)]
  rp[i, "agec5_12"] <- if (length(agec5_12)) max(agec5_12) else NA_integer_
}

rp
##   agec1 agec2 agec3 agec4 agec5 agec6 agec7 agec8 agec5_12
## 1     7     6     2    NA    NA    NA    NA    NA        7
## 2    16    12     9     7     4    NA    NA    NA       12
## 3    11     9     9     9     7     6    NA    NA       11
## 4    11     9     9     9     7     6    NA    NA       11
## 5    17    16    14    13    10     9     7     5       10
## 6    17    15    NA    NA    NA    NA    NA    NA       NA

The for shows the idiom but an sapply() solution is alot faster:

rp1$agec5_12 <- sapply(1:nrow(rp), function(i) {
  agec5_12 <- unlist(rp[i,], use.names = FALSE)
  agec5_12 <- agec5_12[agec5_12 >= 5 & agec5_12 <= 12 & !is.na(agec5_12)]
  if (length(agec5_12)) max(agec5_12) else NA_integer_
})
hrbrmstr
  • 77,368
  • 11
  • 139
  • 205
1

Another base R solution. We can use replace to replace numbers outside the range of 5 to 12, and then use apply and function(x) ifelse(all(is.na(x)), NA, max(x, na.rm = TRUE)) to find the maximum for each row. You can also consider to use max directly, but for rows with elements are NA, the max function would return -Inf.

rp$agec5_12 <- apply(replace(rp, rp > 12 | rp < 5, NA), 1, 
                     function(x) ifelse(all(is.na(x)), NA, max(x, na.rm = TRUE)))

Or use do.call and pmax.

rp$agec5_12 <- do.call(pmax, c(replace(rp, rp > 12 | rp < 5, NA), na.rm = TRUE))

Here is a performance comparison of the three base R methods so far. do.call with pmax seems to be the fastest one.

library(microbenchmark)

perf <- microbenchmark(
  m1 = {sapply(1:nrow(rp), function(i) {
  agec5_12 <- unlist(rp[i,], use.names = FALSE)
  agec5_12 <- agec5_12[agec5_12 >= 5 & agec5_12 <= 12 & !is.na(agec5_12)]
  if (length(agec5_12)) max(agec5_12) else NA_integer_
})},
  m2 = {
    apply(replace(rp, rp > 12 | rp < 5, NA), 1, 
          function(x) ifelse(all(is.na(x)), NA, max(x, na.rm = TRUE)))
  },
  m3 = {rp$agec5_12 <- do.call(pmax, c(replace(rp, rp > 12 | rp < 5, NA), na.rm = TRUE))
}, times = 1000L) 

perf
# Unit: microseconds
# expr     min       lq     mean  median       uq      max neval cld
#   m1 505.318 559.2935 860.3941 608.386 1231.937 9844.699  1000   b
#   m2 526.394 568.0325 831.6851 629.205 1207.262 4748.342  1000   b
#   m3 384.514 425.1250 635.3154 465.736  918.362 8992.393  1000  a 

DATA

rp <- data.frame(
  agec1 = c(7, 16, 11, 11, 17, 17), 
  agec2 = c(6, 12, 9, 9, 16, 15), 
  agec3 = c(2, 9, 9, 9, 14, NA), 
  agec4 = c(NA, 7, 9, 9, 13, NA), 
  agec5 = c(NA, 4, 7, 7, 10, NA), 
  agec6 = c(NA, NA, 6, 6, 9, NA), 
  agec7 = c(NA, NA, NA, NA, 7, NA), 
  agec8 = c(NA, NA, NA, NA, 5, NA)
) 
www
  • 38,575
  • 12
  • 48
  • 84
1

I think apply solution for such a problem will always be simpler and more readable thandplyr (I am assuming you meant tidyverse) solution but since you asked, here is one way -

library(dplyr)
library(tidyr)

rp %>% 
  rownames_to_column("parent_id") %>% 
  gather(variable, value, -parent_id) %>% 
  group_by(parent_id) %>%
  arrange(parent_id, desc(value)) %>% 
  mutate(
    agec5_12 = value[between(value, 5, 12)][1]
  ) %>%
  ungroup() %>% 
  spread(variable, value) %>% 
  select(3:10, 2)

# A tibble: 6 x 9
  agec1 agec2 agec3 agec4 agec5 agec6 agec7 agec8 agec5_12
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>
1     7     6     2    NA    NA    NA    NA    NA        7
2    16    12     9     7     4    NA    NA    NA       12
3    11     9     9     9     7     6    NA    NA       11
4    11     9     9     9     7     6    NA    NA       11
5    17    16    14    13    10     9     7     5       10
6    17    15    NA    NA    NA    NA    NA    NA       NA
Shree
  • 10,835
  • 1
  • 14
  • 36
1

Since you asked for it, here's a pure dplyr way to do this -

max5_12 <- function(x) {
  a <- sort(x, decreasing = T)
  a[a >= 5 & a <= 12][1]
}

rp %>% 
  t() %>% 
  as.data.frame() %>% 
  bind_rows(
   summarise_all(., max5_12)
  ) %>% 
  t() %>% 
  as.data.frame() %>% 
  setNames(c(names(rp), "agec5_12"))

   agec1 agec2 agec3 agec4 agec5 agec6 agec7 agec8 agec5_12
V1     7     6     2    NA    NA    NA    NA    NA        7
V2    16    12     9     7     4    NA    NA    NA       12
V3    11     9     9     9     7     6    NA    NA       11
V4    11     9     9     9     7     6    NA    NA       11
V5    17    16    14    13    10     9     7     5       10
V6    17    15    NA    NA    NA    NA    NA    NA       NA
Shree
  • 10,835
  • 1
  • 14
  • 36
0

The most straightforward way I can think of to accomplish this uses dplyr, purrr and tidyr:

library(dplyr)
library(purrr)
library(tidyr)
rp %>%
  mutate_at(vars(agec1:agec8), funs(ifelse(between(., 5, 12), ., NA))) %>%%
  group_by(id) %>%
  nest() %>%
  mutate(agec5_12 = map(data, max, na.rm = TRUE),
         agec5_12 = ifelse(agec5_12 == -Inf, NA, agec5_12)) %>%
  unnest()
dmca
  • 675
  • 1
  • 8
  • 18