0

I would use some help with a tidyverse solution for a function that I've custom written. I have a dataset with a binary phenotype and an associated diagnosis date, as well as 2 other dates I hope to use to update and create new variables.

I want to:

  1. update the value of the supplied variables to NA if vardt < other_dt
  2. generate new variables, {var}_incid & {var}_incid_dt, if the vardt variable is before baseline_dt

Here's my go at a function; I know that it likely requires some non-standard evaluation techniques, so I've tried to use assign() and eval(substitute()) around the names to no avail. Any tips? Thanks in advance for the help.

# load lib 
library(tidyverse)
library(lubridate)

rdate <- function(x,
                  min = paste0(format(Sys.Date(), '%Y'), '-01-01'),
                  max = paste0(format(Sys.Date(), '%Y'), '-12-31'),
                  sort = TRUE) {
  
  dates <- sample(seq(as.Date(min), as.Date(max), by = "day"), x, replace = TRUE)
  if (sort == TRUE) {
    sort(dates)
  } else {
    dates
  }
}
# set seed for reproducibility
set.seed(42)

# Beginning dataset
das <- data.frame(id = rep(letters[1:3], each = 5), 
                  pheno = rbinom(n=15, size = 1, prob = 0.30), 
                  pheno_dt = rdate(15), 
                  baseline_dt = rdate(15), 
                  other_dt = rdate(15)) 

update_pheno <- function(var, vardt){
  outds <- das %>% 
  mutate(eval(substitute(var)) = ifelse(var == 1 & pheno_dt < other_dt, NA, var), 
         # update vardt to NA if var value is NA
         vardt = ifelse(is.na(var), NA, vardt)) 
         # create incidence variable based on nomenclature of variable
         paste0(var, "_incid") = ifelse(var == 1 & vardt < baseline_dt, NA, var), 
         # create associated dt variable
         paste0(var, "_incid_dt" = ifelse(is.na(paste0(var, "_incid")), NA, vardt)))  
  
  return(outds)
}

test <- update_pheno(var = pheno, vardt = phenodt)
  • As a start look into [this vignette](https://dplyr.tidyverse.org/articles/programming.html). However, it can be a bit tricky with the updated functions, I couldn't make it work so far. Maybe it's easier if you don't pass symbols into the function but variable names as strings? – starja Aug 17 '22 at 23:32

1 Answers1

0

Limitations, Assumptions, and Simplifications

# Since we're talking *tidyverse*, let's make this a tibble:
das <- as_tibble( das )

das
# A tibble: 15 × 5
   id    pheno pheno_dt   baseline_dt other_dt  
   <chr> <int> <date>     <date>      <date>    
 1 a         1 2022-01-05 2022-01-04  2022-01-03
 2 a         1 2022-01-20 2022-04-19  2022-01-05
 3 a         0 2022-01-24 2022-05-16  2022-02-02
 4 a         1 2022-03-30 2022-05-26  2022-02-09
 5 a         0 2022-04-19 2022-06-07  2022-04-13
 6 b         0 2022-04-20 2022-07-16  2022-04-19
 7 b         1 2022-06-14 2022-08-03  2022-04-24
 8 b         0 2022-07-31 2022-08-14  2022-05-10
 9 b         0 2022-09-16 2022-09-02  2022-05-18
10 b         1 2022-10-10 2022-10-19  2022-07-05
11 c         0 2022-10-24 2022-10-26  2022-08-16
12 c         1 2022-10-25 2022-11-10  2022-09-15
13 c         1 2022-11-10 2022-11-20  2022-09-19
14 c         0 2022-12-14 2022-12-14  2022-11-25
15 c         0 2022-12-26 2022-12-21  2022-12-24
  1. Update the value of the supplied variables to NA if vardt < other_dt
# Do this directly:
das[ das$pheno_dt < das$other_dt   , "pheno" ] <- NA

das
# A tibble: 15 × 5
   id    pheno pheno_dt   baseline_dt other_dt  
   <chr> <int> <date>     <date>      <date>    
 1 a         1 2022-01-05 2022-01-04  2022-01-03
 2 a         1 2022-01-20 2022-04-19  2022-01-05
 3 a        NA 2022-01-24 2022-05-16  2022-02-02
 4 a         1 2022-03-30 2022-05-26  2022-02-09
 5 a         0 2022-04-19 2022-06-07  2022-04-13
 6 b         0 2022-04-20 2022-07-16  2022-04-19
 7 b         1 2022-06-14 2022-08-03  2022-04-24
 8 b         0 2022-07-31 2022-08-14  2022-05-10
 9 b         0 2022-09-16 2022-09-02  2022-05-18
10 b         1 2022-10-10 2022-10-19  2022-07-05
11 c         0 2022-10-24 2022-10-26  2022-08-16
12 c         1 2022-10-25 2022-11-10  2022-09-15
13 c         1 2022-11-10 2022-11-20  2022-09-19
14 c         0 2022-12-14 2022-12-14  2022-11-25
15 c         0 2022-12-26 2022-12-21  2022-12-24
  1. Generate new variables, {var}_incid & {var}_incid_dt, if the vardt variable is before baseline_dt
# What are the names of these new variables?
potential_new_names <- paste0( das$id, '_incid' )

potential_new_names
 [1] "a_incid" "a_incid" "a_incid" "a_incid" "a_incid" "b_incid" "b_incid" "b_incid" "b_incid" "b_incid" "c_incid"
[12] "c_incid" "c_incid" "c_incid" "c_incid"

# To which rows does this apply?
these_rows <- which( das$pheno_dt < das$baseline_dt )

these_rows
 [1]  2  3  4  5  6  7  8 10 11 12 13

# Remove duplicates
new_value_variables <- unique( potential_new_names[ these_rows ] )

# Create corresponding date variables
new_date_variables  <- paste0( new_value_variables, "_dt" )

# Combine value variables and date variables
new_column_names    <- c( new_value_variables, new_date_variables )

new_column_names
[1] "a_incid"    "b_incid"    "c_incid"    "a_incid_dt" "b_incid_dt" "c_incid_dt"

code_to_make_new_columns <- sprintf(
    'das %%>%% mutate( %s )'
  , paste0( new_column_names, "=NA", collapse="," )
)

code_to_make_new_columns
[1] "das %>% mutate( a_incid=NA,b_incid=NA,c_incid=NA,a_incid_dt=NA,b_incid_dt=NA,c_incid_dt=NA )"

new_das <- eval( parse( text = code_to_make_new_columns ))

new_das
# A tibble: 15 × 11
   id    pheno pheno_dt   baseline_dt other_dt   a_incid b_incid c_incid a_incid_dt b_incid_dt c_incid_dt
   <chr> <int> <date>     <date>      <date>     <lgl>   <lgl>   <lgl>   <lgl>      <lgl>      <lgl>     
 1 a         1 2022-01-05 2022-01-04  2022-01-03 NA      NA      NA      NA         NA         NA        
 2 a         1 2022-01-20 2022-04-19  2022-01-05 NA      NA      NA      NA         NA         NA        
 3 a        NA 2022-01-24 2022-05-16  2022-02-02 NA      NA      NA      NA         NA         NA        
 4 a         1 2022-03-30 2022-05-26  2022-02-09 NA      NA      NA      NA         NA         NA        
 5 a         0 2022-04-19 2022-06-07  2022-04-13 NA      NA      NA      NA         NA         NA        
 6 b         0 2022-04-20 2022-07-16  2022-04-19 NA      NA      NA      NA         NA         NA        
 7 b         1 2022-06-14 2022-08-03  2022-04-24 NA      NA      NA      NA         NA         NA        
 8 b         0 2022-07-31 2022-08-14  2022-05-10 NA      NA      NA      NA         NA         NA        
 9 b         0 2022-09-16 2022-09-02  2022-05-18 NA      NA      NA      NA         NA         NA        
10 b         1 2022-10-10 2022-10-19  2022-07-05 NA      NA      NA      NA         NA         NA        
11 c         0 2022-10-24 2022-10-26  2022-08-16 NA      NA      NA      NA         NA         NA        
12 c         1 2022-10-25 2022-11-10  2022-09-15 NA      NA      NA      NA         NA         NA        
13 c         1 2022-11-10 2022-11-20  2022-09-19 NA      NA      NA      NA         NA         NA        
14 c         0 2022-12-14 2022-12-14  2022-11-25 NA      NA      NA      NA         NA         NA        
15 c         0 2022-12-26 2022-12-21  2022-12-24 NA      NA      NA      NA         NA         NA     

Now update the values for the new variables

incident_value_columns <- grep( pattern = "incid$"   , names( new_das ))
incident_date_columns  <- grep( pattern = "incid_dt$", names( new_das ))

rows_to_update <- das$pheno_dt >= das$baseline_dt

new_das[ rows_to_update, incident_value_columns ] <- new_das[ rows_to_update, 'pheno' ]
new_das[ rows_to_update, incident_date_columns  ] <- new_das[ rows_to_update, 'pheno_dt' ]

new_das
# A tibble: 15 × 11
   id    pheno pheno_dt   baseline_dt other_dt   a_incid b_incid c_incid a_incid_dt b_incid_dt c_incid_dt
   <chr> <int> <date>     <date>      <date>       <int>   <int>   <int> <date>     <date>     <date>    
 1 a         1 2022-01-05 2022-01-04  2022-01-03       1       1       1 2022-01-05 2022-01-05 2022-01-05
 2 a         1 2022-01-20 2022-04-19  2022-01-05      NA      NA      NA NA         NA         NA        
 3 a        NA 2022-01-24 2022-05-16  2022-02-02      NA      NA      NA NA         NA         NA        
 4 a         1 2022-03-30 2022-05-26  2022-02-09      NA      NA      NA NA         NA         NA        
 5 a         0 2022-04-19 2022-06-07  2022-04-13      NA      NA      NA NA         NA         NA        
 6 b         0 2022-04-20 2022-07-16  2022-04-19      NA      NA      NA NA         NA         NA        
 7 b         1 2022-06-14 2022-08-03  2022-04-24      NA      NA      NA NA         NA         NA        
 8 b         0 2022-07-31 2022-08-14  2022-05-10      NA      NA      NA NA         NA         NA        
 9 b         0 2022-09-16 2022-09-02  2022-05-18       0       0       0 2022-09-16 2022-09-16 2022-09-16
10 b         1 2022-10-10 2022-10-19  2022-07-05      NA      NA      NA NA         NA         NA        
11 c         0 2022-10-24 2022-10-26  2022-08-16      NA      NA      NA NA         NA         NA        
12 c         1 2022-10-25 2022-11-10  2022-09-15      NA      NA      NA NA         NA         NA        
13 c         1 2022-11-10 2022-11-20  2022-09-19      NA      NA      NA NA         NA         NA        
14 c         0 2022-12-14 2022-12-14  2022-11-25       0       0       0 2022-12-14 2022-12-14 2022-12-14
15 c         0 2022-12-26 2022-12-21  2022-12-24       0       0       0 2022-12-26 2022-12-26 2022-12-26

  1. The non-standard-evaluation part

When you need to access something referenced by a combination of names and values of the parameters passed to a function, eval and sym can be used as follows:

example_within_a_function <- function(
    the_data
  , var_column_name
  , var_datestamp_column_name
  , baseline_column_name
  , other_column_name
){
  # Skip the first argument, which is the function, itself,
  # and get all the rest of the arguments,
  # which are passed parameters

  arguments <- match.call()[ -1 ] %>% as.list
  
  # Extract the value passed to each argument

  values <- seq( arguments ) %>% map_chr( ~rlang::as_string( arguments[[.]] ))
  
  # Return the names of the arguments, their values,
  # the data table (using non-standard evaluation), and
  # the data table (using a straight-forward reference).

  list(
      labels_within_function = names( arguments )
    , labels_in_parent_env   =            values
    , data                   = eval( sym( values[[ 1 ]] ))
    , also_data              = the_data
  )
}

example_within_a_function(
    the_data                  = das
  , var_column_name           = pheno
  , var_datestamp_column_name = pheno_dt
  , baseline_column_name      = baseline_dt
  , other_column_name         = other_dt
)

$labels_within_function
[1] "the_data"                  "var_column_name"           "var_datestamp_column_name" "baseline_column_name"     
[5] "other_column_name"        

$labels_in_parent_env
[1] "das"         "pheno"       "pheno_dt"    "baseline_dt" "other_dt"   

$data
# A tibble: 15 × 5
   id    pheno pheno_dt   baseline_dt other_dt  
   <chr> <int> <date>     <date>      <date>    
 1 a         1 2022-01-05 2022-01-04  2022-01-03
 2 a         1 2022-01-20 2022-04-19  2022-01-05
 3 a        NA 2022-01-24 2022-05-16  2022-02-02
 4 a         1 2022-03-30 2022-05-26  2022-02-09
 5 a         0 2022-04-19 2022-06-07  2022-04-13
 6 b         0 2022-04-20 2022-07-16  2022-04-19
 7 b         1 2022-06-14 2022-08-03  2022-04-24
 8 b         0 2022-07-31 2022-08-14  2022-05-10
 9 b         0 2022-09-16 2022-09-02  2022-05-18
10 b         1 2022-10-10 2022-10-19  2022-07-05
11 c         0 2022-10-24 2022-10-26  2022-08-16
12 c         1 2022-10-25 2022-11-10  2022-09-15
13 c         1 2022-11-10 2022-11-20  2022-09-19
14 c         0 2022-12-14 2022-12-14  2022-11-25
15 c         0 2022-12-26 2022-12-21  2022-12-24

$also_data
# A tibble: 15 × 5
   id    pheno pheno_dt   baseline_dt other_dt  
   <chr> <int> <date>     <date>      <date>    
 1 a         1 2022-01-05 2022-01-04  2022-01-03
 2 a         1 2022-01-20 2022-04-19  2022-01-05
 3 a        NA 2022-01-24 2022-05-16  2022-02-02
 4 a         1 2022-03-30 2022-05-26  2022-02-09
 5 a         0 2022-04-19 2022-06-07  2022-04-13
 6 b         0 2022-04-20 2022-07-16  2022-04-19
 7 b         1 2022-06-14 2022-08-03  2022-04-24
 8 b         0 2022-07-31 2022-08-14  2022-05-10
 9 b         0 2022-09-16 2022-09-02  2022-05-18
10 b         1 2022-10-10 2022-10-19  2022-07-05
11 c         0 2022-10-24 2022-10-26  2022-08-16
12 c         1 2022-10-25 2022-11-10  2022-09-15
13 c         1 2022-11-10 2022-11-20  2022-09-19
14 c         0 2022-12-14 2022-12-14  2022-11-25
15 c         0 2022-12-26 2022-12-21  2022-12-24

Karl Edwards
  • 305
  • 4