3

I have survey data that require several case specific recodes that I need to perform, but I want to avoid creating a new line of code for each and every recode (because there will be dozens). I am hoping there is a way I can recode the data using a crosswalk that only recodes the value for which is required based on the su_id and the Q# that needs the recode.

su_id <- 100001:100010
Q1 <- c(1, 2, 5, 6, 2, 3, 4, 2, 1, 6)
Q2 <- c(2, 4, 6, 4, 3, 6, 2, 1, 6, 5)

data <- data.frame(su_id, Q1, Q2)

su_id <- c( 100004, 100010, 100003, 100006, 100009)
var <- c("Q1", "Q1", "Q2", "Q2", "Q2")
newVal <- c(4, 4, 5, 5, 5)

cw <- data.frame(su_id, var, newVal)

#data: 
    su_id Q1 Q2
1  100001  1  2
2  100002  2  4
3  100003  5  6
4  100004  6  4
5  100005  2  3
6  100006  3  6
7  100007  4  2
8  100008  2  1
9  100009  1  6
10 100010  6  5

#Crosswalk: 
   su_id var newVal
1 100004  Q1      4
2 100010  Q1      4
3 100003  Q2      5
4 100006  Q2      5
5 100009  Q2      5

I started trying to iterate on something like this, but obviously this won't do the trick, but hopefully this gives an idea of what I am trying to accomplish. Can anyone advise on how/if this is possible?

su_idToChange <- cw$su_id
varToChange <- cw$var
newValToChange <- cw$newVal

for(i in su_idToChange) {
  data_new <- data %>%
    mutate(across(all_of(varToChange), case_when(su_id %in% su_idToChange
             ~ coalesce(deframe(cw[cw$var == "Q1" | cw$var == "Q2", ][-1])[.], .))))
}

Thank you!

ThomasIsCoding
  • 96,636
  • 9
  • 24
  • 81
Quinterpret
  • 133
  • 6

4 Answers4

4

If I understood your question right, you are trying to perform something like this:

my_change_fun <- function(data, cw) {
  for (i in seq_len(nrow(cw))) {
    
    data[data$su_id == cw[i, 1], cw[i, 2]] <- cw[i, 3]
  }
  data
}

my_change_fun(data, cw)
#>     su_id Q1 Q2
#> 1  100001  1  2
#> 2  100002  2  4
#> 3  100003  5  5
#> 4  100004  4  4
#> 5  100005  2  3
#> 6  100006  3  5
#> 7  100007  4  2
#> 8  100008  2  1
#> 9  100009  1  5
#> 10 100010  4  5

Created on 2021-08-30 by the reprex package (v2.0.1)

Martin Gal
  • 16,640
  • 5
  • 21
  • 39
3

We could also do a left_join with 'cw' based on the 'su_id' columns, then loop across the 'Q1', 'Q2' column, replace where the 'var' column values are matching the column name (cur_column()) by coalesceing the 'newVal' with the column value or else return the column value

library(dplyr)
data %>%
    left_join(cw) %>%
    transmute(su_id, across(c(Q1, Q2), ~ case_when(var %in% cur_column() ~ 
        coalesce(newVal, .), TRUE ~ .)))

-output

   su_id Q1 Q2
1  100001  1  2
2  100002  2  4
3  100003  5  5
4  100004  4  4
5  100005  2  3
6  100006  3  5
7  100007  4  2
8  100008  2  1
9  100009  1  5
10 100010  4  5

Or an option with data.table

library(data.table)
setDT(data)[cw, c("Q1", "Q2") := .(fcoalesce(newVal[match('Q1', 
    var)], Q1),  fcoalesce(newVal[match("Q2", var)], Q2)), 
     on = .(su_id), by = .EACHI]

-output

> data
     su_id Q1 Q2
 1: 100001  1  2
 2: 100002  2  4
 3: 100003  5  5
 4: 100004  4  4
 5: 100005  2  3
 6: 100006  3  5
 7: 100007  4  2
 8: 100008  2  1
 9: 100009  1  5
10: 100010  4  5
akrun
  • 874,273
  • 37
  • 540
  • 662
3

A base R option using merge + reshape

transform(
  merge(
    data,
    reshape(
      cw,
      direction = "wide",
      idvar = "su_id",
      timevar = "var"
    ),
    all = TRUE
  ),
  Q1 = replace(Q1, !is.na(newVal.Q1), na.omit(newVal.Q1)),
  Q2 = replace(Q2, !is.na(newVal.Q2), na.omit(newVal.Q2))
)[names(data)]

gives

    su_id Q1 Q2
1  100001  1  2
2  100002  2  4
3  100003  5  5
4  100004  4  4
5  100005  2  3
6  100006  3  5
7  100007  4  2
8  100008  2  1
9  100009  1  5
10 100010  4  5
ThomasIsCoding
  • 96,636
  • 9
  • 24
  • 81
2

Here is a tidyverse solution you could use:

library(dplyr)
library(tidyr)

cw %>%
  pivot_wider(names_from = var, values_from = newVal) %>%
  right_join(data, by = "su_id") %>%
  mutate(across(ends_with(".x"), ~ coalesce(.x, get(gsub("\\.x", "\\.y", cur_column()))))) %>%
  select(!ends_with(".y")) %>%
  arrange(su_id) %>%
  rename_with(~ gsub("\\.x", "", .), ends_with(".x"))

# A tibble: 10 x 3
    su_id    Q1    Q2
    <dbl> <dbl> <dbl>
 1 100001     1     2
 2 100002     2     4
 3 100003     5     5
 4 100004     4     4
 5 100005     2     3
 6 100006     3     5
 7 100007     4     2
 8 100008     2     1
 9 100009     1     5
10 100010     4     5
Anoushiravan R
  • 21,622
  • 3
  • 18
  • 41