1

I have a dataset that looks something like this:

# Toy long-format data: two ids, each observed at times 0, 2, 3 and 4,
# with two random outcome columns.
df1 <- data.frame(
  id = rep(c("A1", "A2"), each = 4),
  time = rep(c(0, 2:4), times = 2),
  y1 = rnorm(8),
  y2 = rnorm(8)
)

For each of the y variables, I want to calculate their change since time==0. Basically, I want to do this:

#' Change of each outcome since time == 0 for a single id
#'
#' @param id A single id value; rows of `data` with `data$id == id` are used.
#' @param data A data frame with columns `id`, `time` and every column named
#'   in `outcomes`.
#' @param times Follow-up time points to compare against time 0
#'   (default `2:4`).
#' @param outcomes Names of the outcome columns (default `c("y1", "y2")`).
#' @return A data frame with columns `id`, `delta` (the follow-up time),
#'   `outcome` (the column name) and `change` (value at `delta` minus value at
#'   time 0), ordered by outcome and then by time. `change` is `NA` where the
#'   baseline row or the follow-up row is missing for this id.
calc_chage <- function(id, data, times = 2:4, outcomes = c("y1", "y2")) {
  stopifnot(
    is.data.frame(data),
    all(c("id", "time", outcomes) %in% names(data))
  )

  # which() drops NA comparisons, so rows with a missing id are never selected
  rows <- data[which(data$id == id), , drop = FALSE]

  # match() returns NA for an absent time point, which propagates to an NA
  # change instead of a length mismatch when the output is assembled
  baseline_row <- match(0, rows$time)
  follow_rows <- match(times, rows$time)

  change <- unlist(
    lapply(outcomes, function(col) {
      rows[[col]][follow_rows] - rows[[col]][baseline_row]
    }),
    use.names = FALSE
  )

  # Last expression is the value itself (not an assignment), so the result
  # is returned visibly
  data.frame(
    id = id,
    delta = rep(times, times = length(outcomes)),
    outcome = rep(outcomes, each = length(times)),
    change = change
  )
}

library(purrr)

# Apply calc_chage() to every distinct id and row-bind the per-id data frames.
# map_df() already binds the rows, so a second `map_df(bind_rows)` pass is
# redundant and `bind_rows` does not have to be attached.
changes <- map_df(.x = unique(df1$id), .f = calc_chage, data = df1)

My guess is that there is a more efficient way of doing this. Alas, I cannot think of it. Suggestions?

Ignacio
  • 7,646
  • 16
  • 60
  • 113
  • In your function there is `data` and `df`. Where is `df` in the example? I get an error with the `map` code: `Error: object of type 'closure' is not subsettable` – akrun Nov 28 '17 at 16:10
  • Sorry about that, df should have been data – Ignacio Nov 28 '17 at 16:26

3 Answers

2

To calculate the change since time == 0, you can use cumsum + diff. Since the summarised result for each group has more than one value, wrap it in a list first, then unnest it, and use gather to transform the result to long format:

library(tidyverse)
# Within each id, every remaining column (time, y1, y2) is replaced by the
# running sum of its successive differences, i.e. its change relative to the
# first row of the group.
changes2 <- df1 %>%
  group_by(id) %>%
  # each group yields several values, so hold them in a list-column for now
  summarise_all(~ list(cumsum(diff(.)))) %>%
  unnest() %>%
  rename(delta = time) %>%
  # long format: one row per id / delta / outcome
  gather(outcome, change, y1:y2) %>%
  arrange(id)

changes2
# A tibble: 12 x 4
#       id delta outcome     change
#   <fctr> <dbl>   <chr>      <dbl>
# 1     A1     2      y1  2.2827244
# 2     A1     3      y1  2.2070326
# 3     A1     4      y1  1.9530212
# 4     A1     2      y2 -2.1263046
# 5     A1     3      y2 -0.5430784
# 6     A1     4      y2 -0.3109535
# 7     A2     2      y1 -1.8587070
# 8     A2     3      y1 -1.1399270
# 9     A2     4      y1  1.5667202
#10     A2     2      y2 -2.0047108
#11     A2     3      y2 -3.4414667
#12     A2     4      y2 -1.3662450

# Bring the column types of `changes` in line with `changes2`
# (numeric delta, character outcome) before comparing the two results.
changes[["delta"]] <- as.numeric(changes[["delta"]])
changes[["outcome"]] <- as.character(changes[["outcome"]])
all.equal(as.data.frame(changes2), changes)
# [1] TRUE
Psidom
  • 209,562
  • 33
  • 339
  • 356
1

If you want to rely on base R functions, I find that aggregate() is a good alternative to the other solutions posted:

# Change in y2 relative to the first row of each id. simplify = FALSE keeps
# one numeric vector per id (a list column), whatever the group sizes are.
res <- aggregate(x = df1$y2, by = list(df1$id), FUN = function(x) x - x[1],
                 simplify = FALSE)[-1]
# unsplit() puts each group's values back at that id's original row positions,
# so delta lines up with df1 even when the rows are not sorted by id.
data.frame(df1, delta = unsplit(res[[1]], df1$id, drop = TRUE))

#   id time         y1          y2      delta
# 1 A1    0  0.9176567 -0.70469232  0.0000000
# 2 A1    2 -0.8258515  0.18032808  0.8850204
# 3 A1    3 -0.8144515 -0.39995370  0.3047386
# 4 A1    4  1.5171310 -0.97107643 -0.2663841
# 5 A2    0  0.1900048 -0.01022439  0.0000000
# 6 A2    2 -0.7181630  0.35408157  0.3643060
# 7 A2    3  0.1379936 -0.34336329 -0.3331389
# 8 A2    4  0.4773945  1.38467064  1.3948950
KenHBS
  • 6,756
  • 6
  • 37
  • 52
0

What if you just pulled out the value at t = 0? This can be generalized further to more y variables.

For example:

library(dplyr)
# Baseline (time == 0) value of each outcome, keyed by id.
# NOTE(review): `data` stands for your data frame (e.g. df1 from the
# question); assumes at most one time == 0 row per id.
t0 <- data %>%
  filter(time == 0) %>%
  select(id, t0_y1 = y1, t0_y2 = y2)

# Attach the baseline to every row of the same id, then subtract it.
# The baseline columns are named t0_y1 / t0_y2 consistently, so the
# mutate() below can find both of them.
data <- data %>%
  left_join(t0, by = "id") %>%
  mutate(change.y1 = y1 - t0_y1,
         change.y2 = y2 - t0_y2)
user2602640
  • 640
  • 6
  • 21