1

I need to reshape a df, complete it with missing years and create a variable tracking status change. The issue is that some values are missing and the code I wrote brakes down on those.

Toy example:

library(data.table)
df <- data.frame(id=c(1,2),phase_1=c(1994,1994),phase_2=c(1996,1996),phase_3=c(1997,NA))
df1 = melt(df, 
         id.vars = "id",
         measure.vars = c("phase_1", "phase_2", "phase_3"),
         variable.name = "status",
         value.name = "year",
         na.rm = FALSE)
df2 <- df1 %>%  complete(id, year = full_seq(year, 1)) %>% 
  fill(status)

Desired

  id year   phase change
1  1 1994 phase_1      0
2  1 1995 phase_1      0
3  1 1996 phase_2      1
4  1 1997 phase_3      1
5  2 1994 phase_1      0
6  2 1995 phase_1      0
7  2 1996 phase_2      1
8  2 1997 phase_2      0
zx8754
  • 52,746
  • 12
  • 114
  • 209
MCS
  • 1,071
  • 9
  • 23

2 Answers2

2

With dplyr and tidyr, you can also do:

df %>%
 gather(phase, year, -id, na.rm = TRUE) %>%
 complete(id, year = full_seq(year, 1)) %>%
 fill(phase) %>%
 group_by(id) %>%
 mutate(change = as.numeric(phase != lag(phase, default = first(phase))))

     id  year phase   change
  <dbl> <dbl> <chr>    <dbl>
1     1  1994 phase_1      0
2     1  1995 phase_1      0
3     1  1996 phase_2      1
4     1  1997 phase_3      1
5     2  1994 phase_1      0
6     2  1995 phase_1      0
7     2  1996 phase_2      1
8     2  1997 phase_2      0

Or:

 df %>%
 gather(phase, year, -id, na.rm = TRUE) %>%
 complete(id, year = full_seq(year, 1)) %>%
 fill(phase) %>%
 group_by(id) %>%
 mutate(change = (phase != lag(phase, default = first(phase))) * 1)
tmfmnk
  • 38,881
  • 4
  • 47
  • 67
1

You can use dplyr and tidyr as:

library(dplyr)
library(tidyr)
df %>%
  gather(phase, year, phase_1:phase_3) %>%
  filter(!is.na(year)) %>%
  complete(id, year = full_seq(year, 1)) %>%
  mutate(phase = ifelse(is.na(phase), lag(phase,1), phase)) %>%
  group_by(id) %>%  
  mutate(change = ifelse(phase == lag(phase, 1) | row_number() == 1, 0, 1)) 
# A tibble: 8 x 4
# Groups:   id [2]
     id  year phase   change
  <dbl> <dbl> <chr>    <dbl>
1     1  1994 phase_1      0
2     1  1995 phase_1      0
3     1  1996 phase_2      1
4     1  1997 phase_3      1
5     2  1994 phase_1      0
6     2  1995 phase_1      0
7     2  1996 phase_2      1
8     2  1997 phase_2      0
Sonny
  • 3,083
  • 1
  • 11
  • 19
  • The solution works with the toy example provided, though if missing data for consecutive phases are present it does not work. properly. df <- data.frame(id=c(1,2),phase_1=c(1994,1994),phase_2=c(1996,1996),phase_3=c(1997,NA),phase_4=c(1998,NA)) – MCS May 19 '19 at 09:11