0

Given:

df <- data.frame(group = c(rep("a", 11),
                           rep("b", 7)),
                 dates = as.Date(c("2019-05-10", "2019-08-23", "2020-08-21", "2021-08-27", "2021-10-04", "2021-12-23", "2022-06-30", "2005-03-07", "2006-03-30", "2006-11-05", "2007-07-14",
                                   "2021-01-21,", "2021-07-06", "2021-09-27", "2021-12-11", "2022-04-28", "2022-07-27", "2022-10-14")),
                 dates_2 = as.Date(c(NA, NA, NA, "2021-08-27", NA, NA, NA, "2005-03-07", NA, NA, NA,
                                     NA, "2021-07-06", NA, NA, NA, "2022-07-27", NA)),
                 counts = c(NA, NA, NA, 3, NA, NA, NA, 1, NA, NA, NA,
                            NA, 5, NA, NA, NA, 4, NA),
                 var = c("go", "go", "stop", NA, "go", "go", "go", NA, "stop", "stop", "go",
                         "go", NA, "suspend", "go", "go", NA, "go"))
df
#    group      dates    dates_2 counts     var
# 1      a 2019-05-10       <NA>     NA      go
# 2      a 2019-08-23       <NA>     NA      go
# 3      a 2020-08-21       <NA>     NA    stop
# 4      a 2021-08-27 2021-08-27      3    <NA>
# 5      a 2021-10-04       <NA>     NA      go
# 6      a 2021-12-23       <NA>     NA      go
# 7      a 2022-06-30       <NA>     NA      go
# 8      a 2005-03-07 2005-03-07      1    <NA>
# 9      a 2006-03-30       <NA>     NA    stop
# 10     a 2006-11-05       <NA>     NA    stop
# 11     a 2007-07-14       <NA>     NA      go
# 12     b 2021-01-21       <NA>     NA      go
# 13     b 2021-07-06 2021-07-06      5    <NA>
# 14     b 2021-09-27       <NA>     NA suspend
# 15     b 2021-12-11       <NA>     NA      go
# 16     b 2022-04-28       <NA>     NA      go
# 17     b 2022-07-27 2022-07-27      4    <NA>
# 18     b 2022-10-14       <NA>     NA      go

I want to collapse/summarise the dataframe by group so that all the NA rows in var collapse by the row above or below it that doesn't contain go. In the event that the row below and above are both go, it will collapse by the top one.

Desired output:

#    group      dates    dates_2 counts     var
# 1      a 2019-05-10       <NA>     NA      go
# 2      a 2019-08-23       <NA>     NA      go
# 3      a 2020-08-21 2021-08-27      3    stop
# 4      a 2021-10-04       <NA>     NA      go
# 5      a 2021-12-23       <NA>     NA      go
# 6      a 2022-06-30       <NA>     NA      go
# 7      a 2006-03-30 2005-03-07      1    stop
# 8      a 2006-11-05       <NA>     NA    stop
# 9      a 2007-07-14       <NA>     NA      go
# 10     b 2021-01-21       <NA>     NA      go
# 11     b 2021-09-27 2021-07-06      5 suspend
# 12     b 2021-12-11       <NA>     NA      go
# 13     b 2022-04-28 2022-07-27      4      go
# 14     b 2022-10-14       <NA>     NA      go

I think something along the lines of this will work but the cumsum here is too simple for my case as flast only works for some groupings:

library(tidyverse)
library(collapse) #working on a large dataset
df <- df %>% 
  group_by(group) %>% 
  mutate(var_indicator = cumsum(!is.na(var)))
df_collapse <- collap(df, ~ group + var_indicator, custom = list(ffirst = c("dates", "var"), 
                                                                 flast = c("dates_2", "counts")))
df_collapse

Maybe there are better approaches?

Thanks

user63230
  • 4,095
  • 21
  • 43
  • 1
    Based on your example, you would just need to compute the lagged `var` value per group and then filter out the NA `var` values. – tmfmnk Jul 24 '23 at 20:45
  • do you mean: `df %>% group_by(group) %>% mutate(var_indicator = lag(var)) %>% filter(!is.na(var))`? That doesn't give the desired output though – user63230 Jul 25 '23 at 10:24

1 Answers1

1

This is what you want:

df |> 
    mutate(across(-c(var, dates), ~ case_when(
        is.na(lead(var)) & var != "go" ~ lead(.), # if the next var is NA and the current var is not "go", fill with the next value
        is.na(var) ~ ., # if the current var is NA, keep values the same
        is.na(lag(var)) & lag(lag(var)) == "go" & var != "go" ~ lag(.), # if the previous var is NA and the var before that one is "go" and the current var is not "go", fill with the previous value
        is.na(lead(var)) & var == "go" & lead(lead(var)) == "go" ~ lead(.), # if the next var is NA and the current var is "go" and the var after the next one is "go", fill with the next value
        TRUE ~ . # everything else, don't change anything
    )), .by = group) |>
    filter(!is.na(var))

# Output:
   group      dates    dates_2 counts     var
1      a 2019-05-10       <NA>     NA      go
2      a 2019-08-23       <NA>     NA      go
3      a 2020-08-21 2021-08-27      3    stop
4      a 2021-10-04       <NA>     NA      go
5      a 2021-12-23       <NA>     NA      go
6      a 2022-06-30       <NA>     NA      go
7      a 2006-03-30 2005-03-07      1    stop
8      a 2006-11-05       <NA>     NA    stop
9      a 2007-07-14       <NA>     NA      go
10     b 2021-01-21       <NA>     NA      go
11     b 2021-09-27 2021-07-06      5 suspend
12     b 2021-12-11       <NA>     NA      go
13     b 2022-04-28 2022-07-27      4      go
14     b 2022-10-14       <NA>     NA      go
Mark
  • 7,785
  • 2
  • 14
  • 34