4

Let me share an example of what I'm trying to do, since the title may not be as clear as I'd like it to be.

data <- tibble(week=1:10,name=c(rep("Joe",10)),value=c(.9,.89,.99,.98,.87,.89,.93,.92,.98,.9),
               wanted = c("Yes","Skip","No","No","Yes","Skip","Yes","Skip","No","Yes"))

data <- data %>% mutate(my_attempt = case_when( week-lag(week)==1 & 
                                          value < .95 &
                                          lag(value) < .95 &
                                          lag(value,2) >= .95 &
                                          !is.na(lag(value,2))~ "Skip",
                                        week-lag(week)==1 & 
                                          value < .95 &
                                          lag(value) < .95 &
                                          is.na(lag(value,2))~ "Skip",
                                        value < .95 ~"Yes",
                                        TRUE ~ "No"))    

 #   week name  value wanted my_attempt
 #  <int> <chr> <dbl> <chr>  <chr>     
 #     1 Joe    0.9  Yes    Yes       
 #     2 Joe    0.89 Skip   Skip      
 #     3 Joe    0.99 No     No        
 #     4 Joe    0.98 No     No        
 #     5 Joe    0.87 Yes    Yes       
 #     6 Joe    0.89 Skip   Skip      
 #     7 Joe    0.93 Yes    Yes       
 #     8 Joe    0.92 Skip   Yes       
 #     9 Joe    0.98 No     No        
 #    10 Joe    0.9  Yes    Yes    

I am trying to get the my_attempt column to produce the results of the wanted column. I want to identify rows when the value is less than a certain threshold, but there can't be two consecutive "yes" values. My attempt at it works until it sees 4 or more low values in a row. In my real data some weeks may be missing but that can be treated as a "No". For example, if week 6 was missing it would still be okay for week 7 to be "Yes" (I think the first line in my case when takes care of this). Is there a way to do this in R? It doesn't have to be consistent with dplyr but it would be nice if it's possible within tidyverse.

AnilGoyal
  • 25,297
  • 4
  • 27
  • 45

3 Answers3

2

I think you can use purrr:accumulate() here

library(purrr)
library(dplyr)

data%>%mutate(my_attempt = ifelse(week-lag(week, default = 0)==1 & 
                                          value < .95,
                                  'Yes', 'No')%>%
        accumulate(~ifelse(.x==.y & .y=='Yes', 'Skip', .y)))

# A tibble: 10 x 5
    week name  value wanted my_attempt
   <int> <chr> <dbl> <chr>  <chr>     
 1     1 Joe    0.9  Yes    Yes       
 2     2 Joe    0.89 Skip   Skip      
 3     3 Joe    0.99 No     No        
 4     4 Joe    0.98 No     No        
 5     5 Joe    0.87 Yes    Yes       
 6     6 Joe    0.89 Skip   Skip      
 7     7 Joe    0.93 Yes    Yes       
 8     8 Joe    0.92 Skip   Skip      
 9     9 Joe    0.98 No     No        
10    10 Joe    0.9  Yes    Yes 
GuedesBF
  • 8,409
  • 5
  • 19
  • 37
2

I would have done it with rolling computation library like slider where missing data can be indexed well. Showing you on modified data

library(tidyverse)
data <- tibble(week=c(1:5, 7:10),name=c(rep("Joe",9)),value=c(.9,.89,.99,.98,.87,.93,.92,.98,.9),
               wanted = c("Yes","Skip","No","No","Yes","Yes","Skip","No","Yes"))

data
#> # A tibble: 9 x 4
#>    week name  value wanted
#>   <int> <chr> <dbl> <chr> 
#> 1     1 Joe    0.9  Yes   
#> 2     2 Joe    0.89 Skip  
#> 3     3 Joe    0.99 No    
#> 4     4 Joe    0.98 No    
#> 5     5 Joe    0.87 Yes   
#> 6     7 Joe    0.93 Yes   
#> 7     8 Joe    0.92 Skip  
#> 8     9 Joe    0.98 No    
#> 9    10 Joe    0.9  Yes
library(slider)

data %>% group_by(name) %>%
  mutate(wanted2 = case_when(value < 0.95 & slide_index_lgl(.x = value, 
                                                            .i = week, 
                                                            .f = ~  any(.x < 0.95), 
                                                            .before = 1, 
                                                            .after = -1) ~ 'skip',
                             value < 0.95 ~ 'yes',
                             TRUE ~ 'no'))
#> # A tibble: 9 x 5
#> # Groups:   name [1]
#>    week name  value wanted wanted2
#>   <int> <chr> <dbl> <chr>  <chr>  
#> 1     1 Joe    0.9  Yes    yes    
#> 2     2 Joe    0.89 Skip   skip   
#> 3     3 Joe    0.99 No     no     
#> 4     4 Joe    0.98 No     no     
#> 5     5 Joe    0.87 Yes    yes    
#> 6     7 Joe    0.93 Yes    yes    
#> 7     8 Joe    0.92 Skip   skip   
#> 8     9 Joe    0.98 No     no     
#> 9    10 Joe    0.9  Yes    yes

Even it can be done without using slider i.e. in dplyr only

library(dplyr)
data %>% group_by(name) %>%
  mutate(wanted2 = case_when(value < 0.95 & lag(value, default = 1) < 0.95 & week - 1 == lag(week, default = 0) ~ 'Skip',
                             value < 0.95 ~ 'Yes',
                             TRUE ~ 'No'))

#> # A tibble: 9 x 5
#> # Groups:   name [1]
#>    week name  value wanted wanted2
#>   <int> <chr> <dbl> <chr>  <chr>  
#> 1     1 Joe    0.9  Yes    Yes    
#> 2     2 Joe    0.89 Skip   Skip   
#> 3     3 Joe    0.99 No     No     
#> 4     4 Joe    0.98 No     No     
#> 5     5 Joe    0.87 Yes    Yes    
#> 6     7 Joe    0.93 Yes    Yes    
#> 7     8 Joe    0.92 Skip   Skip   
#> 8     9 Joe    0.98 No     No     
#> 9    10 Joe    0.9  Yes    Yes

Created on 2021-07-25 by the reprex package (v2.0.0)

AnilGoyal
  • 25,297
  • 4
  • 27
  • 45
1

Here is a simple dplyr solution:

library(dplyr)

data %>%
  mutate(grp = cummax(week - lag(week, default = 0))) %>%
  group_by(name, grp) %>%
  mutate(my_attempt = ifelse(value < 0.95 & lag(value, default = 1) < 0.95, "Skip", 
                             ifelse(value < 0.95 & lag(value, default = 1) >= 0.95, 
                                    "Yes", "No")))

# A tibble: 9 x 6
# Groups:   name, grp [2]
   week name  value wanted   grp my_attempt
  <int> <chr> <dbl> <chr>  <dbl> <chr>     
1     1 Joe    0.9  Yes        1 Yes       
2     2 Joe    0.89 Skip       1 Skip      
3     3 Joe    0.99 No         1 No        
4     4 Joe    0.98 No         1 No        
5     5 Joe    0.87 Yes        1 Yes       
6     7 Joe    0.93 Yes        2 Yes       
7     8 Joe    0.92 Skip       2 Skip      
8     9 Joe    0.98 No         2 No        
9    10 Joe    0.9  Yes        2 Yes 

And here is how you could do it with base::Reduce on a data set with missing week values. I first created a grouping grp based on the difference between week values and then split the data set based on the grouping variable. After that I applied our function on every chunk and bind the result with rbind:

do.call(rbind, lapply(split(data, cummax(abs(data$week - c(0, data$week[-nrow(data)]))), data$name), 
                      \(x){
                        x$my_attept <- Reduce(function(a, b) {
                          if(x$value[b] < 0.95 & a != "Yes") {
                            "Yes"
                          } else if(x$value[b] < 0.95 & a == "Yes") {
                            "Skip"
                          } else {
                            "No"
                          }
                        }, 2:nrow(x), init = ifelse(x$value[1] < 0.95, "Yes", "No"), accumulate = TRUE)
                        x
                      }))

# A tibble: 9 x 5
   week name  value wanted my_attept
* <int> <chr> <dbl> <chr>  <chr>    
1     1 Joe    0.9  Yes    Yes      
2     2 Joe    0.89 Skip   Skip     
3     3 Joe    0.99 No     No       
4     4 Joe    0.98 No     No       
5     5 Joe    0.87 Yes    Yes      
6     7 Joe    0.93 Yes    Yes      
7     8 Joe    0.92 Skip   Skip     
8     9 Joe    0.98 No     No       
9    10 Joe    0.9  Yes    Yes 

In case you have missing weeks in your data like the modified data set here you can use the following solution. We first group weeks based on their consecutive values and then apply our solution on each group:

data %>%
  mutate(grp = cummax(week - lag(week, default = 0))) %>%
  group_by(name, grp) %>%
  mutate(my_attept = accumulate(value[-1], .init = ifelse(value[1] < 0.95, "Yes", "No"),
                                ~ if(.y < 0.95 & .x != "Yes") {
                                  "Yes"
                                } else if(.y < 0.95 & .x == "Yes") {
                                  "Skip"
                                } else {
                                  "No"
                                }))

# A tibble: 9 x 6
# Groups:   grp [2]
   week name  value wanted   grp my_attept
  <int> <chr> <dbl> <chr>  <dbl> <chr>    
1     1 Joe    0.9  Yes        1 Yes      
2     2 Joe    0.89 Skip       1 Skip     
3     3 Joe    0.99 No         1 No       
4     4 Joe    0.98 No         1 No       
5     5 Joe    0.87 Yes        1 Yes      
6     7 Joe    0.93 Yes        2 Yes      
7     8 Joe    0.92 Skip       2 Skip     
8     9 Joe    0.98 No         2 No       
9    10 Joe    0.9  Yes        2 Yes 

Data

structure(list(week = c(1L, 2L, 3L, 4L, 5L, 7L, 8L, 9L, 10L), 
    name = c("Joe", "Joe", "Joe", "Joe", "Joe", "Joe", "Joe", 
    "Joe", "Joe"), value = c(0.9, 0.89, 0.99, 0.98, 0.87, 0.93, 
    0.92, 0.98, 0.9), wanted = c("Yes", "Skip", "No", "No", "Yes", 
    "Yes", "Skip", "No", "Yes")), row.names = c(NA, -9L), class = c("tbl_df", 
"tbl", "data.frame"))
Anoushiravan R
  • 21,622
  • 3
  • 18
  • 41