using lag results within the same mutate function dplyr

Question

I want to replicate the below formula R using dplyr + lag function. The code works till the 2nd row of each group and then onward gives me 0s

forecast = lag(value,1)*(1-lag(Attrition)/52)

Conditions:

the first value for forecast should be empty as we already have the Value.
second row calculates from the previous values of Attrition and Value columns.
third row onward the previous values should be picked from forecast(not Value column) and attrition columns respectively.

I am getting 0's from 3rd row onward. Below is my code for reproducing.

data <- data %>% group_by(Patch) %>% mutate(id = row_number())
data <- data %>% group_by(Patch) %>% mutate(forecast = lag(Value,1)*(1-lag(Attrition,1)/52))

tbl_df(data)
# A tibble: 12 x 6
   Patch Week       Value Attrition    id forecast
   <chr> <date>     <dbl>     <dbl> <int>    <dbl>
 1 11P11 2021-06-14     2     0.075     1   NA    
 2 11P11 2021-06-21     0     0.075     2    2.00 
 3 11P11 2021-06-28     0     0.075     3    0    
 4 11P12 2021-06-14     3     0.075     1   NA    
 5 11P12 2021-06-21     0     0.075     2    3.00 
 6 11P12 2021-06-28     0     0.075     3    0    
 7 11P12 2021-07-05     0     0.075     4    0    
 8 11P13 2021-06-14     1     0.075     1   NA    
 9 11P13 2021-06-21     0     0.075     2    0.999
10 11P13 2021-06-28     0     0.075     3    0    
11 11P13 2021-07-05     0     0.075     4    0    
12 11P13 2021-07-12     0     0.075     5    0   


> dput(data)
structure(list(Patch = c("11P11", "11P11", "11P11", "11P12", 
"11P12", "11P12", "11P12", "11P13", "11P13", "11P13", "11P13", 
"11P13"), Week = structure(c(18792, 18799, 18806, 18792, 18799, 
18806, 18813, 18792, 18799, 18806, 18813, 18820), class = "Date"), 
    Value = c(2, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 0), Attrition = c(0.075, 
    0.075, 0.075, 0.075, 0.075, 0.075, 0.075, 0.075, 0.075, 0.075, 
    0.075, 0.075), id = c(1L, 2L, 3L, 1L, 2L, 3L, 4L, 1L, 2L, 
    3L, 4L, 5L), forecast = c(NA, 1.99711538461538, 0, NA, 2.99567307692308, 
    0, 0, NA, 0.998557692307692, 0, 0, 0)), row.names = c(NA, 
-12L), groups = structure(list(Patch = c("11P11", "11P12", "11P13"
), .rows = structure(list(1:3, 4:7, 8:12), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), row.names = c(NA, -3L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

Please share your datasets in a copy-pastable format with `dput(YOUR_DATASET)` or smaller samples with `dput(head(YOUR_DATASET))`. — ktiu, Jun 17 '21 at 10:18

Anoushiravan R · Answer 1 · 2021-07-25T16:23:01.880

Updated Solution

Here is a simple solution using base::Reduce:

do.call(rbind, lapply(split(df, df$Patch), function(x) {
  x$forecast <- c(NA, Reduce(function(a, b) {
    a * (1 - (x$Attrition[b]/52))
  }, 2:(nrow(x)-1), init = x$Value[1], accumulate = TRUE))
  x
}))

   Patch       Week Value Attrition id  forecast
1  11P11 2021-06-14     2     0.075  1        NA
2  11P11 2021-06-21     0     0.075  2 2.0000000
3  11P11 2021-06-28     0     0.075  3 1.9971154
4  11P12 2021-06-14     3     0.075  1        NA
5  11P12 2021-06-21     0     0.075  2 3.0000000
6  11P12 2021-06-28     0     0.075  3 2.9956731
7  11P12 2021-07-05     0     0.075  4 2.9913524
8  11P13 2021-06-14     1     0.075  1        NA
9  11P13 2021-06-21     0     0.075  2 1.0000000
10 11P13 2021-06-28     0     0.075  3 0.9985577
11 11P13 2021-07-05     0     0.075  4 0.9971175
12 11P13 2021-07-12     0     0.075  5 0.9956793

Earlier Approach

You can also use the following approach. For this I first applied your formula with mutate on your data set to get the first value of my forecast series. Then I sliced the first rows of each group that contains NA values for forecast out. After that I used accumulate function to calculate your desired series using first forecast value as the value for .init argument. Then I bind the resulting data set with the one containing NA values:

library(dplyr)
library(purrr)

df %>%
  group_by(Patch) %>%
  mutate(forecast = lag(Value)*(1-(lag(Attrition)/52))) %>%
  filter(between(row_number(), 2, n())) %>%
  mutate(forecast = accumulate(Attrition[-1], .init = forecast[1], ~ ..1 * (1-(..2/52)))) %>%
  bind_rows(df %>% group_by(Patch) %>%
              mutate(forecast = lag(Value)*(1-(lag(Attrition)/52))) %>%
              slice_head()) %>%
  ungroup() %>%
  arrange(Patch, Week)

# A tibble: 12 x 6
   Patch Week       Value Attrition    id forecast
   <chr> <date>     <dbl>     <dbl> <int>    <dbl>
 1 11P11 2021-06-14     2     0.075     1   NA    
 2 11P11 2021-06-21     0     0.075     2    2.00 
 3 11P11 2021-06-28     0     0.075     3    1.99 
 4 11P12 2021-06-14     3     0.075     1   NA    
 5 11P12 2021-06-21     0     0.075     2    3.00 
 6 11P12 2021-06-28     0     0.075     3    2.99 
 7 11P12 2021-07-05     0     0.075     4    2.99 
 8 11P13 2021-06-14     1     0.075     1   NA    
 9 11P13 2021-06-21     0     0.075     2    0.999
10 11P13 2021-06-28     0     0.075     3    0.997
11 11P13 2021-07-05     0     0.075     4    0.996
12 11P13 2021-07-12     0     0.075     5    0.994

Very cool! I was trying to make `accumulate()` work for this. — ktiu, Jun 17 '21 at 16:09
Thank you very much dear @ktiu . I took a glance at your solution and tried to come up with a different approach and came up with this. Although it may sound strange, it works since the `forecast` value for first rows are always `NA`. Your solution is quite subtle and technical. I am also a big fan of your coding style. — Anoushiravan R, Jun 17 '21 at 16:29

AnilGoyal · Answer 2 · 2021-06-18T06:56:13.400

If I am understanding you correctly, perhaps you need only accumulate from purrr (you don't need lag values but accumulated values instead)-

I calculated FORECAST as per formula given
Used only attrition in argument because we need only first value of Value which we can supply to accumulate through .init
Now resultant vector will be one length more than desired so stripped its last -n() value.
But your further requirement is to have first result as NA, so stripped the result of one more value i.e. first value by subsetting accumulate as [-c(1, n()]
Now concatenated the results with NA in beginning

library(tidyverse)

df %>% group_by(Patch) %>%
  mutate(FORECAST = c(NA, accumulate(Attrition, .init = first(Value), ~ .x * (1 - .y/52))[-c(1, n())]))

#> # A tibble: 12 x 7
#> # Groups:   Patch [3]
#>    Patch Week       Value Attrition    id forecast FORECAST
#>    <chr> <date>     <dbl>     <dbl> <int>    <dbl>    <dbl>
#>  1 11P11 2021-06-14     2     0.075     1   NA       NA    
#>  2 11P11 2021-06-21     0     0.075     2    2.00     2.00 
#>  3 11P11 2021-06-28     0     0.075     3    0        1.99 
#>  4 11P12 2021-06-14     3     0.075     1   NA       NA    
#>  5 11P12 2021-06-21     0     0.075     2    3.00     3.00 
#>  6 11P12 2021-06-28     0     0.075     3    0        2.99 
#>  7 11P12 2021-07-05     0     0.075     4    0        2.98 
#>  8 11P13 2021-06-14     1     0.075     1   NA       NA    
#>  9 11P13 2021-06-21     0     0.075     2    0.999    0.999
#> 10 11P13 2021-06-28     0     0.075     3    0        0.997
#> 11 11P13 2021-07-05     0     0.075     4    0        0.996
#> 12 11P13 2021-07-12     0     0.075     5    0        0.993

^{Created on 2021-06-18 by the reprex package (v2.0.0)}

score 1 · Answer 3 · answered Jun 17 '21 at 15:00

What's tricky about this is that you need to consecutively build the forecast variable, which is why it won't work in a standard mutate() call.

Here is my approach that relies on purrr's map() and reduce() for data consolidation:

library(tidyverse)

data %>%
  mutate(forecast = NA) %>%
  split(~ Patch) %>%
  map(~ .x %>%
          pmap(~ tibble(...)) %>%
          reduce(\(.x, .y) {
            prev <- slice_tail(.x)
            base_value <- ifelse(prev$Value != 0, prev$Value, prev$forecast)
            bind_rows(.x,
                      mutate(.y,
                             forecast = base_value * 1 - prev$Attrition / 5))
          })) %>%
  reduce(bind_rows)

Returns:

# A tibble: 12 x 6
   Patch Week       Value Attrition    id forecast
   <chr> <date>     <dbl>     <dbl> <int>    <dbl>
 1 11P11 2021-06-14     2     0.075     1   NA
 2 11P11 2021-06-21     0     0.075     2    1.98
 3 11P11 2021-06-28     0     0.075     3    1.97
 4 11P12 2021-06-14     3     0.075     1   NA
 5 11P12 2021-06-21     0     0.075     2    2.98
 6 11P12 2021-06-28     0     0.075     3    2.97
 7 11P12 2021-07-05     0     0.075     4    2.95
 8 11P13 2021-06-14     1     0.075     1   NA
 9 11P13 2021-06-21     0     0.075     2    0.985
10 11P13 2021-06-28     0     0.075     3    0.97
11 11P13 2021-07-05     0     0.075     4    0.955
12 11P13 2021-07-12     0     0.075     5    0.94

Data used:

data <- structure(list(Patch = c("11P11", "11P11", "11P11", "11P12", "11P12", "11P12", "11P12", "11P13", "11P13", "11P13", "11P13", "11P13"), Week = structure(c(18792, 18799, 18806, 18792, 18799, 18806, 18813, 18792, 18799, 18806, 18813, 18820), class = "Date"), Value = c(2, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0, 0), Attrition = c(0.075, 0.075, 0.075, 0.075, 0.075, 0.075, 0.075, 0.075, 0.075, 0.075, 0.075, 0.075), id = c(1L, 2L, 3L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 5L), forecast = c(NA, 1.99711538461538, 0, NA, 2.99567307692308, 0, 0, NA, 0.998557692307692, 0, 0, 0)), row.names = c(NA, -12L), groups = structure(list(Patch = c("11P11", "11P12", "11P13"), .rows = structure(list(1:3, 4:7, 8:12), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", "list"))), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df", "tbl", "data.frame"))

using lag results within the same mutate function dplyr

3 Answers3