I have some data which looks like:
date ID var1 var2 var3
<date> <chr> <dbl> <dbl> <dbl>
1 2005-02-22 5D0EAE -0.682 -0.682 -0.682
2 2005-04-29 5D0EAE 0.458 0.458 0.458
3 2005-06-28 80D368 0.178 0.0276 0.0435
4 2005-06-29 80D368 0.563 0.54 0.575
5 2005-07-06 7CCD06 0.36 0.36 0.36
6 2005-07-08 7CCD06 0.64 0.64 0.64
7 2005-07-12 7CCD06 -0.74 NA NA
I have 3 different variables which report data on infrequent dates. I want to use the 'rollapply' function to take the last n
reported values and apply a time series computation to it.
i.e. running:
df %>%
group_by(ID) %>%
summarise(n = n())
Gives,
# A tibble: 3 × 2
ID n
<chr> <int>
1 5D0EAE 2
2 7CCD06 71
3 80D368 29
So the first 'ID' will be "ignored" if n = 5
, the other 2 ID's will apply the rolling function for 5 time periods.
I can pad
the dates using the following:
df2 <- df %>%
complete(date = seq.Date(min(date), max(date), by = "day")) %>% # pad the dates so all companies have daily time series (make the time series complete, filling missing obs with NA's
na_if(0) %>%
arrange(date, ID)
I don't necessarily want to take the last 5 days as is usually in the rollapply function but I want to take the last 5 reported values (by group)
df2 %>%
group_by(ID) %>%
mutate(
myOut = zoo::rollapply(., width = 5, FUN = mean, by = 1, by.column = FALSE)
) # gives an error
So, my question is, how can I apply the zoo
rollapply function to the last 5 reported values for a given ID and not the last 5 reported time periods. Ignoring cases when the number of reported values is less than a threshold (i.e. 5).
Each of the ID's have different lengths.
Data:
df <- structure(list(date = structure(c(12836, 12902, 12962, 12963,
12970, 12972, 12976, 12986, 12989, 12991, 12999, 13000, 13004,
13011, 13020, 13021, 13024, 13032, 13033, 13047, 13049, 13053,
13053, 13054, 13062, 13063, 13068, 13069, 13070, 13073, 13074,
13087, 13090, 13090, 13091, 13096, 13101, 13110, 13117, 13117,
13118, 13119, 13126, 13138, 13139, 13152, 13153, 13154, 13157,
13181, 13182, 13193, 13207, 13207, 13228, 13229, 13230, 13245,
13245, 13250, 13251, 13271, 13272, 13273, 13280, 13300, 13307,
13313, 13315, 13320, 13321, 13339, 13363, 13364, 13383, 13391,
13399, 13410, 13437, 13438, 13440, 13445, 13455, 13459, 13516,
13522, 13552, 13553, 13557, 13558, 13566, 13567, 13571, 13572,
13573, 13574, 13577, 13578, 13580, 13581, 13584, 13585), class = "Date"),
ID = c("5D0EAE", "5D0EAE", "80D368", "80D368", "7CCD06",
"7CCD06", "7CCD06", "80D368", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "80D368", "80D368", "80D368",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "80D368",
"80D368", "7CCD06", "7CCD06", "80D368", "80D368", "80D368",
"80D368", "80D368", "7CCD06", "7CCD06", "80D368", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "80D368", "80D368",
"7CCD06", "7CCD06", "80D368", "7CCD06", "80D368", "80D368",
"80D368", "7CCD06", "7CCD06", "7CCD06", "80D368", "7CCD06",
"80D368", "80D368", "80D368", "7CCD06", "7CCD06", "80D368",
"7CCD06", "7CCD06", "80D368", "7CCD06", "7CCD06", "7CCD06",
"80D368", "7CCD06", "7CCD06", "7CCD06", "80D368", "80D368",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06", "7CCD06",
"7CCD06"), var1 = c(-0.681666666666667, 0.458, 0.1784375,
0.563333333333333, 0.36, 0.64, -0.74, 0.94, 0.95, 0.95, -0.0876923076923077,
-0.633333333333333, 0, -0.58, 0.52, 0.61, -0.74, 0.55, 0.55,
0.01, -0.478823529411765, -0.58, -0.74, 0.32, -0.74, 0, 0.32,
0.246666666666667, 0.0000000000000000130678250083694, 0,
0, -0.58, -0.0248, 0.95, -0.428, 0.94, -0.74, 0.94, 0.39,
0.25, 0.61, 0.01, 0, 0.32, 0.65, 0.32, 0.228888888888889,
0.18, 0, 0.112962962962963, 0.256923076923077, 0.94, 0.63,
0, 0.262380952380952, 0.7, 0.7, 0, 0, 0, 0.46, -0.58, 0.27,
0.648, 0.61, 0.305, 0.64, 0.035, 0.7, 0.18037037037037, 0.413333333333333,
0, 0.23, 0.656, 0.55, -0.9, -0.98, -0.58, -0.98, -0.98, -0.58,
-0.98, -0.272068965517241, 0.88, 0.554, 0, -0.30125, -0.4025,
0.62, -0.67, 0.62, 0.62, -0.67, -0.825, 0.62, 0.24, -0.5336364,
-0.08, 0.61, -0.9, -0.5146154, 0.16), var2 = c(-0.681666666666667,
0.458, 0.0276470588235294, 0.54, 0.36, 0.64, NA, 0.94, 0.95,
0.95, -0.0744444444444445, -0.633333333333333, NA, -0.58,
0.514, NA, -0.74, NA, NA, 0.01, -0.37, -0.58, NA, 0.32, -0.74,
0, NA, -0.0825, 0.04625, 0, 0, -0.58, -0.14875, 0.95, -0.35,
0.94, NA, 0.94, 0.295, 0.5, NA, 0.01, NA, 0.32, 0.95, NA,
0.126315789473684, 0.18, 0, 0.281111111111111, 0.256923076923077,
0.94, 0.63, 0, 0.295714285714286, 0.7, 0.7, NA, NA, NA, 0.46,
-0.58, 0.5725, 0.648, 0.61, 0.305, 0.64, 0.69, NA, 0.218846153846154,
0.7, 0, 0.26, 0.656, NA, -0.9, -0.98, -0.58, NA, NA, -0.58,
-0.98, -0.192857142857143, 0.88, 0.56, 0, -0.363913, -0.475,
0.62, NA, 0.62, 0.62, NA, NA, 0.62, 0.2966667, -0.53875,
0.08666667, 0.61, NA, -0.4157143, 0.2), var3 = c(-0.681666666666667,
0.458, 0.0435, 0.575, 0.36, 0.64, NA, 0.94, 0.95, 0.95, -0.00500000000000001,
-0.633333333333333, NA, -0.58, 0.514, NA, -0.74, 0.55, NA,
0.01, -0.317142857142857, -0.58, NA, 0.32, -0.74, 0, NA,
0.123076923076923, 0.04625, 0, 0, -0.58, -0.1025, 0.95, -0.35,
0.94, NA, 0.94, 0.295, 0.5, NA, 0.01, NA, 0.32, 0.75, NA,
0.126315789473684, 0.18, 0, 0.177857142857143, 0.256923076923077,
0.94, 0.63, 0, 0.28551724137931, 0.7, 0.7, 0, NA, NA, 0.46,
-0.58, 0.234285714285714, 0.648, 0.61, 0.305, 0.64, -0.156666666666667,
NA, 0.221333333333333, 0.62, 0, 0.26, 0.656, 0.55, -0.9,
-0.98, -0.58, NA, NA, -0.58, -0.98, -0.18, 0.88, 0.56, 0,
-0.3584615, -0.475, 0.62, NA, 0.62, 0.62, NA, NA, 0.62, 0.2966667,
-0.53875, 0.08666667, 0.61, -0.82, -0.36375, 0.2)), row.names = c(NA,
-102L), class = c("tbl_df", "tbl", "data.frame"))