I have a dataframe with a number of different animals (a,b,c in below example data), transactionIDs, counts, and days. I would like to calculate the mean and standard deviation of count values for increasing time windows (specified by days) for each transactionID within each animal. i.e. for transactionID 1 of animal a, I would like to add columns for the means and SDs of i) days -1 to -2, ii) days -1 to -3, iii) days -1 to -4 and so on… so that I end up with 5 new columns with means of increasing time windows and 5 for SDs.
Example data:
> dput(df)
structure(list(Animal = c("a", "a", "a", "a", "a", "a", "a", "a",
"a", "a", "a", "b", "b", "b", "b", "b", "b", "b", "c", "c", "c",
"c", "c"), Count = c(45L, 54L, 22L, 3L, 23L, 46L, 45L, 22L, 67L,
34L, 22L, 34L, 677L, 86L, 54L, 4L, 56L, 98L, 23L, 54L, 22L, 77L,
23L), Day = c(-6L, -5L, -4L, -3L, -2L, -1L, -5L, -4L, -3L, -2L,
-1L, -4L, -3L, -2L, -1L, -3L, -2L, -1L, -6L, -5L, -3L, -2L, -1L
), transactionID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L)), .Names = c("Animal",
"Count", "Day", "transactionID"), class = "data.frame", row.names = c(NA,
-23L))
> df
Animal Count Day transactionID
1 a 45 -6 1
2 a 54 -5 1
3 a 22 -4 1
4 a 3 -3 1
5 a 23 -2 1
6 a 46 -1 1
7 a 45 -5 2
8 a 22 -4 2
9 a 67 -3 2
10 a 34 -2 2
11 a 22 -1 2
12 b 34 -4 3
13 b 677 -3 3
14 b 86 -2 3
15 b 54 -1 3
16 b 4 -3 4
17 b 56 -2 4
18 b 98 -1 4
19 c 23 -6 5
20 c 54 -5 5
21 c 22 -3 5
22 c 77 -2 5
23 c 23 -1 5
I can achieve my desired output using the code below. However, when cycling thorough my entire dataframe there are cases where I have fewer than 6 days for an animal, and this for loop does not add NAs in cases where the maximum no. days in the dataset (and thus the time window) is less than 6. I also have a few cases in my dataset where there is a day missing in the day column (i.e. animal c, day -4). In this case I want to add NAs for the means and sds for all time windows from the missing day onwards. See below for my desired output.
My attempt:
#create empty matrix
res2 = as.data.frame(matrix(NA,0,14))
#split by name
animal.list = split(df,df$Name)
#For loop for
for(i in 1:length(animal.list)){
a = as.data.frame(animal.list[[i]])
animal = unique(a$Name)
#create empty matrix
res = as.data.frame(matrix(NA,0,14))
#create list of event IDs
event = split(a,a$transactionID)
#loop through each event in turn and calculate the mean of different baseline periods (from 2 days to 6 days)- clunky!
for(j in 1:length(event)){
e = as.data.frame(event[[j]])
#max day
e$maxday = unique(e[1,]$Day)
#save mean activity value for the 2 days prior to event
e$mean2d = round(mean(e[e$Day >-3,]$Count),3)
e$SD2d = round(sd(e[e$Day >-3,]$Count),3)
#save mean activity value for the 3 days prior to event
e$mean3d = round(mean(e[e$Day >-4,]$Count),3)
e$SD3d = round(sd(e[e$Day >-4,]$Count),3)
#save mean activity value for the 4 days prior to event
e$mean4d = round(mean(e[e$Day >-5,]$Count),3)
e$SD4d = round(sd(e[e$Day >-5,]$Count),3)
#save mean activity value for the 5 days prior to event
e$mean5d = round(mean(e[e$Day >-6,]$Count),3)
e$SD5d = round(sd(e[e$Day >-6,]$Count),3)
#save mean activity value for the 6 days prior to event
e$mean6d = round(mean(e[e$Day >-7,]$Count),3)
e$SD6d = round(sd(e[e$Day >-7,]$Count),3)
res = rbind(res,e)
}
res2 = rbind(res2,res)
}
Desired output:
>res2
Name Count Day transactionID maxday mean2d SD2d mean3d SD3d mean4d SD4d mean5d SD5d
1 a 45 -6 1 -6 34.5 16.263 24.000 21.517 23.50 17.597 29.6 20.452
2 a 54 -5 1 -6 34.5 16.263 24.000 21.517 23.50 17.597 29.6 20.452
3 a 22 -4 1 -6 34.5 16.263 24.000 21.517 23.50 17.597 29.6 20.452
4 a 3 -3 1 -6 34.5 16.263 24.000 21.517 23.50 17.597 29.6 20.452
5 a 23 -2 1 -6 34.5 16.263 24.000 21.517 23.50 17.597 29.6 20.452
6 a 46 -1 1 -6 34.5 16.263 24.000 21.517 23.50 17.597 29.6 20.452
7 a 45 -5 2 -5 28.0 8.485 41.000 23.302 36.25 21.266 38.0 18.828
8 a 22 -4 2 -5 28.0 8.485 41.000 23.302 36.25 21.266 38.0 18.828
9 a 67 -3 2 -5 28.0 8.485 41.000 23.302 36.25 21.266 38.0 18.828
10 a 34 -2 2 -5 28.0 8.485 41.000 23.302 36.25 21.266 38.0 18.828
11 a 22 -1 2 -5 28.0 8.485 41.000 23.302 36.25 21.266 38.0 18.828
12 b 34 -4 3 -4 70.0 22.627 272.333 350.817 212.75 310.240 NA NA
13 b 677 -3 3 -4 70.0 22.627 272.333 350.817 212.75 310.240 NA NA
14 b 86 -2 3 -4 70.0 22.627 272.333 350.817 212.75 310.240 NA NA
15 b 54 -1 3 -4 70.0 22.627 272.333 350.817 212.75 310.240 NA NA
16 b 4 -3 4 -3 77.0 29.698 52.667 47.089 NA NA NA NA
17 b 56 -2 4 -3 77.0 29.698 52.667 47.089 NA NA NA NA
18 b 98 -1 4 -3 77.0 29.698 52.667 47.089 NA NA NA NA
19 c 23 -6 5 -6 50.0 38.184 NA NA NA NA NA NA
20 c 54 -5 5 -6 50.0 38.184 NA NA NA NA NA NA
21 c 22 -3 5 -6 50.0 38.184 NA NA NA NA NA NA
22 c 77 -2 5 -6 50.0 38.184 NA NA NA NA NA NA
23 c 23 -1 5 -6 50.0 38.184 NA NA NA NA NA NA
mean6d SD6d
1 32.167 19.343
2 32.167 19.343
3 32.167 19.343
4 32.167 19.343
5 32.167 19.343
6 32.167 19.343
7 NA NA
8 NA NA
9 NA NA
10 NA NA
11 NA NA
12 NA NA
13 NA NA
14 NA NA
15 NA NA
16 NA NA
17 NA NA
18 NA NA
19 NA NA
20 NA NA
21 NA NA
22 NA NA
23 NA NA
Edit: based on @Henrik's suggestion (this is a much quicker way to calculate the cumulative means and ads but still does not account for cases where there is a missing day and use Has in these cases) - any simple suggestions would be much appreciated):
library(dplyr)
library(TTR)
#create empty matrix
res2 = as.data.frame(matrix(NA,0,14))
#split by name
animal.list = split(df,df$Name)
#For loop for
for(i in 1:length(animal.list)){
a = as.data.frame(animal.list[[i]])
animal = unique(a$Name)
#create empty matrix
res = as.data.frame(matrix(NA,0,14))
#create list of event IDs
event = split(a,a$transactionID)
#loop through each event in turn and calculate the mean of different baseline periods (from 2 days prior to 10 days prior)
for(j in 1:length(event)){
e = as.data.frame(event[[j]])
#max day
e$maxday = unique(e[1,]$Day)
cmean = cummean(rev(e$Count))
csd= runSD(rev(e$Count),n=1,cumulative=TRUE)
e$mean2d = cmean[2]
e$sd2d = csd[2]
e$mean3d = cmean[3]
e$sd3d = csd[3]
e$mean4d = cmean[4]
e$sd4d = csd[4]
e$mean5d = cmean[5]
e$sd5d = csd[5]
e$mean6d = cmean[6]
e$sd6d = csd[6]
res = rbind(res,e)
}
res2 = rbind(res2,res)
}