Imagine I have a dataframe as such:
#Stack example
df <- data.frame(DATE = c("2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29",
"2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05",
"2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29",
"2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05",
"2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29",
"2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05",
"2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29", "2022-08-29",
"2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05", "2022-09-05",
"2022-09-05", "2022-08-29", "2022-09-05", "2022-08-29", "2022-09-05", "2022-08-29"),
ID = c("1", "2", "3", "4", "5", "6",
"1", "2", "3", "4", "5", "6",
"1", "2", "3", "4", "5", "6",
"1", "2", "3", "4", "5", "6",
"1", "2", "3", "4", "5", "6",
"1", "2", "3", "4", "5", "6",
"1", "2", "3", "4", "5", "6",
"1", "2", "3", "4", "5", "6",
"7", "8", "9", "10", "9", "10"),
CV = c("SV", "SV", "SV", "SV", "SV", "SV",
"SV", "SV", "SV", "SV", "SV", "SV",
"PD", "PD", "PD", "PD", "PD", "PD",
"PD", "PD", "PD", "PD", "PD", "PD",
"SV", "SV", "SV", "SV", "SV", "SV",
"SV", "SV", "SV", "SV", "SV", "SV",
"PD", "PD", "PD", "PD", "PD", "PD",
"PD", "PD", "PD", "PD", "PD", "PD",
"PD", "PD", "PD", "SV", "PD", "SV"),
TR= c("T1", "T1", "T1", "T1", "T1", "T1",
"T1", "T1", "T1", "T1", "T1", "T1",
"T1", "T1", "T1", "T1", "T1", "T1",
"T1", "T1", "T1", "T1", "T1", "T1",
"T2", "T2", "T2", "T2", "T2", "T2",
"T2", "T2", "T2", "T2", "T2", "T2",
"T2", "T2", "T2", "T2", "T2", "T2",
"T2", "T2", "T2", "T2", "T2", "T2",
"T2", "T2", "T2", "T2", "T2", "T2"),
Values_1 = c(34.9003695, 34.9003695, 28.2389394, 28.2389394, 26.0821875, 26.0821875,
30.5515533, 30.5515533, 22.6469958, 22.6469958, 34.5662974, 34.5662974,
35.2881883, 35.2881883, 41.3885176, 41.3885176, 19.9440042, 19.9440042,
5.6987524, 5.6987524, 37.4641052, 37.4641052, 2.4808126, 2.4808126,
1.7883822, 21.1799057, 21.1799057, 21.1799057, 2.7334442, 2.7334442,
2.7334442, 11.3880187, 11.3880187, 11.3880187, 7.8442267, 7.8442267,
7.8442267, 5.2445510, 5.2445510, 5.2445510, 20.4706600, 20.4706600,
15.7275634, 15.7275634, 4.4575814, 4.4575814, 17.0854186, 17.0854186,
5.6987524, 5.6987524, 37.4641052, 37.4641052, 2.4808126, 2.4808126),
Values_2 = c(76.24359, 76.24359, 58.52421, 58.52421, 80.14131, 80.14131,
59.05000, 102.19699, 102.19699, 72.39848, 72.39848, 58.15000,
68.31217, 68.31217, 53.67941, 53.67941, 56.88980, 56.88980,
108.98399, 96.64207, 96.64207, 38.88542, 38.88542, 54.60000,
52.12500, 52.12500, 17.20875, 17.20875, 47.26923, 47.26923,
67.80738, 60.41250, 60.41250, 83.93404, 83.93404, 37.20336,
50.02500, 50.02500, 94.73309, 94.73309, 41.31748, 41.31748,
56.88344, 59.74702, 59.74702, 48.23750, 48.23750, 95.14831,
108.98399, 96.64207, 96.64207, 38.88542, 38.88542, 54.60000))
For every ID, CV, TR combination that has two timepoints (being "2022-08-29" & "2022-09-05") I would like to subtract the first timepoint values (Values_1 & Values 2) from the last timepoint and return the substracted difference, generating the following output:
TR CV ID Values_1 Values_2
1 T1 PD 1 -29.5894359 40.67182
2 T1 PD 2 -29.5894359 28.32990
3 T1 PD 3 -3.9244124 42.96266
4 T1 PD 4 -3.9244124 -14.79399
5 T1 PD 5 -17.4631916 -18.00438
6 T1 PD 6 -17.4631916 -2.28980
7 T1 SV 1 -4.3488162 -17.19359
8 T1 SV 2 -4.3488162 25.95340
9 T1 SV 3 -5.5919436 43.67278
10 T1 SV 4 -5.5919436 13.87427
11 T1 SV 5 8.4841099 -7.74283
12 T1 SV 6 8.4841099 -21.99131
13 T2 PD 1 7.8833367 6.85844
14 T2 PD 2 10.4830124 9.72202
15 T2 PD 3 -0.7869696 -34.98607
16 T2 PD 4 -0.7869696 -46.49559
17 T2 PD 5 -3.3852414 6.92002
18 T2 PD 6 -3.3852414 53.83083
19 T2 SV 1 0.9450620 15.68238
20 T2 SV 2 -9.7918870 8.28750
21 T2 SV 3 -9.7918870 43.20375
22 T2 SV 4 -9.7918870 66.72529
23 T2 SV 5 5.1107825 36.66481
24 T2 SV 6 5.1107825 -10.06587
I found the following solution, but it does not return the desired result, especially not when there are stray rows or additional duplications because my solution is based on sorting the dataframe and assuming I have a correct sorted dataframe:
#Remove rows that are not duplicated (do not have two timepoints)
dupe = df[,c("ID", "CV", "TR")] # select columns to check duplicates
ONLY_DUPES=df[duplicated(dupe) | duplicated(dupe, fromLast=TRUE),]
#Now we substract last timepoint with first timepoint
#First we melt the data
ONLY_DUPES <- as.data.frame(ONLY_DUPES)
melted<-melt(ONLY_DUPES, id=c("DATE", "ID", "CV", "TR"))
#Then we cast with diff
#First we order the dataframe
melted= melted[with(melted, order(TR, CV, ID, variable, DATE)),]
casted=cast(melted, TR+CV+ID ~variable, fun.aggregate = diff)
#Returns:
TR CV ID Values_1 Values_2
1 T1 PD 1 -29.5894359 40.67182
2 T1 PD 2 -29.5894359 28.32990
3 T1 PD 3 -3.9244124 42.96266
4 T1 PD 4 -3.9244124 -14.79399
5 T1 PD 5 -17.4631916 -18.00438
6 T1 PD 6 -17.4631916 -2.28980
7 T1 SV 1 -4.3488162 -17.19359
8 T1 SV 2 -4.3488162 25.95340
9 T1 SV 3 -5.5919436 43.67278
10 T1 SV 4 -5.5919436 13.87427
11 T1 SV 5 8.4841099 -7.74283
12 T1 SV 6 8.4841099 -21.99131
13 T2 PD 1 7.8833367 6.85844
14 T2 PD 2 10.4830124 9.72202
15 T2 PD 3 -0.7869696 -34.98607
16 T2 PD 4 -0.7869696 -46.49559
17 T2 PD 5 -3.3852414 6.92002
18 T2 PD 6 -3.3852414 53.83083
19 T2 PD 9 -34.9832926 -57.75665 !
20 T2 SV 1 0.9450620 15.68238
21 T2 SV 10 -34.9832926 15.71458 !
22 T2 SV 2 -9.7918870 8.28750
23 T2 SV 3 -9.7918870 43.20375
24 T2 SV 4 -9.7918870 66.72529
25 T2 SV 5 5.1107825 36.66481
26 T2 SV 6 5.1107825 -10.06587
What method can I apply to perform this operation in a fail-proof way?