I would like to use the tmerge()
function to transform the dataset for use in the Andersen-Gill extension of the Cox regression framework for repeated events. See Therneau's excellent vignette.
I would like to specify that the individual is immune to a repeated event for 30 days after an event, i.e. I would like the individual to exit the risk set temporarily, such that if an event occurs when the individual is not at-risk, it is ignored.
A primitive approach would be to add all the events iteratively and then simply add 30 to tstart
variable. However, this could cause instances were tstart >= tstop
, and would be catastrophic in larger and more complex datasets.
I have attempted to exploit the tmerge()
function with a forloop correcting for the problems I mention above. For this example I will be using the cgd
data in the survival package.
EDIT: See corrected forloop below
library(survival)
cgd0 <- cgd0
newcgd <- tmerge(data1=cgd0[, 1:13], data2=cgd0, id=id, tstop=futime)
for(i in 1:7){
x <- paste0("etime", i) #etime1:etime7
# iteratively add each event
newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
# select only observations that end in an event and iteratively create
# cumulative number of events for each individual
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1),
id = id, cum_infect = cumtdc(tstop))
# for each loop add 30 days to the start time of the ith cumulative event
newcgd[which(newcgd$cum_infect == i), "tstart"] <-
newcgd[which(newcgd$cum_infect == i), "tstart"] + 30
# for each loop remove observations were the start time >= stop time
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]
}
attr(newcgd, "tcount")
# early late gap within boundary leading trailing tied
#infect 0 0 0 44 0 0 0 0
#cum_infect 0 0 0 0 44 0 0 0
#infect 0 0 4 11 0 1 1 0
#cum_infect 0 0 0 0 11 0 45 0
#infect 0 0 2 6 0 0 0 0
#cum_infect 0 0 0 0 6 0 56 0
#infect 0 0 1 2 0 0 0 0
#cum_infect 0 0 0 0 6 0 58 0
#infect 0 0 0 2 0 0 0 0
#cum_infect 0 0 0 0 8 0 58 0
#infect 0 0 0 1 0 0 0 0
#cum_infect 0 0 0 0 9 0 58 0
#infect 0 0 0 1 0 0 0 0
#cum_infect 0 0 0 0 10 0 58 0
I believe this solution to be correct. However, this is a common problem in survival analysis and I am concerned that
i) I am overlooking something and the code does not do what I think it does.
ii) I am overlooking a validated way to do this in R
iii) If i) and ii) are not issues, I believe this code to be inefficient and wonder whether there are obvious ways to improve the speed of execution.
-----------------------------------------------------------------------------------------------------------------------------------
EDIT: Further error-checking with comments. Hopefully this clarifies a bit what I am attempting to do. Conceptually; I am specifying that an individual is not at-risk of having another event for 30 days after experiencing an event. In the Andersen-Gill counting process formulation, each line represents an observation that contains a start-time tstart
and a stop-time tstop
and an indicator (in this case infect
) which indicates whether the observation ended due to the event infect == 1
or censoring infect == 0
. Here I manually go through the steps in the forloop above and quantify for each loop how many events take place and the total follow-up time when a 30 day immune period is either specified or not. This same code is then implemented as a forloop for completeness. The results are shown below in a separate code chunk.
newcgd <- tmerge(data1=cgd0[, 1:13], data2=cgd0, id=id, tstop=futime)
###1st event
x <- "etime1"
immunecgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
immunecgd[which(immunecgd$cum_infect == 1), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 1), "tstart"] + 30
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]
newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
newcgd[which(newcgd$cum_infect == 1), "tstart"] <- newcgd[which(newcgd$cum_infect == 1), "tstart"]
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]
etime1 <- c(sum(immunecgd$infect), sum(newcgd$infect))
futime1 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))
###2nd event
x <- "etime2"
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x]))
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
immunecgd[which(immunecgd$cum_infect == 2), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 2), "tstart"] + 30
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]
newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
newcgd[which(newcgd$cum_infect == 2), "tstart"] <- newcgd[which(newcgd$cum_infect == 2), "tstart"]
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]
etime2 <- c(sum(immunecgd$infect), sum(newcgd$infect))
futime2 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))
###3rd event
x <- "etime3"
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x]))
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
immunecgd[which(immunecgd$cum_infect == 3), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 3), "tstart"] + 30
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]
newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
newcgd[which(newcgd$cum_infect == 3), "tstart"] <- newcgd[which(newcgd$cum_infect == 3), "tstart"]
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]
etime3 <- c(sum(immunecgd$infect), sum(newcgd$infect))
futime3 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))
###4th event
x <- "etime4"
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x]))
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
immunecgd[which(immunecgd$cum_infect == 4), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 4), "tstart"] + 30
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]
newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
newcgd[which(newcgd$cum_infect == 4), "tstart"] <- newcgd[which(newcgd$cum_infect == 4), "tstart"]
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]
etime4 <- c(sum(immunecgd$infect), sum(newcgd$infect))
futime4 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))
###5th event
x <- "etime5"
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x]))
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
immunecgd[which(immunecgd$cum_infect == 5), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 5), "tstart"] + 30
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]
newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
newcgd[which(newcgd$cum_infect == 5), "tstart"] <- newcgd[which(newcgd$cum_infect == 5), "tstart"]
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]
etime5 <- c(sum(immunecgd$infect), sum(newcgd$infect))
futime5 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))
###6th event
x <- "etime6"
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x]))
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
immunecgd[which(immunecgd$cum_infect == 6), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 6), "tstart"] + 30
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]
newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
newcgd[which(newcgd$cum_infect == 6), "tstart"] <- newcgd[which(newcgd$cum_infect == 6), "tstart"]
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]
etime6 <- c(sum(immunecgd$infect), sum(newcgd$infect))
futime6 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))
###7th event
x <- "etime7"
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x]))
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
immunecgd[which(immunecgd$cum_infect == 7), "tstart"] <- immunecgd[which(immunecgd$cum_infect == 7), "tstart"] + 30
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]
newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
newcgd[which(newcgd$cum_infect == 7), "tstart"] <- newcgd[which(newcgd$cum_infect == 7), "tstart"]
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]
etime7 <- c(sum(immunecgd$infect), sum(newcgd$infect))
futime7 <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))
df_event <- rbind.data.frame(etime1, etime2, etime3, etime4, etime5, etime6, etime7)
colnames(df_event) <- c("immunity", "no_immunity")
df_event$diff <- df_event$no_immunity - df_event$immunity
df_futime <- rbind.data.frame(futime1, futime2, futime3, futime4, futime5, futime6, futime7)
colnames(df_futime) <- c("immunity", "no_immunity")
df_futime$diff <- df_futime$no_immunity - df_futime$immunity
The same code as a forloop.
newcgd <- tmerge(data1=cgd0[, 1:13], data2=cgd0, id=id, tstop=futime)
immunecgd <- tmerge(data1=cgd0[, 1:13], data2=cgd0, id=id, tstop=futime)
event <- matrix(NA, nrow = 7, ncol = 2)
futime <- matrix(NA, nrow = 7, ncol = 2)
for(i in 1:7){
x <- paste0("etime", i) #etime1:etime7
# iteratively add each event
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x]))
newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
# select only observations that end in an event and iteratively create
# cumulative number of events for each individual
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
# for each loop add 30 days to the start time of the ith cumulative event
immunecgd[which(immunecgd$cum_infect == i), "tstart"] <- immunecgd[which(immunecgd$cum_infect == i), "tstart"] + 30
newcgd[which(newcgd$cum_infect == i), "tstart"] <- newcgd[which(newcgd$cum_infect == i), "tstart"]
# for each loop remove observations were the start time >= stop time
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]
event[i,] <- c(sum(immunecgd$infect), sum(newcgd$infect))
futime[i,] <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))
}
event <- data.frame(event)
colnames(event) <- c("immunity", "no_immunity")
event$diff <- event$no_immunity - event$immunity
futime <- data.frame(futime)
colnames(futime) <- c("immunity", "no_immunity")
futime$diff <- futime$no_immunity - futime$immunity
The error-testing code above gives the following results
df_event
immunity no_immunity diff
1 44 44 0
2 56 61 5
3 62 69 7
4 64 72 8
5 66 74 8
6 67 75 8
7 68 76 8
df_futime
immunity no_immunity diff
1 36202 37477 1275
2 35935 37477 1542
3 35875 37477 1602
4 35875 37477 1602
5 35875 37477 1602
6 35875 37477 1602
7 35875 37477 1602
-----------------------------------------------------------------------------------------------------------------------------------
Through further testing on different datasets in the survival
package, simulated datasets and my own personal dataset (the one I am hoping to use this code on), I have discovered a 'glitch'. In the above version of the code, if a new event etime[i-1]
falls into one of the periods were we have specified the individual to be immune to having an event - which is exactly the instance the code is designed to create - the event does not get incorporated into the cumulative event counter cum_infect
. During the next run etime[i]
the individual will only have [i-1] cumulative events, and the portion of the code that controls whether 30 days should be added to the start time
immunecgd[which(immunecgd$cum_infect == i), "tstart"] <- immunecgd[which(immunecgd$cum_infect == i), "tstart"] + 30
will not identify the individual as having had an event. This means that the forloop will only correctly add a 30 day immunity after an event, until the first instance of an event falling on such an immune period. I produced a rather inelegant fix. But it works.
newcgd <- tmerge(data1=cgd0[, 1:13], data2=cgd0, id=id, tstop=futime)
immunecgd <- tmerge(data1=cgd0[, 1:13], data2=cgd0, id=id, tstop=futime)
newcgd$cum_infect_0 <- 0
immunecgd$cum_infect_0 <- 0
event <- matrix(NA, nrow = 7, ncol = 2)
futime <- matrix(NA, nrow = 7, ncol = 2)
for(i in 1:7){
x <- paste0("etime", i) #etime1:etime7
# iteratively add each event
immunecgd <- tmerge(immunecgd, cgd0, id = id, infect = event(cgd0[,x]))
newcgd <- tmerge(newcgd, cgd0, id = id, infect = event(cgd0[,x]))
# select only observations that end in an event and iteratively create
# cumulative number of events for each individual
immunecgd <- tmerge(immunecgd, subset(immunecgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
newcgd <- tmerge(newcgd, subset(newcgd, infect == 1), id = id, cum_infect = cumtdc(tstop))
# create new column that will hold cumulative events between loops
immunecgd[, paste0("cum_infect_", i)] <- immunecgd[, "cum_infect"]
newcgd[, paste0("cum_infect_", i)] <- newcgd[, "cum_infect"]
# for each loop add 30 days to the start time if there is atleast one cumulative event
# and the value of the ith cumulative event is larger than the i-1th cumulative event
immunecgd[which(immunecgd$cum_infect > 0 & immunecgd$cum_infect > immunecgd[, paste0("cum_infect_", i - 1)]), "tstart"] <-
immunecgd[which(immunecgd$cum_infect > 0 & immunecgd$cum_infect > immunecgd[, paste0("cum_infect_", i - 1)]), "tstart"] + 30
newcgd[which(newcgd$cum_infect > 0 & newcgd$cum_infect > newcgd[, paste0("cum_infect_", i - 1)]), "tstart"] <-
newcgd[which(newcgd$cum_infect > 0 & newcgd$cum_infect > newcgd[, paste0("cum_infect_", i - 1)]), "tstart"]
# for each loop remove observations were the start time >= stop time
immunecgd <- immunecgd[which(immunecgd$tstart < immunecgd$tstop),]
newcgd <- newcgd[which(newcgd$tstart < newcgd$tstop),]
event[i,] <- c(sum(immunecgd$infect), sum(newcgd$infect))
futime[i,] <- c(sum(immunecgd$tstop - immunecgd$tstart), sum(newcgd$tstop - newcgd$tstart))
}
immunecgd <- immunecgd[,!grepl("cum_infect_", colnames(immunecgd))]
newcgd <- newcgd[,!grepl("cum_infect_", colnames(newcgd))]
event <- data.frame(event)
colnames(event) <- c("immunity", "no_immunity")
event$diff <- event$no_immunity - event$immunity
futime <- data.frame(futime)
colnames(futime) <- c("immunity", "no_immunity")
futime$diff <- futime$no_immunity - futime$immunity
Here we can see the difference in the total number of events
immunity no_immunity diff
1 44 44 0
2 56 61 5
3 62 69 7
4 64 72 8
5 65 74 9
6 66 75 9
7 66 76 10
Correctly specifying the forloop has found 2 more instances were an event fell on an immune period.