I managed to solve the problem by:
- Determining the ranges of the different series of data
- Define the range I want to extrapolate to
- Do the actual extrapolation through the Hmisc package
Initially, I thought I could only manage this by some loops that had to go through the raw data row by row, and was hoping for an existing function.
I'm sure many of you would have coded this way more efficient and nicer. But wanted to post my script anyway for people with a similar problem.
require(Hmisc)
extrapol.length <- 5
test <- data.frame('Time' = c(1:100), # I didn't use this as my data was equally spread in time, if you want to use it, see the first argument in the approxExtrap-function in the secondlast line
'x' = c(rep(NA, 10), 1:30, rep(NA, 30), 1:10, rep(NA, 20)))
## Determine start and end of the continuous (non-NA) data streams
length.values <- diff(c(0, which(is.na(test[,2]))))-2 # length non-NA's
length.values <- length.values[length.values > -1]
length.nas <- diff(c(0, which(!is.na(test[,2])))) # length NA's
length.nas <- length.nas[length.nas > 1]
if(is.na(test[1,2])){
# data starts with NA
length.nas <- data.frame('Order' = seq(1, length(length.nas)*2, by = 2),
'Length' = length.nas, 'Type' = 'na')
length.values <- data.frame('Order' = seq(2, length(length.values)*2, by = 2),
'Length' = length.values, 'Type' = 'value')
start.end <- rbind(length.nas, length.values)
start.end <- start.end[order(start.end$Order),]
value.seqs <- data.frame('no' = c(1:length(start.end$Type[start.end$Type == 'na'])),
'start' = NA, 'end' = NA)
for(a in value.seqs$no){
value.seqs$start[a] <- sum(start.end$Length[1:((a*2)-1)])
value.seqs$end[a] <- sum(start.end$Length[1:(a*2)])
}
}else{
# Data starts with actual values
length.nas <- data.frame('Order' = seq(2, length(length.nas)*2, by = 2),
'Length' = length.nas, 'Type' = 'na')
length.values <- data.frame('Order' = seq(1, length(length.values)*2, by = 2),
'Length' = length.values, 'Type' = 'value')
start.end <- rbind(length.nas, length.values)
start.end <- start.end[order(start.end$Order),]
value.seqs <- data.frame('no' = c(1:length(start.end$Type[start.end$Type == 'value'])),
'start' = c(1,rep(NA, (length(start.end$Type[start.end$Type == 'value'])-1))), 'end' = NA)
for(a in value.seqs$no){
value.seqs$end[a] <- sum(start.end$Length[1:((a*2)-1)])+1
if(a < max(value.seqs$no))
value.seqs$start[a+1] <- sum(start.end$Length[1:(a*2)])+1
}
}
## Do not extrapolate outside of the time-range of the original dataframe
value.seqs$start.extr <- value.seqs$start - extrapol.length
value.seqs$start.extr[value.seqs$start.extr < 1] <- 1 # do not extrapolate below time < 1
value.seqs$end.extr <- value.seqs$end + extrapol.length
value.seqs$end.extr[value.seqs$end.extr > nrow(test) | is.na(value.seqs$end.extr)] <- nrow(test)
value.seqs$end[is.na(value.seqs$end)] <- max(which(!is.na(test[,2])))
## Extrapolate
for(b in value.seqs$no){
test[c(value.seqs$start.extr[b]:value.seqs$end.extr[b]),3] <- approxExtrap(value.seqs$start[b]:value.seqs$end[b],test[c(value.seqs$start[b]:value.seqs$end[b]),2],xout=c(value.seqs$start.extr[b]:value.seqs$end.extr[b]))[2]
}
Thanks for thinking along!