Two (massive) tables currently have 'from' and 'to' dates. I want to merge the two tables so that I have every possible set of 'from' and 'to' dates that can be formed from the original dates. For example, if int1 == 0:6, and int2 == 3:9, then I want three intervals: 0:2, 3:6, 7:9.
I have tried foverlaps and manually creating all of the possible date intervals and then merging the data onto that table. The code below shows these failed attempts with toy data. The expected output below should make clear what I would like to accomplish.
The existing tables are huge (millions of ids each with multiple sets of dates for each id).
I'm currently trying a third method ... create an empty table with each id having 1 day (as both the to and from day) per row. The problem with this method is that it is insanely slow given the number of IDs and years that I need to cover. It's been almost 20 hours and my base table is still being created. After that, the plan would be to merge on the existing tables using foverlaps.
I'm losing my hair over this problem and would be grateful for any assistance.
# load packages
library(data.table)
library(lubridate)
# create data
dt1<- data.table(id = rep(1111, 4),
from_date = as.Date(c("2016-01-01", "2016-03-31","2016-09-02", "2016-09-03")),
to_date = as.Date(c("2016-03-15", "2016-09-01", "2016-09-02", "2016-12-15")),
progs = c("a1", "b1", "c1", "d1"))
setkey(dt1, id, from_date, to_date)
dt2<- data.table(id = rep(1111, 4),
from_date = as.Date(c("2016-02-01", "2016-04-01","2016-11-01", "2016-12-01")),
to_date = as.Date(c("2016-02-28", "2016-09-30", "2016-11-30", "2016-12-31")),
progs = c("a2", "b2", "c2", "d2"))
setkey(dt2, id, from_date, to_date)
# expected (hoped for) output
id from_date to_date progs1 prog2
1111 1/1/2016 1/31/2016 a1 NA
1111 2/1/2016 2/28/2016 a1 a2
1111 2/29/2016 3/15/2016 a1 NA
1111 3/31/2016 3/31/2016 b1 NA
1111 4/1/2016 9/1/2016 b1 b2
1111 9/2/2016 9/2/2016 c1 b2
1111 9/3/2016 9/30/2016 d1 b2
1111 10/1/2016 10/31/2016 NA d1
1111 11/1/2016 11/30/2016 d1 c2
1111 12/1/2016 12/15/2016 d1 NA
1111 12/16/2016 12/31/2016 NA d2
# failed attempt #1: using foverlaps
overlaps <- foverlaps(x=dt1, y=dt2,
by.x = c("id", "from_date", "to_date"),
by.y = c("id", "from_date", "to_date"),
type = "any",
mult ="all")
# this does not give every time interval
# failed attempt #2... super convoluted method
# try to make every possible time interval ----
dt <- rbind(dt1[, .(id, from_date)], dt2[, .(id, from_date)])
dt.temp <- rbind(dt1[, .(id, to_date)], dt2[, .(id, to_date)]) # get table with to_dates
setnames(dt.temp, "to_date", "from_date")
dt <- rbind(dt, dt.temp)
rm(dt.temp)
dt <- unique(dt)
setorder(dt, -from_date)
dt[, to_date := as.Date(c(NA, from_date[-.N]), origin = "1970-01-01"), by = "id"]
setorder(dt, from_date)
dt <- dt[!is.na(to_date)] # the last 'from_date' is actually the final to_date, so it doesn't begin a time interval
dt[, counter := 1:.N, by = id] # create indicator so we can know which interval is the first interval for each id
dt[counter != 1, from_date := as.integer(from_date + 1)] # to prevent overlap with previous interval
dt[, counter := NULL]
setkey(dt, id, from_date, to_date)
# merge on dt1 ----
dt <- foverlaps(dt, dt1, type = "any", mult = "all")
dt[, from_date := i.from_date] # when dt1 didn't match, the from_date is NA. fill with i.from_date
dt[, to_date := i.to_date] # when dt2 didn't match, the from_date is NA. fill with i.from_date
dt[, c("i.from_date", "i.to_date") := NULL] # no longer needed
setkey(dt, id, from_date, to_date)
# merge on dt2 ----
dt <- foverlaps(dt, dt2, type = "any", mult = "all")
dt[, from_date := i.from_date] # when dt2 didn't match, the from_date is NA. fill with i.from_date
dt[, to_date := i.to_date] # when dt2 didn't match, the from_date is NA. fill with i.from_date
dt[, c("i.from_date", "i.to_date") := NULL] # no longer needed
setkey(dt, id, from_date, to_date)
setnames(dt, c("i.progs", "progs"), c("progs1", "progs2"))
# Collapse data if dates are contiguous and data are the same ----
# Create unique ID for data chunks ----
dt[, group := .GRP, by = c("id", "progs1", "progs2")] # create group id
dt[, group := cumsum( c(0, diff(group)!=0) )] # in situation like a:a:a:b:b:b:b:a:a:a, want to distinguish first set of "a" from second set of "a"
# Create unique ID for contiguous times within a given data chunk ----
setkey(dt, id, from_date)
dt[, prev_to_date := c(NA, to_date[-.N]), by = "group"]
dt[, diff.prev := from_date - prev_to_date] # difference between from_date & prev_to_date will be 1 (day) if they are contiguous
dt[diff.prev != 1, diff.prev := NA] # set to NA if difference is not 1 day, i.e., it is not contiguous, i.e., it starts a new contiguous chunk
dt[is.na(diff.prev), contig.id := .I] # Give a unique number for each start of a new contiguous chunk (i.e., section starts with NA)
setkey(dt, group, from_date) # need to order the data so the following line will work.
dt[, contig.id := contig.id[1], by= .( group , cumsum(!is.na(contig.id))) ] # fill forward by group
dt[, c("prev_to_date", "diff.prev") := NULL] # drop columns that were just intermediates
# Collapse rows where data chunks are constant and time is contiguous ----
dt[, from_date := min(from_date), by = c("group", "contig.id")]
dt[, to_date := max(to_date), by = c("group", "contig.id")]
dt[, c("group", "contig.id") := NULL]
dt <- unique(dt)
# the end result is incorrect table
id from_date to_date progs2 progs1
1111 1/1/2016 2/28/2016 a2 a1
1111 2/29/2016 3/15/2016 NA a1
1111 3/16/2016 3/31/2016 NA b1
1111 4/1/2016 9/1/2016 b2 b1
1111 9/2/2016 9/2/2016 b2 c1
1111 9/3/2016 9/30/2016 b2 d1
1111 10/1/2016 11/30/2016 c2 d1
1111 12/1/2016 12/15/2016 d2 d1
1111 12/16/2016 12/31/2016 d2 NA
See the expected results and actual results above ... I couldn't display them neatly in a table here.