2

Two (massive) tables currently have 'from' and 'to' dates. I want to merge the two tables so that I have every possible set of 'from' and 'to' dates that can be formed from the original dates. For example, if int1 == 0:6, and int2 == 3:9, then I want three intervals: 0:2, 3:6, 7:9.

I have tried foverlaps and manually creating all of the possible date intervals and then merging the data onto that table. The code below shows these failed attempts with toy data. The expected output below should make clear what I would like to accomplish.

The existing tables are huge (millions of ids each with multiple sets of dates for each id).

I'm currently trying a third method ... create an empty table with each id having 1 day (as both the to and from day) per row. The problem with this method is that it is insanely slow given the number of IDs and years that I need to cover. It's been almost 20 hours and my base table is still being created. After that, the plan would be to merge on the existing tables using foverlaps.

I'm losing my hair over this problem and would be grateful for any assistance.

# load packages
library(data.table)
library(lubridate)
# create data
dt1<- data.table(id = rep(1111, 4),
           from_date = as.Date(c("2016-01-01", "2016-03-31","2016-09-02", "2016-09-03")), 
           to_date = as.Date(c("2016-03-15", "2016-09-01", "2016-09-02", "2016-12-15")), 
           progs = c("a1", "b1", "c1", "d1"))
setkey(dt1, id, from_date, to_date)    

dt2<- data.table(id = rep(1111, 4),
           from_date = as.Date(c("2016-02-01", "2016-04-01","2016-11-01", "2016-12-01")), 
           to_date = as.Date(c("2016-02-28", "2016-09-30", "2016-11-30", "2016-12-31")), 
           progs = c("a2", "b2", "c2", "d2"))
setkey(dt2, id, from_date, to_date)    

# expected (hoped for) output
id  from_date   to_date progs1  prog2
1111    1/1/2016    1/31/2016   a1  NA
1111    2/1/2016    2/28/2016   a1  a2
1111    2/29/2016   3/15/2016   a1  NA
1111    3/31/2016   3/31/2016   b1  NA
1111    4/1/2016    9/1/2016    b1  b2
1111    9/2/2016    9/2/2016    c1  b2
1111    9/3/2016    9/30/2016   d1  b2
1111    10/1/2016   10/31/2016  NA  d1
1111    11/1/2016   11/30/2016  d1  c2
1111    12/1/2016   12/15/2016  d1  NA
1111    12/16/2016  12/31/2016  NA  d2

# failed attempt #1: using foverlaps
overlaps <- foverlaps(x=dt1, y=dt2, 
                by.x = c("id", "from_date", "to_date"),
                by.y = c("id", "from_date", "to_date"), 
                type = "any", 
                mult ="all")
# this does not give every time interval    

# failed attempt #2... super convoluted method
# try to make every possible time interval ----
dt <- rbind(dt1[, .(id, from_date)], dt2[, .(id, from_date)]) 
dt.temp <- rbind(dt1[, .(id, to_date)], dt2[, .(id, to_date)]) # get table with to_dates
setnames(dt.temp, "to_date", "from_date") 
dt <- rbind(dt, dt.temp)
rm(dt.temp)
dt <- unique(dt)
setorder(dt, -from_date)
dt[, to_date := as.Date(c(NA, from_date[-.N]), origin = "1970-01-01"), by = "id"]
setorder(dt, from_date)
dt <- dt[!is.na(to_date)] # the last 'from_date' is actually the final to_date, so it doesn't begin a time interval
dt[, counter := 1:.N, by = id] # create indicator so we can know which interval is the first interval for each id
dt[counter != 1, from_date := as.integer(from_date + 1)] # to prevent overlap with previous interval
dt[, counter := NULL]
setkey(dt, id, from_date, to_date)    

# merge on dt1 ----
dt <- foverlaps(dt, dt1, type = "any", mult = "all")
dt[, from_date := i.from_date] # when dt1 didn't match, the from_date is NA. fill with i.from_date
dt[, to_date := i.to_date] # when dt2 didn't match, the from_date is NA. fill with i.from_date
dt[, c("i.from_date", "i.to_date") := NULL] # no longer needed
setkey(dt, id, from_date, to_date)    

# merge on dt2 ----
dt <- foverlaps(dt, dt2, type = "any", mult = "all")
dt[, from_date := i.from_date] # when dt2 didn't match, the from_date is NA. fill with i.from_date
dt[, to_date := i.to_date] # when dt2 didn't match, the from_date is NA. fill with i.from_date
dt[, c("i.from_date", "i.to_date") := NULL] # no longer needed
setkey(dt, id, from_date, to_date)    

setnames(dt, c("i.progs", "progs"), c("progs1", "progs2"))    

# Collapse data if dates are contiguous and data are the same ----
# Create unique ID for data chunks ----
dt[, group := .GRP, by = c("id", "progs1", "progs2")] # create group id
dt[, group := cumsum( c(0, diff(group)!=0) )] # in situation like a:a:a:b:b:b:b:a:a:a, want to distinguish first set of "a" from second set of "a"    

# Create unique ID for contiguous times within a given data chunk ----
setkey(dt, id, from_date)
dt[, prev_to_date := c(NA, to_date[-.N]), by = "group"]
dt[, diff.prev := from_date - prev_to_date] # difference between from_date & prev_to_date will be 1 (day) if they are contiguous
dt[diff.prev != 1, diff.prev := NA] # set to NA if difference is not 1 day, i.e., it is not contiguous, i.e., it starts a new contiguous chunk
dt[is.na(diff.prev), contig.id := .I] # Give a unique number for each start of a new contiguous chunk (i.e., section starts with NA)
setkey(dt, group, from_date) # need to order the data so the following line will work.
dt[, contig.id  := contig.id[1], by=  .( group , cumsum(!is.na(contig.id))) ] # fill forward by group
dt[, c("prev_to_date", "diff.prev") := NULL] # drop columns that were just intermediates    

# Collapse rows where data chunks are constant and time is contiguous ----      
dt[, from_date := min(from_date), by = c("group", "contig.id")]
dt[, to_date := max(to_date), by = c("group", "contig.id")]
dt[, c("group", "contig.id") := NULL]
dt <- unique(dt)      

# the end result is incorrect table
id  from_date   to_date progs2  progs1
1111    1/1/2016    2/28/2016   a2  a1
1111    2/29/2016   3/15/2016   NA  a1
1111    3/16/2016   3/31/2016   NA  b1
1111    4/1/2016    9/1/2016    b2  b1
1111    9/2/2016    9/2/2016    b2  c1
1111    9/3/2016    9/30/2016   b2  d1
1111    10/1/2016   11/30/2016  c2  d1
1111    12/1/2016   12/15/2016  d2  d1
1111    12/16/2016  12/31/2016  d2  NA

See the expected results and actual results above ... I couldn't display them neatly in a table here.

qdan
  • 45
  • 6
  • I wish I had a solution for you. This is something I came across and wondered if it might be useful or could be adapted for your needs: https://stackoverflow.com/a/8887830/3460670 – Ben Oct 13 '19 at 01:08
  • Thanks for your suggestion Ben. I checked out the page and it looks like they are trying to accomplish the same thing. Unfortunately, when I run it with my sample code, it's not 100% accurate. However, I hadn't thought of using SQLDF, so I'll look into that. Thanks again. – qdan Oct 14 '19 at 17:09
  • you have 2016-12-16 and 2016-02-29 (Y-m-d) as `from_date` in your expected output, but I don't find them in either `from` or `to` variables of `dt1` or `dt2`. Where did they come from? – PavoDive Oct 14 '19 at 22:52
  • Thanks @PavoDive for your question. The 2016-12-16 is the day after the last to_date in dt1 and spans to 12/31/2016, which is the last to_date in dt2. In other words, is is the portion of the last two intervals that do not overlap. The 2016-02-29 is the day after the first to_date in dt, and overlaps with the first interval from dt1. In a minute I'm going to post another solution that I came up with this morning that i hope will make it clearer. Thanks! – qdan Oct 16 '19 at 02:56

4 Answers4

1

Not 100% sure of what you are attempting to do, however, there is a function called crossing which can get you all the permutations across multiple vectors.


> library(tidyr)
> a <- c("2016-01-01", "2016-03-31","2016-09-02", "2016-09-03")
> b <- c("2016-03-15", "2016-09-01", "2016-09-02", "2016-12-15")
> c <- rep(1111, 4)
> crossing(a, b,c)

# A tibble: 16 x 3
   a          b              c
   <chr>      <chr>      <dbl>
 1 2016-01-01 2016-03-15  1111
 2 2016-01-01 2016-09-01  1111
 3 2016-01-01 2016-09-02  1111
 4 2016-01-01 2016-12-15  1111
 5 2016-03-31 2016-03-15  1111
 6 2016-03-31 2016-09-01  1111
 7 2016-03-31 2016-09-02  1111
 8 2016-03-31 2016-12-15  1111
 9 2016-09-02 2016-03-15  1111
10 2016-09-02 2016-09-01  1111
11 2016-09-02 2016-09-02  1111
12 2016-09-02 2016-12-15  1111
13 2016-09-03 2016-03-15  1111
14 2016-09-03 2016-09-01  1111
15 2016-09-03 2016-09-02  1111
16 2016-09-03 2016-12-15  1111

Would this be something along the lines if what you are attempting to achieve?

dswdsyd
  • 576
  • 4
  • 12
  • Thank you for your suggestion. This isn't exactly what I was looking for. I'm looking to generate every possible sequential interval based on the intervals in two tables. Basically, I want the intervals where there is only data from the first table, intervals where the tables overlap, and intervals that only exist in the second table. I don't know how to explain this, but I hope the data set up and the expected (hoped for) output embedded near the top of the code can make it clear. Thanks again. – qdan Oct 13 '19 at 00:24
1

Although @GenericNameNumber answered this question, I found another way to solve my problem that might be easier to understand (although it is memory inefficient compared to the accepted answer). This should take just a few seconds to run if you want to try it.

If anyone has an idea for simple yet efficient code, I'm all ears!

# load packages ----
  library(data.table)    

# create data ----
  rm(list=ls())
  dt1<- data.table(id = rep(1111, 4),
                     from_date = as.Date(c("2016-01-01", "2016-03-31","2016-09-02", "2016-09-03")), 
                     to_date = as.Date(c("2016-03-15", "2016-09-01", "2016-09-02", "2016-12-15")), 
                     progs = c("a1", "b1", "c1", "d1"))
  setkey(dt1, id, from_date, to_date)    

  dt2<- data.table(id = rep(1111, 4),
                     from_date = as.Date(c("2016-02-01", "2016-04-01","2016-11-01", "2016-12-01")), 
                     to_date = as.Date(c("2016-02-28", "2016-09-30", "2016-11-30", "2016-12-31")), 
                     progs = c("a2", "b2", "c2", "d2"))
  setkey(dt2, id, from_date, to_date)        



# Create table with 'intervals' of 1 day duration ----
  dt <- rbind(dt1[,1:3], dt2[,1:3])
  dt[, reps := (to_date - from_date) + 1] # identify the number of days per interval (add one because dates are inclusive)
  dt <- dt[rep(1:.N,reps)] # replicate each row to make 1 row per day of each interval
  dt[,counter:=(1:.N-1),by=c("id", "from_date")] # add a counter (aka index number) for each from date per id
  dt[, c("from_date", "to_date") := from_date + counter] # create intervals of 1 day
  dt[, c("reps", "counter") := NULL] # drop columns no longer needed
  dt <- unique(dt) # de-duplicate rows so each day only appears once
  setkey(dt, id, from_date)    

# merge on dt1 ----
    dt <- foverlaps(x=dt, y=dt1, 
                    by.x = c("id", "from_date", "to_date"), 
                    by.y = c("id", "from_date", "to_date"), 
                    type = "any", mult = "all")
    dt <- dt[, c("from_date", "to_date") := NULL] # drop intervals from dt1 because will use the intervals from dt for merging on dt2 below
    setnames(dt, c("i.from_date", "i.to_date"), c("from_date", "to_date") )
    setcolorder(dt, c("id", "from_date", "to_date"))
    setkey(dt, id, from_date, to_date)    

# merge on dt2 ----
    dt <- foverlaps(x=dt, y=dt2, 
                    by.x = c("id", "from_date", "to_date"), 
                    by.y = c("id", "from_date", "to_date"), 
                    type = "any", mult = "all")
    dt <- dt[, c("from_date", "to_date") := NULL] # drop intervals from dt2 because will use the intervals from dt for merging on dt2 below
    setnames(dt, c("i.from_date", "i.to_date"), c("from_date", "to_date") )
    setcolorder(dt, c("id", "from_date", "to_date"))
    setkey(dt, id, from_date, to_date)       

# Collapse data if dates are contiguous and data are the same ----
    # Create unique ID for data chunks ----
    setnames(dt, c("i.progs", "progs"), c("progs1", "progs2"))
    dt[, group := .GRP, by = c("id", "progs1", "progs2")] # create group id
    dt[, group := cumsum( c(0, diff(group)!=0) )] # in situation like a:a:a:b:b:b:b:a:a:a, want to distinguish first set of "a" from second set of "a"    

    # Create unique ID for contiguous times within a given data chunk ----
    setkey(dt, id, from_date)
    dt[, prev_to_date := c(NA, to_date[-.N]), by = "group"]
    dt[, diff.prev := from_date - prev_to_date] # difference between from_date & prev_to_date will be 1 (day) if they are contiguous
    dt[diff.prev != 1, diff.prev := NA] # set to NA if difference is not 1 day, i.e., it is not contiguous, i.e., it starts a new contiguous chunk
    dt[is.na(diff.prev), contig.id := .I] # Give a unique number for each start of a new contiguous chunk (i.e., section starts with NA)
    setkey(dt, group, from_date) # need to order the data so the following line will work.
    dt[, contig.id  := contig.id[1], by=  .( group , cumsum(!is.na(contig.id))) ] # fill forward by group
    dt[, c("prev_to_date", "diff.prev") := NULL] # drop columns that were just intermediates    

    # Collapse rows where data chunks are constant and time is contiguous ----      
    dt[, from_date := min(from_date), by = c("group", "contig.id")]
    dt[, to_date := max(to_date), by = c("group", "contig.id")]
    dt[, c("group", "contig.id") := NULL]
    dt <- unique(dt)      
qdan
  • 45
  • 6
0

I think i understand what you mean, try this - from Base R:

library("data.table")
dt1<- data.table(id = rep(1111, 4),
                 from_date = as.Date(c("2016-01-01", "2016-03-31","2016-09-02", "2016-09-03")), 
                 to_date = as.Date(c("2016-03-15", "2016-09-01", "2016-09-02", "2016-12-15")), 
                 progs1 = c("a1", "b1", "c1", "d1"))

dt2 <- data.table(id = rep(1111, 4),
                 from_date = as.Date(c("2016-02-01", "2016-04-01","2016-11-01", "2016-12-01")), 
                 to_date = as.Date(c("2016-02-28", "2016-09-30", "2016-11-30", "2016-12-31")), 
                 progs2 = c("a2", "b2", "c2", "d2"))

# Full outer join: 

dt3 <- merge(dt1, dt2, by = intersect(colnames(dt1), colnames(dt2)), all = TRUE)
hello_friend
  • 5,682
  • 1
  • 11
  • 15
  • Many thanks for your suggestion! Unfortunately, this doesn't accomplish what I desire. I was hoping to break up each time intervals into the parts that just from one table, both tables, and the other table. For example, the first row of the results using your code show 2016-01-01 >> 016-03-15. However, this should be broken up into 1/1/2016 >> 1/31/2016 for the first table. 2/1/2016 >> 02/28/2016 for the overlap and 2/29/2016 >> 3/15/2016 for the first table again. Thanks again! – qdan Oct 14 '19 at 15:20
0

It's not pretty but here's a hybrid tidyverse/data.table solution that works. It breaks things up into components:

  1. Full join of all possible data combinations between dt1 and dt2 (by ID).
  2. Identify the type of overlap seen on each row (there are 7 permutations) and set up the overlapping dates (_o suffix)
  3. The overlap types require differing numbers of rows to map out the combined start and end dates. Expand out the data frame to provide the number of rows required for each overlap type.
  4. Create the combined dates (_c suffix) based on the overlap type.
  5. Identify which data set a combined date span applies to (enroll_type = dt1, dt2, or both) then drop rows from a single source (dt1/dt2) that are fully covered by an enroll_type of 'both'.
  6. Because of the sorting by ID + date earlier, you can use lead/lag to truncate the combined dates so no date is covered by more than one startdate_c-enddate_c span.

You can probably find ways to make this more elegant and efficient.

library(data.table)
library(tidyr)

#create test data ----
dt1<- data.table(id = rep(1111, 4),
                 from_date = as.Date(c("2016-01-01", "2016-03-31","2016-09-02", "2016-09-03")), 
                 to_date = as.Date(c("2016-03-15", "2016-09-01", "2016-09-02", "2016-12-15")), 
                 progs = c("a1", "b1", "c1", "d1"))
setkey(dt1, id, from_date, to_date)    

dt2<- data.table(id = rep(1111, 4),
                 from_date = as.Date(c("2016-02-01", "2016-04-01","2016-11-01", "2016-12-01")), 
                 to_date = as.Date(c("2016-02-28", "2016-09-30", "2016-11-30", "2016-12-31")), 
                 progs = c("a2", "b2", "c2", "d2"))
setkey(dt2, id, from_date, to_date)    

# create all possible matches between time segments ----
dt <- setDT(mutate(dt1) %>% full_join(., dt2, by = c("id")) )
#dt[, c("progs.y", "progs.x") := NULL]
#setnames(dt, names(dt), c("id", "startdate_dt1", "enddate_dt1", "startdate_dt2", "enddate_dt2"))
setnames(dt, names(dt), c("id", "startdate_dt1", "enddate_dt1", "progs1", "startdate_dt2", "enddate_dt2", "progs2"))

# set up intervals ----
temp <- dt %>%
  mutate(overlap_type = case_when(
    # First ID the non-matches
    is.na(startdate_dt1) | is.na(startdate_dt2) ~ 0,
    # Then figure out which overlapping date comes first
    # Exactly the same dates
    startdate_dt1 == startdate_dt2 & enddate_dt1 == enddate_dt2 ~ 1,
    # dt1 before dt2 (or exactly the same dates)
    startdate_dt1 <= startdate_dt2 & startdate_dt2 <= enddate_dt1 & 
      enddate_dt1 <= enddate_dt2 ~ 2,
    # dt2 before dt1
    startdate_dt2 <= startdate_dt1 & startdate_dt1 <= enddate_dt2 & 
      enddate_dt2 <= enddate_dt1 ~ 3,
    # dt2 dates competely within dt1 dates or vice versa
    startdate_dt2 >= startdate_dt1 & enddate_dt2 <= enddate_dt1 ~ 4,
    startdate_dt1 >= startdate_dt2 & enddate_dt1 <= enddate_dt2 ~ 5,
    # dt1 coverage only before dt2 (or dt2 only after dt1)
    startdate_dt1 < startdate_dt2 & enddate_dt1 < startdate_dt2 ~ 6,
    # dt1 coverage only after dt2 (or dt2 only before dt1)
    startdate_dt1 > enddate_dt2 & enddate_dt1 > enddate_dt2 ~ 7,
    # Any rows that are left
    TRUE ~ 8),
    # Calculate overlapping dates
    startdate_o = as.Date(case_when(
      overlap_type %in% c(1, 2, 4) ~ startdate_dt2,
      overlap_type %in% c(3, 5) ~ startdate_dt1), origin = "1970-01-01"),
    enddate_o = as.Date(ifelse(overlap_type %in% c(1:5),
                               pmin(enddate_dt2, enddate_dt1),
                               NA), origin = "1970-01-01"),
    # Need to duplicate rows to separate out non-overlapping dt1 and dt2 periods
    repnum = case_when(
      overlap_type %in% c(2:5) ~ 3,
      overlap_type %in% c(6:7) ~ 2,
      TRUE ~ 1)
  ) %>%
  select(id, startdate_dt1, enddate_dt1, startdate_dt2, enddate_dt2, 
         startdate_o, enddate_o, overlap_type, repnum) %>%
  arrange(id, startdate_dt1, startdate_dt2, startdate_o, 
          enddate_dt1, enddate_dt2, enddate_o)


### Expand out rows to separate out overlaps ----
temp_ext <- temp[rep(seq(nrow(temp)), temp$repnum), 1:ncol(temp)]

## process expanded ----
temp_ext <- temp_ext %>% 
  group_by(id, startdate_dt1, enddate_dt1, startdate_dt2, enddate_dt2) %>% 
  mutate(rownum_temp = row_number()) %>%
  ungroup() %>%
  arrange(id, startdate_dt1, enddate_dt1, startdate_dt2, enddate_dt2, startdate_o, 
          enddate_o, overlap_type, rownum_temp) %>%
  mutate(
    # Remove non-overlapping dates
    startdate_dt1 = as.Date(ifelse((overlap_type == 6 & rownum_temp == 2) | 
                                   (overlap_type == 7 & rownum_temp == 1), 
                                 NA, startdate_dt1), origin = "1970-01-01"), 
    enddate_dt1 = as.Date(ifelse((overlap_type == 6 & rownum_temp == 2) | 
                                 (overlap_type == 7 & rownum_temp == 1), 
                               NA, enddate_dt1), origin = "1970-01-01"),
    startdate_dt2 = as.Date(ifelse((overlap_type == 6 & rownum_temp == 1) | 
                                   (overlap_type == 7 & rownum_temp == 2), 
                                 NA, startdate_dt2), origin = "1970-01-01"), 
    enddate_dt2 = as.Date(ifelse((overlap_type == 6 & rownum_temp == 1) | 
                                 (overlap_type == 7 & rownum_temp == 2), 
                               NA, enddate_dt2), origin = "1970-01-01")) %>%
  distinct(id, startdate_dt1, enddate_dt1, startdate_dt2, enddate_dt2, startdate_o, 
           enddate_o, overlap_type, rownum_temp, .keep_all = TRUE) %>%
  # Remove first row if start dates are the same or dt1 is only one day
  filter(!(overlap_type %in% c(2:5) & rownum_temp == 1 & 
             (startdate_dt1 == startdate_dt2 | startdate_dt1 == enddate_dt1))) %>%
  # Remove third row if enddates are the same
  filter(!(overlap_type %in% c(2:5) & rownum_temp == 3 & enddate_dt1 == enddate_dt2))

##  Calculate the finalized date columns----
### Calculate finalized date columns
temp_ext <- temp_ext %>%
  # Set up combined dates
  mutate(
    # Start with rows with only dt1 or dt2, or when both sets of dates are identical
    startdate_c = as.Date(
      case_when(
        (!is.na(startdate_dt1) & is.na(startdate_dt2)) | overlap_type == 1 ~ startdate_dt1,
        !is.na(startdate_dt2) & is.na(startdate_dt1) ~ startdate_dt2), origin = "1970-01-01"),
    enddate_c = as.Date(
      case_when(
        (!is.na(enddate_dt1) & is.na(enddate_dt2)) | overlap_type == 1 ~ enddate_dt1,
        !is.na(enddate_dt2) & is.na(enddate_dt1) ~ enddate_dt2), origin = "1970-01-01"),
    # Now look at overlapping rows and rows completely contained within the other data's dates
    startdate_c = as.Date(
      case_when(
        overlap_type %in% c(2, 4) & rownum_temp == 1 ~ startdate_dt1,
        overlap_type %in% c(3, 5) & rownum_temp == 1 ~ startdate_dt2,
        overlap_type %in% c(2:5) & rownum_temp == 2 ~ startdate_o,
        overlap_type %in% c(2:5) & rownum_temp == 3 ~ enddate_o + 1,
        TRUE ~ startdate_c), origin = "1970-01-01"),
    enddate_c = as.Date(
      case_when(
        overlap_type %in% c(2:5) & rownum_temp == 1 ~ lead(startdate_o, 1) - 1,
        overlap_type %in% c(2:5) & rownum_temp == 2 ~ enddate_o,
        overlap_type %in% c(2, 5) & rownum_temp == 3 ~ enddate_dt2,
        overlap_type %in% c(3, 4) & rownum_temp == 3 ~ enddate_dt1,
        TRUE ~ enddate_c), origin = "1970-01-01"),
    # Deal with the last line for each person if it's part of an overlap
    startdate_c = as.Date(ifelse((id != lead(id, 1) | is.na(lead(id, 1))) &
                                   overlap_type %in% c(2:5) & 
                                   enddate_dt1 != enddate_dt2, 
                                 lag(enddate_o, 1) + 1, 
                                 startdate_c), origin = "1970-01-01"),
    enddate_c = as.Date(ifelse((id != lead(id, 1) | is.na(lead(id, 1))) &
                                 overlap_type %in% c(2:5), 
                               pmax(enddate_dt1, enddate_dt2, na.rm = TRUE), 
                               enddate_c), origin = "1970-01-01")
  ) %>%
  arrange(id, startdate_c, enddate_c, startdate_dt1, startdate_dt2, 
          enddate_dt1, enddate_dt2, overlap_type) %>%
  mutate(
    # Identify which type of enrollment this row represents
    enroll_type = 
      case_when(
        (overlap_type == 2 & rownum_temp == 1) | 
          (overlap_type == 3 & rownum_temp == 3) |
          (overlap_type == 6 & rownum_temp == 1) | 
          (overlap_type == 7 & rownum_temp == 2) |
          (overlap_type == 4 & rownum_temp %in% c(1, 3)) |
          (overlap_type == 0 & is.na(startdate_dt2)) ~ "dt1",
        (overlap_type == 3 & rownum_temp == 1) | 
          (overlap_type == 2 & rownum_temp == 3) |
          (overlap_type == 6 & rownum_temp == 2) | 
          (overlap_type == 7 & rownum_temp == 1) | 
          (overlap_type == 5 & rownum_temp %in% c(1, 3)) |
          (overlap_type == 0 & is.na(startdate_dt1)) ~ "dt2",
        overlap_type == 1 | (overlap_type %in% c(2:5) & rownum_temp == 2) ~ "both",
        TRUE ~ "x"
      ),
    # Drop rows from enroll_type == h/m when they are fully covered by an enroll_type == b
    drop = 
      case_when(
        id == lag(id, 1) & !is.na(lag(id, 1)) & 
          startdate_c == lag(startdate_c, 1) & !is.na(lag(startdate_c, 1)) &
          enddate_c >= lag(enddate_c, 1) & !is.na(lag(enddate_c, 1)) & 
          # Fix up quirk from dt1 data where two rows present for the same day
          !(lag(enroll_type, 1) != "dt2" & lag(enddate_dt1, 1) == lag(startdate_dt1, 1)) &
          enroll_type != "both" ~ 1,
        id == lead(id, 1) & !is.na(lead(id, 1)) & 
          startdate_c == lead(startdate_c, 1) & !is.na(lead(startdate_c, 1)) &
          enddate_c <= lead(enddate_c, 1) & !is.na(lead(enddate_c, 1)) & 
          # Fix up quirk from dt1 data where two rows present for the same day
          !(lead(enroll_type, 1) != "dt2" & lead(enddate_dt1, 1) == lead(startdate_dt1, 1)) &
          enroll_type != "both" & lead(enroll_type, 1) == "both" ~ 1,
        # Fix up other oddities when the date range is only one day
        id == lag(id, 1) & !is.na(lag(id, 1)) & 
          startdate_c == lag(startdate_c, 1) & !is.na(lag(startdate_c, 1)) &
          startdate_c == enddate_c & !is.na(startdate_c) & 
          ((enroll_type == "dt2" & lag(enroll_type, 1) %in% c("both", "dt1")) |
             (enroll_type == "dt1" & lag(enroll_type, 1) %in% c("both", "dt2"))) ~ 1,
        id == lag(id, 1) & !is.na(lag(id, 1)) & 
          startdate_c == lag(startdate_c, 1) & !is.na(lag(startdate_c, 1)) &
          startdate_c == enddate_c & !is.na(startdate_c) &
          startdate_dt1 == lag(startdate_dt1, 1) & enddate_dt1 == lag(enddate_dt1, 1) &
          !is.na(startdate_dt1) & !is.na(lag(startdate_dt1, 1)) &
          enroll_type != "both" ~ 1,
        id == lead(id, 1) & !is.na(lead(id, 1)) & 
          startdate_c == lead(startdate_c, 1) & !is.na(lead(startdate_c, 1)) &
          startdate_c == enddate_c & !is.na(startdate_c) &
          ((enroll_type == "dt2" & lead(enroll_type, 1) %in% c("both", "dt1")) |
             (enroll_type == "dt1" & lead(enroll_type, 1) %in% c("both", "dt2"))) ~ 1,
        # Drop rows where the enddate_c < startdate_c due to 
        # both data sources' dates ending at the same time
        enddate_c < startdate_c ~ 1,
        TRUE ~ 0
      )
  ) %>%
  filter(drop == 0 | is.na(drop)) %>%
  # Truncate remaining overlapping end dates
  mutate(enddate_c = as.Date(
    ifelse(id == lead(id, 1) & !is.na(lead(startdate_c, 1)) &
             startdate_c < lead(startdate_c, 1) &
             enddate_c >= lead(enddate_c, 1),
           lead(startdate_c, 1) - 1,
           enddate_c),
    origin = "1970-01-01")
  ) %>%
  select(-drop, -repnum, -rownum_temp) %>%
  # With rows truncated, now additional rows with enroll_type == h/m that 
  # are fully covered by an enroll_type == b
  # Also catches single day rows that now have enddate < startdate
  mutate(
    drop = case_when(
      id == lag(id, 1) & startdate_c == lag(startdate_c, 1) &
        enddate_c == lag(enddate_c, 1) & lag(enroll_type, 1) == "both" & 
        enroll_type != "both" ~ 1,
      id == lead(id, 1) & startdate_c == lead(startdate_c, 1) &
        enddate_c <= lead(enddate_c, 1) & lead(enroll_type, 1) == "both" ~ 1,
      id == lag(id, 1) & startdate_c >= lag(startdate_c, 1) &
        enddate_c <= lag(enddate_c, 1) & enroll_type != "both" &
        lag(enroll_type, 1) == "both" ~ 1,
      id == lead(id, 1) & startdate_c >= lead(startdate_c, 1) &
        enddate_c <= lead(enddate_c, 1) & enroll_type != "both" &
        lead(enroll_type, 1) == "both" ~ 1,
      TRUE ~ 0)
  ) %>%
  filter(drop == 0 | is.na(drop)) %>%
  select(id, startdate_c, enddate_c, enroll_type)
marc_s
  • 732,580
  • 175
  • 1,330
  • 1,459