5

I'm working with a set of date intervals where each interval has a version number and new intervals will frequently overlap old ones, or even be subsets of them. From this data I need to calculate a new set of intervals that shows the most recent version number, at each point in time. Is there a set-based solution to this problem?

Here's an illustration:

Interval 1: 11111111111111111111111      
Interval 2:     2222222222               
Interval 3:   33333333333333             
Interval 4:                     444444444
Interval 5:                   555555555  
Result    : 11333333333333331155555555544

Here is a sample of the data I'm working with:

groupId   startDate  endDate     version
--------  ---------  ----------  ------
1         1/1/2010   1/1/2011    1
1         10/1/2010  7/5/2011    2
1         7/5/2011   8/13/2012   3
1         8/13/2012  12/31/2012  6
1         10/1/2012  11/1/2012   8

... and the desired output:

groupId   startDate  endDate     version
--------  ---------  ----------  ------
1         1/1/2010   10/1/2010   1
1         10/1/2010  7/5/2011    2
1         7/5/2011   8/13/2012   3
1         8/13/2011  10/1/2012   6
1         10/1/2012  11/1/2012   8 << note how version 8 supersedes version 6
1         11/1/2012  12/31/2012  6 << version 6 is split into two records

I haven't found any other examples of this problem, my googling only turns up queries that identify gaps and islands or covering sets.

I think I have an iterative solution (SQL Server 2008). It starts with a temp table for intervals in the result set and defines the start and end points for the range that we want to cover by inserting records with special version numbers. Then, it repeatedly identifies gaps between result set intervals and attempts to fill them with the most recent records from the original data set, until there are no more gaps or no more records to add:

GO
-- Create data set and results table
CREATE TABLE #Data (
     groupId    INT
    ,startDate  DATE
    ,endDate    DATE
    ,versionId  INT
)

INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2007-12-22', '2008-12-22', 8)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2008-12-22', '2009-12-22', 9)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2009-12-22', '2010-12-22', 10)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2010-12-22', '2011-12-22', 11)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2011-01-01', '2011-11-30', 500)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2011-12-22', '2012-12-22', 12)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2012-01-22', '2012-12-22', 13)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2012-01-22', '2012-12-22', 14)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2012-04-22', '2012-12-22', 17)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (1, '2012-04-22', '2012-12-22', 19)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (2, '2010-01-01', '2011-01-01', 1)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (2, '2010-10-01', '2011-07-05', 2)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (2, '2011-07-05', '2012-08-13', 3)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (2, '2012-08-13', '2012-12-31', 6)
INSERT INTO #Data (groupId, startDate, endDate, versionId) VALUES (2, '2012-10-01', '2012-11-01', 8)


CREATE TABLE #Results (
     groupId        VARCHAR(10)
    ,startDate  DATE
    ,endDate    DATE
    ,versionId      BIGINT
)

DECLARE @startDate      DATE
DECLARE @endDate        DATE
DECLARE @placeholderId  BIGINT

SET @startDate = '20030101'
SET @endDate = '20121231'
SET @placeholderId = 999999999999999

INSERT #Results
SELECT DISTINCT
     groupId
    ,CASE WHEN MIN(startDate) < @startDate THEN MIN(startDate) ELSE @startDate END
    ,CASE WHEN MIN(startDate) < @startDate THEN @startDate ELSE MIN(startDate) END
    ,@placeholderId
FROM #data
GROUP BY groupId
UNION ALL
SELECT DISTINCT
     groupId
    ,CASE WHEN MAX(endDate) < @endDate THEN MAX(endDate) ELSE @endDate END
    ,CASE WHEN MAX(endDate) < @endDate THEN @endDate ELSE MAX(endDate) END
    ,@placeholderId
FROM #data
GROUP BY groupId
GO

-- Fill gaps in results table
DECLARE @startDate      DATE
DECLARE @endDate        DATE
DECLARE @placeholderId  BIGINT

SET @startDate = '20030101'
SET @endDate = '20111231'
SET @placeholderId = 999999999999999

DECLARE @counter INT
SET @counter = 0

WHILE @counter < 10
BEGIN
    SET @counter = @counter + 1;
    WITH Gaps AS (
        SELECT
             gs.groupId
            ,gs.startDate
            ,MIN(ge.endDate) as endDate
            ,ROW_NUMBER() OVER (ORDER BY gs.groupId, gs.startDate) as gapId
        FROM (
            SELECT groupId, endDate as startDate
            FROM #Results r1 
            WHERE NOT EXISTS (
                    SELECT * 
                    FROM #Results r2 
                    WHERE r2.groupId = r1.groupId
                        AND r2.versionId <> r1.versionId
                        AND r2.startDate <= r1.endDate
                        AND r2.endDate > r1.endDate
                )
                AND NOT (endDate >= @endDate AND versionId = @placeholderId)
        ) gs
        INNER JOIN (
            SELECT groupId, startDate as endDate
            FROM #Results r1 
            WHERE NOT EXISTS (
                    SELECT * 
                    FROM #Results r2 
                    WHERE r2.groupId = r1.groupId
                        AND r2.versionId <> r1.versionId
                        AND r2.endDate >= r1.startDate
                        AND r2.startDate < r1.startDate
                )
                AND NOT (startDate <= @startDate AND versionId = @placeholderId)
        ) ge
            ON ge.groupId = gs.groupId
            AND ge.endDate >= gs.startDate
        GROUP BY gs.groupId, gs.startDate
    )
    INSERT #Results (
         groupId
        ,startDate
        ,endDate
        ,versionId
    )
    SELECT
         d.groupId
        ,CASE WHEN d.startDate < g.startDate THEN g.startDate ELSE d.startDate END
        ,CASE WHEN d.endDate > g.endDate THEN g.endDate ELSE d.endDate END
        ,d.versionId
    FROM #Data d
    INNER JOIN Gaps g
        ON g.groupId = d.groupId
        AND g.startDate <= d.endDate
        AND g.endDate >= d.startDate
    INNER JOIN (
        SELECT 
             d.groupId
            ,gapId
            ,MAX(d.versionId) as versionId
        FROM #Data d
        INNER JOIN Gaps g
            ON g.groupId = d.groupId
            AND g.startDate <= d.endDate
            AND g.endDate >= d.startDate
        WHERE d.versionId < (
                SELECT MIN(versionId)
                FROM #Results r
                WHERE r.groupId = d.groupId
                    AND (r.startDate = g.endDate OR r.endDate = g.startDate)
            )
            AND NOT EXISTS (
                SELECT *
                FROM #Data dsup
                WHERE dsup.groupId = d.groupId
                    AND dsup.versionId > d.versionId
                    AND dsup.startDate <= d.startDate
                    AND dsup.endDate >= d.endDate
            )
        GROUP BY
             d.groupId
            ,g.gapId
    ) mg
        ON mg.groupId = g.groupId
        AND mg.gapId = g.gapId
        AND mg.versionId = d.versionId
END

SELECT *
FROM #Results
WHERE versionId <> @placeholderId
order by groupId, startDate

A set-based solution would be much more useful, but I've struggled to find one. Any ideas?

Community
  • 1
  • 1

3 Answers3

4
-- create a dates table
create table dates (thedate date primary key clustered);
;with dates(thedate) as (
  select dateadd(yy,years.number,0)+days.number
    from master..spt_values years
    join master..spt_values days
      on days.type='p' and days.number < datepart(dy,dateadd(yy,years.number+1,0)-1)
   where years.type='p' and years.number between 100 and 150
      -- note: 100-150 creates dates in the year range 2000-2050
      --       adjust as required
)
insert dbo.dates select * from dates;

-- for each date, determine the prevailing version
  select t.groupId, d.thedate, max(t.versionId) versionId
    into #tmp1
    from dates d
    join #Data t on t.startDate <= d.thedate and d.thedate <= t.endDate
group by t.groupId, d.thedate;

-- create index to help
create clustered index cix_tmp1 on #tmp1(groupId, thedate, versionId);

-- find the start dates
;with t as (
   select a.*, rn=row_number() over (partition by a.groupId order by a.thedate)
     from #tmp1 a
left join #tmp1 b on b.thedate = dateadd(d,-1,a.thedate) and a.groupId = b.groupId and a.versionId = b.versionId
    where b.versionId is null
)
   select c.groupId, c.thedate startdate, dateadd(d,-1,d.thedate) enddate, c.versionId
     from t c
left join t d on d.rn=c.rn+1 and c.groupId = d.groupId
 order by groupId, startdate;

Of course, you can do everything in "one query" but do it at your peril, as the performance goes down the drain, big time.

DO NOT USE - for academic interest only-

;with dates(thedate) as (
  select dateadd(yy,years.number,0)+days.number
    from master..spt_values years
    join master..spt_values days
      on days.type='p' and days.number < datepart(dy,dateadd(yy,years.number+1,0)-1)
   where years.type='p' and years.number between 100 and 150
      -- note: 100-150 creates dates in the year range 2000-2050
      --       adjust as required
), tmp1 as (
  select t.groupId, d.thedate, max(t.versionId) versionId
    from dates d
    join #Data t on t.startDate <= d.thedate and d.thedate <= t.endDate
group by t.groupId, d.thedate
), t as (
   select a.*, rn=row_number() over (partition by a.groupId order by a.thedate)
     from tmp1 a
left join tmp1 b on b.thedate = dateadd(d,-1,a.thedate) and a.groupId = b.groupId and a.versionId = b.versionId
    where b.versionId is null
)
   select c.groupId, c.thedate startdate, dateadd(d,-1,d.thedate) enddate, c.versionId
     from t c
left join t d on d.rn=c.rn+1 and c.groupId = d.groupId
 order by groupId, startdate;
RichardTheKiwi
  • 105,798
  • 26
  • 196
  • 262
  • Thanks for the quick response! Ran with the test data, the result looks great. I'll run it with my large data set later and post performance results for my iterative solution and your multi-query solution. – ExcelValdez Nov 29 '12 at 21:19
  • If the versions can have gaps in, then the end date calculation fails. This doesn't happen in the example data, though: http://sqlfiddle.com/#!6/ec8dc/1 – Laurence Nov 29 '12 at 21:26
  • @Laurence You're right. I assumed it was something serial and non-breaking. Like Firefox versions where it was 14 for a while, retracted, and reinstated. Or was it 16... but never without a version! – RichardTheKiwi Nov 29 '12 at 21:29
  • This is a bit more robust than the answer I posted, although mine does have the benefit of being a single query and fairly efficient; I've run it on several million segments in a very reasonable run time (~2 minutes on mid-class hardware). This is a problem that comes up a lot in my job and we've built what I think is a rather elegant solution for weaving time segments... I was sad that I missed first post. ;-) – Chipmonkey Nov 29 '12 at 21:34
  • @Chipmonkey Measuring by the number of queries is moot statistics. Try my pattern where you have used your `single query` and you should see some improvements chaining through the indexed and precalculated temp tables. – RichardTheKiwi Nov 29 '12 at 21:37
  • @Laurence Good catch, I will have to look for gaps between versions. – ExcelValdez Nov 29 '12 at 22:17
  • It's not just the number of queries I'm trying to optimize; the fact that it's a single query is a nice effect of the solution approach. I felt it was rather efficient, as it daisy chains date segments without having to do the implicit cartesian join to all dates, which is why your second "academic" example is inefficient. Unfortunately the daisy chaining makes some assumptions, that I may not have time to fix just now. :-( FWIW, a combination of our two approaches should be efficient and fairly easy to implement. – Chipmonkey Nov 29 '12 at 22:21
1

Updated due to some feedback from the comments. I'm not going to worry about the end cases that a few people have pointed out since they've been proven trivial to solve in other Answers, but I wanted to go ahead and get a working version out that didn't require DDL... I figure it's just good to have options. :-)

This code should work:

select nesty.groupId, nesty.startDate, nesty.segment_end_date, Max(bob.versionId)
from(
select starter.groupId, starter.startDate,
coalesce(DATEADD(DAY,-1,ender.startDate),('2012-12-31')) AS segment_end_date
from
(select groupId, startDate, ROW_NUMBER() over (partition by groupID order by startDate) as rownumber from
    (select groupID, startDate from #Data union select groupID, DATEADD(DAY, 1,endDate) as startDate from #Data) xx) starter
left outer join
(select groupId, startDate, ROW_NUMBER() over (partition by groupID order by startDate) as rownumber from
    (select groupID, startDate from #Data union select groupID, DATEADD(DAY, 1,endDate)    as startDate from #Data) xy) ender on
    starter.groupId = ender.groupId and
    starter.rownumber = ender.rownumber - 1
where
starter.startDate<= coalesce(DATEADD(DAY,-1,ender.startDate),('2012-12-31'))
) nesty
left outer join #Data bob on
bob.groupId = nesty.groupId and
nesty.segment_end_date between bob.startDate and bob.endDate
group by nesty.groupId, nesty.startDate, nesty.segment_end_date
order by nesty.groupId, nesty.startDate

There are a couple of tiny caveats I had to do to get it into a single SQL statement. First, the max end date is not dynamic; I hard coded '2012-12-31'. You can replace it with a MAX(endDate), but you can't put that in the GROUP BY statement. If you can do this in a procedure, you can do:

select into @max_end_date MAX(endDate) from #Data

and replace '2012-12-31' with @max_end_date.

Second, I do not guarantee that two adjacent segments won't have the same value! This may or may not be important to you... that is, if you had the following:

Interval 1:       111111      
Interval 2:   22222222222222

Your output would be:

Interval 1:   2222
Interval 2:       2222222222

Still, I think it's worth hitting it in a simple and efficient SQL query. It may not be hard to fix those caveats, but it didn't matter to what I was working on, so I haven't bothered yet.

Chipmonkey
  • 863
  • 7
  • 18
  • This doesn't seem to work with the example data, it should flip back to v6 at the end: http://sqlfiddle.com/#!6/9d2a2/1 – Laurence Nov 29 '12 at 21:34
  • Ah, you're right... I'm bummed about that. Also, per the comments on the other answer, this solution will not work for dates between the earliest start and latest end dates where there are gaps with no versions. Don't have the time to rewrite it now, but I may take a stab later. – Chipmonkey Nov 29 '12 at 21:36
  • I don't know if you've copied something wrong when translating it to the OP's tables, but this just doesn't work, following shouldn't show version 1 at all http://sqlfiddle.com/#!6/5b345/1. Neither example has any gaps. – Laurence Nov 29 '12 at 21:54
  • I can't edit my answer for some reason; StackOverflow is kicking a URL error... let me know if this works, though for now (it works for me, on both the cases you mentioned): http://chiplynch.com/files/20121119_stackoverflow.txt – Chipmonkey Nov 29 '12 at 22:12
  • It's a lot better, but it does have a strange interval with null version in http://sqlfiddle.com/#!6/6e503/1. I think if you just ignore that, though then it's right. – Laurence Nov 29 '12 at 22:32
  • Sorry, that's because I typed the example data in wrong, Yes it works now with anything except gaps: http://sqlfiddle.com/#!6/da30f/1. In fact, I think the gap is correct in the previous fiddle too. – Laurence Nov 29 '12 at 22:52
  • OK, whatever my stackoverflow problem was is fixed (plus I'm back from dinner!). I updated the code in the Answer. I'm not going to mess with the edge cases any more, although they should be easy to fix, since the previous answer was accepted. This is still about as efficient as you'll get without DDL or additional tables or indexes, although the other two answers do certainly have some benefits. – Chipmonkey Nov 30 '12 at 01:33
  • +1, I always find it interesting how many different approaches there are for problems like this. – Laurence Nov 30 '12 at 10:05
0

If the end dates are important, as well as gaps, here's a way you can do it. This solution could also be adapted to work if your versions are datetimes instead of just dates.

First a bunch of functions

One to get the version at a given date

Create Function dbo.VersionAtDate(@GroupID int, @Date datetime) Returns int as
Begin
  Declare @Ret int = Null
  Select
    @Ret = Max(VersionID)
  From
    VersionedIntervals iv
  Where
    iv.GroupID = @GroupID And
    iv.StartDate <= @Date And
    iv.EndDate + 1 > @Date -- if dates were half open intervals this would just be iv.EndDate > @Date
  Return @Ret
End

Next to get the midpoint of two datetimes (minute resolution):

Create Function dbo.Midpoint(@Start datetime, @End datetime) Returns datetime as
Begin
  Return DateAdd(Minute, DateDiff(Minute, @Start, @End) / 2, @Start)
End

Version at a midpoint:

Create Function dbo.VersionAtMidpoint(@GroupID int, @Start datetime, @End datetime) returns int as
Begin
  Return dbo.VersionAtDate(@GroupID, dbo.Midpoint(@Start, @End))
End;

Finally a table valued function to help with the fact that some points are the start of one range and the end of another, and it helps to get two rows from one input for this:

-- returns two rows if a point is the end of one interval and the
-- start of another
Create Function dbo.EndPoints(@GroupID int, @RN bigint, @Start datetime, @End datetime, @Next datetime, @Version int)
Returns @EndPoints Table (
    GroupID int,
    RN bigint,
    Version int,
    StartDate datetime,
    EndDate datetime
) As
Begin
  Declare @NextVersion int, @VersionAtMidpoint int
  Set @NextVersion = dbo.VersionAtDate(@GroupID, @Next)
  If @NextVersion = @Version
    -- interval carries on
    Insert Into @EndPoints Select @GroupID, @RN, @Version, @Start, @Next
  Else
  Begin
    -- interval has ended
    Set @VersionAtMidpoint = dbo.VersionAtMidPoint(@GroupID, @End, @Next)
    If @VersionAtMidpoint != @Version
        -- we have something like this, start a run of 3s (run of 4s is already ended by previous call)
        -- 3333333
        -- 44     
        Insert Into @EndPoints Select @GroupID, @RN, @VersionAtMidpoint, @End, @Next 
    Else
    Begin
        -- We have something like this, end the run of 3s and start the run of fours
        -- 33333
        --   444
        Insert Into @EndPoints Select @GroupID, -1, @Version, @Start, @Next
        Insert Into @EndPoints Select @GroupID, @RN, @NextVersion, @Next, @Next
    End
  End
  Return
End

With all this machinery in place, finally a recursive CTE plust table variable, you'll need to set maxrecursion appropriately:

Declare @Bounds Table (GroupID int, RN bigint, BoundDate datetime, Primary Key (GroupID, RN))

Insert Into
    @Bounds
Select
    GroupID,
    Row_Number() Over (Partition By GroupID Order By BoundDate),
    BoundDate
From (
    Select
        GroupID,
        StartDate As BoundDate
    From
        dbo.VersionedIntervals
    Union
    Select
        GroupID,
        EndDate
    From
        dbo.VersionedIntervals
    ) a

;With VersionedBounds (GroupID, RN, StartDate, EndDate, Version) as (
    Select
        GroupID,
        RN,
        BoundDate,
        BoundDate,
        dbo.VersionAtDate(GroupID, BoundDate)
    From
        @Bounds
    Where
        RN = 1
    Union All
    Select
        e.GroupID,
        e.RN,
        e.StartDate,
        e.EndDate,
        e.Version
    From
        @Bounds b
            Inner Join
        VersionedBounds v
            On v.GroupID = b.GroupID And b.RN = v.RN + 1
            Cross Apply 
        dbo.EndPoints(v.GroupID, b.RN, v.StartDate, v.EndDate, b.BoundDate, v.Version) e
)
Select 
    GroupID,
    StartDate,
    Max(EndDate) As EndDate,
    Max(Version) As Version
From
    VersionedBounds
Group By
    GroupID,
    StartDate
Order By
    GroupID,
    StartDate

http://sqlfiddle.com/#!6/b95bd/2

Laurence
  • 10,896
  • 1
  • 25
  • 34