Comparing two sets of date ranges in SQL

I have two datasets with different date ranges.

Tbl 1:  
ID, Date_Start, Date_End
1, 2010-01-01, 2010-01-09
1, 2010-01-10, 2010-01-19
1, 2010-01-30, 2010-01-31

Tbl 2:
ID, Date_Start, Date_End
1, 2010-01-01, 2010-01-04
1, 2010-01-08, 2010-01-17
1, 2010-01-30, 2010-01-31

I would like to find the cases where the date ranges in Tbl 1 are not completely covered by the date ranges in Tbl 2. For example, with the data above I would like the output to look something like this:

Output:
ID, Gap_Start, Gap_End
1, 2010-01-05, 2010-01-07
1, 2010-01-18, 2010-01-19

Date ranges will never overlap within a table. For this, I can use either DB2 SQL or SAS. Unfortunately, the data sets are large enough (millions of records) that I cannot simply brute-force it.

Thanks!

+3
source share
4 answers

Following the approach of Jon of All Trades, this is a more complete solution. Key features are:

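  • Use the calendar helper table, which is just a list of all dates.
  • JOIN it to Tbl1 to get every date that falls inside a Tbl1 range.
  • Anti-JOIN to Tbl2 to discard the dates that fall inside a Tbl2 range.
  • Keep the remaining dates in a Common Table Expression (CTE), OutDates.
  • In a second CTE, EarliestDates, keep the OutDates rows whose previous day is not in OutDates; these are the gap starts.
  • In a third CTE, LatestDates, keep the OutDates rows whose next day is not in OutDates; these are the gap ends.
  • JOIN EarliestDates to LatestDates to pair each gap start with its gap end.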

-- Find the parts of each Tbl1 date range that no Tbl2 range covers, per ID.
-- Calendar(dt) is a helper table that lists dates (one row per date).
-- Every join below is correlated on ID: without that, coverage belonging to
-- one ID would hide gaps of another, and gap starts/ends would be paired
-- across IDs.
WITH
-- (ID, date) pairs inside a Tbl1 range and inside no Tbl2 range of that ID.
OutDates (ID, dt) AS
(   SELECT Tbl1.ID, Calendar.dt
    FROM Calendar
    INNER JOIN Tbl1
        ON Calendar.dt BETWEEN Tbl1.Date_Start AND Tbl1.Date_End
    LEFT OUTER JOIN Tbl2
        ON Tbl2.ID = Tbl1.ID
        AND Calendar.dt BETWEEN Tbl2.Date_Start AND Tbl2.Date_End
    WHERE Tbl2.ID IS NULL
)
,
-- Gap starts: uncovered dates whose previous day is not uncovered for the same ID.
EarliestDates AS
(   SELECT earliest.ID, earliest.dt
    FROM OutDates earliest
    LEFT OUTER JOIN OutDates nonesuch_earlier
        ON nonesuch_earlier.ID = earliest.ID
        AND nonesuch_earlier.dt = DATEADD(day, -1, earliest.dt)
    WHERE nonesuch_earlier.ID IS NULL
)
,
-- Gap ends: uncovered dates whose next day is not uncovered for the same ID.
LatestDates AS
(   SELECT latest.ID, latest.dt
    FROM OutDates latest
    LEFT OUTER JOIN OutDates nonesuch_later
        ON nonesuch_later.ID = latest.ID
        AND nonesuch_later.dt = DATEADD(day, 1, latest.dt)
    WHERE nonesuch_later.ID IS NULL
)
-- Pair each gap start with the nearest gap end at or after it: a pair is kept
-- only when no other start or end of the same ID lies strictly between them.
SELECT rangestart.ID, rangestart.dt AS Gap_Start, rangeend.dt AS Gap_End
FROM EarliestDates rangestart
INNER JOIN LatestDates rangeend
    ON rangeend.ID = rangestart.ID
    AND rangestart.dt <= rangeend.dt
LEFT OUTER JOIN EarliestDates nonesuch_inner1
    ON nonesuch_inner1.ID = rangestart.ID
    AND nonesuch_inner1.dt <= rangeend.dt
    AND nonesuch_inner1.dt > rangestart.dt
LEFT OUTER JOIN LatestDates nonesuch_inner2
    ON nonesuch_inner2.ID = rangestart.ID
    AND nonesuch_inner2.dt >= rangestart.dt
    AND nonesuch_inner2.dt < rangeend.dt
WHERE nonesuch_inner1.dt IS NULL
  AND nonesuch_inner2.dt IS NULL;

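This was written for SQL Server and has not been adapted to DB2: the DATEADD calls would need to be replaced with DB2 date arithmetic (for example, dt - 1 DAY) before it will run there.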

+1

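Here is a SAS data step approach. It relies on two assumptions: (1) both datasets contain the same set of ids in the same sorted order; (2) all dates fall within a known window (here, the year 2010). Within those limits, it reads each dataset only once.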

/* sample input: ONE holds the ranges to check, TWO the ranges checked against */
data one;
  input id1 start1 :anydtdte. finish1 :anydtdte.;
  format start1 finish1 e8601da.;
datalines;
1 2010-01-01 2010-01-09
1 2010-01-10 2010-01-19
1 2010-01-30 2010-01-31
2 2010-01-02 2010-01-10
;
run;

data two;
  input id2 start2 :anydtdte. finish2 :anydtdte.;
  format start2 finish2 e8601da.;
datalines;
1 2010-01-01 2010-01-04
1 2010-01-08 2010-01-17
1 2010-01-30 2010-01-31
2 2010-01-05 2010-01-06
;
run;


/* preconditions:
   (1) datasets one and two hold the same ids, sorted the same way;
   (2) every date lies in calendar year 2010.
   The window is widened by one day on each side so that the scan below
   can look at the previous day and can close a gap ending on 31 Dec.
*/
%let minDate = %sysevalf('01jan2010'd - 1);
%let maxDate = %sysevalf('31dec2010'd + 1);

data gaps;
  keep id1 startGap finishGap;

  /* one flag per day of the window:
     inRange = day lies in a range of ONE, covered = day lies in a range of TWO */
  array inRange[&minDate:&maxDate] _temporary_;
  array covered[&minDate:&maxDate] _temporary_;

  /* temporary arrays keep their values between iterations, so clear them per id */
  do d = &minDate to &maxDate;
    inRange[d] = 0;
    covered[d] = 0;
  end;

  /* read all ranges of the current id from ONE and mark their days */
  do until (last.id1);
    set one;
    by id1;
    do d = start1 to finish1;
      inRange[d] = 1;
    end;
  end;

  /* read all ranges of the current id from TWO and mark their days */
  do until (last.id2);
    set two;
    by id2;
    do d = start2 to finish2;
      covered[d] = 1;
    end;
  end;

  format startGap finishGap e8601da.;
  call missing(startGap, finishGap);

  /* walk the window day by day: open a gap on the first uncovered in-range
     day, close it (and write a row) on the first day that is not one */
  do d = &minDate + 1 to &maxDate;
    if inRange[d] and not covered[d] then do;
      if missing(startGap) then startGap = d;
    end;
    else if not missing(startGap) and not covered[d - 1] then do;
      finishGap = d - 1;
      output;
      call missing(startGap, finishGap);
    end;
  end;
run;

/* check: print the gaps dataset without the observation-number column */
proc print data=gaps noobs;
run; 
/* expected listing output for the test data above:
id1     startGap     finishGap

 1     2010-01-05    2010-01-07
 1     2010-01-18    2010-01-19
 2     2010-01-02    2010-01-04
 2     2010-01-07    2010-01-10
*/
+1

, , , , , :

-- Every (ID, Date) that falls inside a #Ranges1 range, minus every (ID, Date)
-- that falls inside a #Ranges2 range. EXCEPT also removes duplicate rows.
-- NOTE(review): assumes Dates holds one row per calendar day - confirm.
SELECT r1.ID, cal.Date
FROM Dates AS cal
INNER JOIN #Ranges1 AS r1
    ON cal.Date >= r1.StartDate
    AND cal.Date <= r1.EndDate
EXCEPT
SELECT r2.ID, cal.Date
FROM Dates AS cal
INNER JOIN #Ranges2 AS r2
    ON cal.Date >= r2.StartDate
    AND cal.Date <= r2.EndDate

, : , , , . ( , , , ).

+1

, , . , SQL, .

Step 1 - Merge contiguous date ranges within each table. For example, the records

ID, Start_Date, End_Date
1,  2010-01-01, 2010-01-31
1,  2010-02-01, 2010-02-28

become

ID, Start_Date, End_Date
1,  2010-01-01, 2010-02-28.

, ,

-- Merge runs of contiguous Table1 ranges (next Start_date = previous
-- End_date + 1 day) into a single range per Id. Uses DB2 date arithmetic.
WITH Cte_recomb (Id, Start_date, End_date, Hopcount) AS
        -- Anchor: every original range is a chain of length 1.
        (SELECT Id,
                Start_date,
                End_date,
                1 AS Hopcount
         FROM Table1
         UNION ALL
         -- Recursive step: extend a chain with the range that starts the day
         -- after the chain ends. The comma join is kept deliberately.
         -- NOTE(review): DB2 is understood to reject explicit JOIN ... ON
         -- inside a recursive fullselect (SQL0345N) - confirm before changing.
         SELECT Cte_recomb.Id,
                Cte_recomb.Start_date,
                Table1.End_date,
                (Cte_recomb.Hopcount + 1) AS Hopcount
         FROM Cte_recomb, Table1
         WHERE (Cte_recomb.Id = Table1.Id) AND
               (Cte_recomb.End_date + 1 day = Table1.Start_date)),
     -- Longest chain found for each starting range.
     Cte_maxenddate AS
        (SELECT Id,
                Start_date,
                Max (End_date) AS End_date
         FROM Cte_recomb
         GROUP BY Id, Start_date)
-- Keep only chains that are not contained in a chain that starts earlier.
SELECT Maxend.Id,
       Maxend.Start_date,
       Maxend.End_date
FROM    Cte_maxenddate AS Maxend
     LEFT JOIN
        Cte_recomb AS Nextrec
     ON (Nextrec.Id = Maxend.Id) AND
        (Nextrec.Start_date < Maxend.Start_date) AND
        (Nextrec.End_date >= Maxend.End_date)
WHERE Nextrec.Id IS NULL
ORDER BY Maxend.Id, Maxend.Start_date;

Step 2 -

I created another dataset that created a record for each overlap between two datasets. You will need an additional step to search for cases when the specified record in table 1 does not have a corresponding record in table 2 at all.

-- One row per overlapping (Table1 range, Table2 range) pair of the same Id,
-- skipping pairs whose two ranges are identical. Table1 ranges that overlap
-- no Table2 range at all produce no row here (INNER JOIN).
SELECT Table1.Id,
       Table1.Start_date AS Table1_start_date,
       Table1.End_date AS Table1_end_date,
       Table2.Start_date AS Table2_start_date,
       Table2.End_date AS Table2_end_date
FROM    Table1
     INNER JOIN
        Table2
     -- Join on the Id column of both tables, fully qualified: an unqualified
     -- Id is ambiguous because both tables have that column.
     ON (Table1.Id = Table2.Id) AND
        -- the two ranges overlap: one of them starts inside the other
        ( (Table1.Start_date BETWEEN Table2.Start_date AND Table2.End_date) OR
         (Table2.Start_date  BETWEEN Table1.Start_date AND Table1.End_date)) AND
        -- ...and they are not the same range
        ( (Table1.Start_date <> Table2.Start_date) OR
         (Table1.End_date    <> Table2.End_date))
ORDER BY Table1.Id, Table1.Start_date, Table2.Start_date;

Step 3 -

I take the above dataset and run the following SAS job. I tried to do this in pure SQL with recursive queries, but it looked uglier every time I looked at it.

/* Turn the overlap rows of Table1_Compare into gap rows: the parts of each
   Table1 range that the overlapping Table2 ranges leave uncovered.
   Input must be sorted by ID, Table1_Start_Date, Table2_Start_Date.
   A row may produce zero, one or two gap rows. */
Data Table1_Gaps;
  Set Table1_Compare;
  By ID Table1_Start_Date Table2_Start_Date;
  format Gap_Start_Date yymmdd10.;
  format Gap_End_Date   yymmdd10.;
  format Old_End_Date   yymmdd10.;
  /* Old_End_Date carries the Table2 end date of the previous row */
  Retain Old_End_Date;
  /* no Table2 range on this row: the whole Table1 range is a gap */
  IF (Table2_End_Date = .) then do;
      Gap_Start_Date = Table1_Start_Date;
      Gap_End_Date   = Table1_End_Date;
      output;
  end;
  else do;
    If (Table2_Start_Date > Table1_Start_Date) then do;
      /* first overlap of this Table1 range: gap runs from the range start */
      if first.Table1_Start_Date then do;
        Gap_Start_Date = Table1_Start_Date;
        Gap_End_Date   = Table2_Start_Date - 1;
        output;
      end;
      /* later overlap: gap runs from the end of the previous Table2 range */
      else do;
        Gap_Start_Date = Old_End_Date + 1;
        Gap_End_Date   = Table2_Start_Date - 1;
        /* adjacent Table2 ranges leave no uncovered day between them,
           so do not write an empty gap (end before start) */
        if Gap_Start_Date <= Gap_End_Date then output;
      end;
    end;
    /* last overlap ends early: gap runs to the end of the Table1 range */
    If (Table2_End_Date < Table1_End_Date) then do;
      if Last.Table1_Start_Date then do;
        Gap_Start_Date = Table2_End_Date + 1;
        Gap_End_Date   = Table1_End_Date;
        output;
      end;
    end;
  end;
  Old_End_Date   = Table2_End_Date;
  drop Old_End_Date;
run;

I have not yet fully confirmed this, but this approach seems to have given me the results I wanted. Any thoughts?

0
source

Source: https://habr.com/ru/post/1783519/


All Articles