Home > database >  Hierarchical query: connect_by_root does not give root value
Hierarchical query: connect_by_root does not give root value

Time:05-20

I have a dataset like this one here:

BP_ID VALID_FROM VALID_TO LIMIT
1 15.05.1999 16.07.2000 100
1 01.01.2020 10.01.2020 100
1 10.01.2020 31.12.9999 100

that I wanted to "aggregate" to a single record based on the fact there is no variation in LIMIT over time for this BP_ID:

BP_ID VALID_FROM VALID_TO LIMIT
1 15.05.1999 16.07.2000 100
1 01.01.2020 31.12.9999 100

I thought a good idea to achieve this is to use a hierarchical query and find the root valid_from. So I tried:

with pretab as (

select 1 as bp_id, 
       to_date('15.05.1999', 'dd.mm.yyyy') as valid_from,
       to_date('16.07.2000', 'dd.mm.yyyy') as valid_to,
       100 as limit 
  from dual
union
    select 1 as bp_id, 
           to_date('01.01.2020', 'dd.mm.yyyy') as valid_from,
           to_date('10.01.2020', 'dd.mm.yyyy') as valid_to,
           100 as limit 
      from dual
union
    select 1 as bp_id, 
           to_date('10.01.2020', 'dd.mm.yyyy') as valid_from,
          to_date('31.12.9999', 'dd.mm.yyyy') as valid_to,
           100 as limit 
      from dual

),
pretab2 as (
select t1.*, case when 
                  valid_to != coalesce(lead(valid_from) over (partition by bp_id order by valid_from), to_date('31.12.9999', 'dd.mm.yyyy'))
                  or limit !=  coalesce(lead(limit) over (partition by bp_id order by valid_from), 0)                  
              then 1 
              else 0 end as start_dummy from pretab t1
)


select bp_id, connect_by_root(valid_from), valid_to, limit from pretab2
CONNECT BY PRIOR bp_id = bp_id
and prior trunc(valid_to) = trunc(valid_from)
and prior limit = limit
start with start_dummy = 1;

Unfortunately, the second row does not return 01.01.2020 for the connect_by_root(). Why is that? How would I have to change the query so that it gives me this value?

CodePudding user response:

From Oracle 12, you can use MATCH_RECOGNIZE to perform row-by-row processing:

with pretab (bp_id, valid_from, valid_to, limit) as (
  select 1, DATE '1999-05-15', DATE '2020-01-01', 100 from dual union all
  select 1, DATE '2000-01-01', DATE '2000-01-10', 100 from dual union all
  select 1, DATE '2000-01-10', DATE '9999-12-31', 100 from dual union all
  select 2, DATE '2000-01-01', DATE '2001-12-31', 100 from dual union all
  select 2, DATE '2002-01-01', DATE '2002-12-31', 100 from dual union all
  select 2, DATE '2003-01-01', DATE '2003-12-31', 200 from dual union all
  select 2, DATE '2004-01-01', DATE '2006-12-31', 100 from dual union all
  select 2, DATE '2005-01-01', DATE '2005-12-31', 100 from dual
)
SELECT *
FROM   pretab
MATCH_RECOGNIZE(
  PARTITION BY bp_id
  ORDER     BY valid_from, valid_to
  MEASURES
    FIRST(limit)      AS limit,
    FIRST(valid_from) AS valid_from,
    MAX(valid_to)     AS valid_to
  PATTERN (same_overlapping_limit )
  DEFINE
    same_overlapping_limit AS
          limit      = FIRST(limit)
      AND valid_from - INTERVAL '1' DAY <= MAX(same_overlapping_limit.valid_to)
)

Which, outputs:

BP_ID LIMIT VALID_FROM VALID_TO
1 100 1999-05-15 00:00:00 9999-12-31 00:00:00
2 100 2000-01-01 00:00:00 2002-12-31 00:00:00
2 200 2003-01-01 00:00:00 2003-12-31 00:00:00
2 100 2004-01-01 00:00:00 2006-12-31 00:00:00

db<>fiddle here

CodePudding user response:

It seems that your start_dummy calculation is not returning the correct row that should be returned by the query.

If you want to use CONNECT BY, here is an example that will return the results you are expecting:

WITH
    pretab
    AS
        (SELECT 1                                        AS bp_id,
                TO_DATE ('15.05.1999', 'dd.mm.yyyy')     AS valid_from,
                TO_DATE ('01.01.2020', 'dd.mm.yyyy')     AS valid_to,
                100                                      AS LIMIT
           FROM DUAL
         UNION
         SELECT 1                                        AS bp_id,
                TO_DATE ('01.01.2000', 'dd.mm.yyyy')     AS valid_from,
                TO_DATE ('10.01.2000', 'dd.mm.yyyy')     AS valid_to,
                100                                      AS LIMIT
           FROM DUAL
         UNION
         SELECT 1                                        AS bp_id,
                TO_DATE ('10.01.2000', 'dd.mm.yyyy')     AS valid_from,
                TO_DATE ('31.12.9999', 'dd.mm.yyyy')     AS valid_to,
                100                                      AS LIMIT
           FROM DUAL)
    SELECT bp_id,
           CONNECT_BY_ROOT (valid_from),
           valid_to,
           LIMIT
      FROM pretab
CONNECT BY     PRIOR bp_id = bp_id
           AND PRIOR TRUNC (valid_to) = TRUNC (valid_from)
           AND PRIOR LIMIT = LIMIT
  ORDER BY LEVEL DESC
     FETCH FIRST 1 ROWS ONLY;

Another solution that will work is to use GROUP BY. This will only work if you are confident that you will not have any gaps in the dates.

WITH
    pretab
    AS
        (SELECT 1                                        AS bp_id,
                TO_DATE ('15.05.1999', 'dd.mm.yyyy')     AS valid_from,
                TO_DATE ('01.01.2020', 'dd.mm.yyyy')     AS valid_to,
                100                                      AS LIMIT
           FROM DUAL
         UNION
         SELECT 1                                        AS bp_id,
                TO_DATE ('01.01.2000', 'dd.mm.yyyy')     AS valid_from,
                TO_DATE ('10.01.2000', 'dd.mm.yyyy')     AS valid_to,
                100                                      AS LIMIT
           FROM DUAL
         UNION
         SELECT 1                                        AS bp_id,
                TO_DATE ('10.01.2000', 'dd.mm.yyyy')     AS valid_from,
                TO_DATE ('31.12.9999', 'dd.mm.yyyy')     AS valid_to,
                100                                      AS LIMIT
           FROM DUAL)
  SELECT bp_id,
         MIN (valid_from),
         MAX (valid_to),
         LIMIT
    FROM pretab
GROUP BY bp_id, LIMIT;

CodePudding user response:

Again - if your data have no gaps and everything is consecutive as it should be, you could try "sessionization": add a session id that is incremented every time that "limit" (I renamed it to lim as it can become a reserved word) changes. First, you add a counter that changes if the lim differs from the previous lim - using the OLAP LAG() function. Second, you use the counter in an outer query to perform a running sum on it. That gives you the session id. Finally, select from the query with the session id, and get the minimal from date and the maximal to date, grouping by bp_id, lim, and session_id.

WITH
indata(bp_id,valid_from,valid_to,lim) as (                                             
          SELECT 1,DATE '1999-05-15',DATE '2020-01-01',100 FROM dual
UNION ALL SELECT 1,DATE '2020-01-01',DATE '2020-01-10',100 FROM dual
UNION ALL SELECT 1,DATE '2020-01-10',DATE '2020-05-15',100 FROM dual
UNION ALL SELECT 1,DATE '2020-05-15',DATE '2021-01-01',200 FROM dual
UNION ALL SELECT 1,DATE '2021-01-01',DATE '2021-01-10',200 FROM dual
UNION ALL SELECT 1,DATE '2021-01-10',DATE '9999-12-31',200 FROM dual
)
,
with_counter AS (
  SELECT
    bp_id
  , valid_from
  , valid_to
  , lim
  , CASE WHEN lim <> LAG(lim) OVER(PARTITION BY bp_id ORDER BY valid_from)
      THEN 1
      ELSE 0
    END AS counter
  FROM indata
)
,
with_session_id AS (
  SELECT
    bp_id
  , valid_from
  , valid_to
  , lim
  , SUM(counter) OVER(PARTITION BY bp_id ORDER BY valid_from) AS session_id
  FROM with_counter
)
SELECT
  bp_id
, MIN(valid_from) AS valid_from
, MAX(valid_to)   AS valid_to
, lim
FROM with_session_id
GROUP BY
  bp_id
, lim
, session_id;
-- out  bp_id | valid_from |  valid_to  | lim 
-- out ------- ------------ ------------ -----
-- out      1 | 1999-05-15 | 2020-05-15 | 100
-- out      1 | 2020-05-15 | 9999-12-31 | 200

CodePudding user response:

Correct CONNECT BY version could look like

with pretab as (

select 1 as bp_id, 
       to_date('15.05.1999', 'dd.mm.yyyy') as valid_from,
       to_date('01.01.2000', 'dd.mm.yyyy') as valid_to,
       100 as limit 
  from dual
union
    select 1 as bp_id, 
           to_date('01.01.2000', 'dd.mm.yyyy') as valid_from,
           to_date('10.01.2000', 'dd.mm.yyyy') as valid_to,
           100 as limit 
      from dual
union
    select 1 as bp_id, 
           to_date('10.01.2000', 'dd.mm.yyyy') as valid_from,
          to_date('31.12.9999', 'dd.mm.yyyy') as valid_to,
           100 as limit 
      from dual
union
select 2 as bp_id, 
       to_date('15.05.1999', 'dd.mm.yyyy') as valid_from,
       to_date('01.01.2002', 'dd.mm.yyyy') as valid_to,
       200 as limit 
  from dual
union
    select 2 as bp_id, 
           to_date('01.01.2002', 'dd.mm.yyyy') as valid_from,
           to_date('10.01.2002', 'dd.mm.yyyy') as valid_to,
           200 as limit 
      from dual
union
    select 2 as bp_id, 
           to_date('10.01.2002', 'dd.mm.yyyy') as valid_from,
          to_date('31.12.9999', 'dd.mm.yyyy') as valid_to,
           200 as limit 
      from dual
),
pretab2 as (
select t1.*, case when 
                  valid_from != lag(valid_to, 1, valid_from - 1) over (partition by bp_id order by valid_from)
                  or limit != lag(limit, 1, limit - 1) over (partition by bp_id order by valid_from)                  
              then 1 
              else 0 end as start_dummy 
from pretab t1
)
select bp_id, valid_from, valid_to, limit
from (
   select bp_id, connect_by_root(valid_from) valid_from, valid_to, limit, row_number() over(partition by bp_id order by level desc) rn
   from pretab2
   CONNECT BY PRIOR bp_id = bp_id
      and prior trunc(valid_to) = trunc(valid_from)
      and prior limit = limit 
   start with start_dummy = 1
) where rn = 1;

Returns

BP_ID   VALID_FROM  VALID_TO    LIMIT
1   1999-05-15 00:00:00 9999-12-31 00:00:00 100
2   1999-05-15 00:00:00 9999-12-31 00:00:00 200
  • Related