I have a dataset like this one here:
BP_ID | VALID_FROM | VALID_TO | LIMIT |
---|---|---|---|
1 | 15.05.1999 | 16.07.2000 | 100 |
1 | 01.01.2020 | 10.01.2020 | 100 |
1 | 10.01.2020 | 31.12.9999 | 100 |
that I wanted to "aggregate" to a single record based on the fact there is no variation in LIMIT over time for this BP_ID:
BP_ID | VALID_FROM | VALID_TO | LIMIT |
---|---|---|---|
1 | 15.05.1999 | 16.07.2000 | 100 |
1 | 01.01.2020 | 31.12.9999 | 100 |
I thought a good idea to achieve this is to use a hierarchical query and find the root valid_from. So I tried:
with pretab as (
select 1 as bp_id,
to_date('15.05.1999', 'dd.mm.yyyy') as valid_from,
to_date('16.07.2000', 'dd.mm.yyyy') as valid_to,
100 as limit
from dual
union
select 1 as bp_id,
to_date('01.01.2020', 'dd.mm.yyyy') as valid_from,
to_date('10.01.2020', 'dd.mm.yyyy') as valid_to,
100 as limit
from dual
union
select 1 as bp_id,
to_date('10.01.2020', 'dd.mm.yyyy') as valid_from,
to_date('31.12.9999', 'dd.mm.yyyy') as valid_to,
100 as limit
from dual
),
pretab2 as (
select t1.*, case when
valid_to != coalesce(lead(valid_from) over (partition by bp_id order by valid_from), to_date('31.12.9999', 'dd.mm.yyyy'))
or limit != coalesce(lead(limit) over (partition by bp_id order by valid_from), 0)
then 1
else 0 end as start_dummy from pretab t1
)
select bp_id, connect_by_root(valid_from), valid_to, limit from pretab2
CONNECT BY PRIOR bp_id = bp_id
and prior trunc(valid_to) = trunc(valid_from)
and prior limit = limit
start with start_dummy = 1;
Unfortunately, the second row does not return 01.01.2020 for the connect_by_root(). Why is that? How would I have to change the query so that it gives me this value?
CodePudding user response:
From Oracle 12, you can use MATCH_RECOGNIZE
to perform row-by-row processing:
with pretab (bp_id, valid_from, valid_to, limit) as (
select 1, DATE '1999-05-15', DATE '2020-01-01', 100 from dual union all
select 1, DATE '2000-01-01', DATE '2000-01-10', 100 from dual union all
select 1, DATE '2000-01-10', DATE '9999-12-31', 100 from dual union all
select 2, DATE '2000-01-01', DATE '2001-12-31', 100 from dual union all
select 2, DATE '2002-01-01', DATE '2002-12-31', 100 from dual union all
select 2, DATE '2003-01-01', DATE '2003-12-31', 200 from dual union all
select 2, DATE '2004-01-01', DATE '2006-12-31', 100 from dual union all
select 2, DATE '2005-01-01', DATE '2005-12-31', 100 from dual
)
SELECT *
FROM pretab
MATCH_RECOGNIZE(
PARTITION BY bp_id
ORDER BY valid_from, valid_to
MEASURES
FIRST(limit) AS limit,
FIRST(valid_from) AS valid_from,
MAX(valid_to) AS valid_to
PATTERN (same_overlapping_limit )
DEFINE
same_overlapping_limit AS
limit = FIRST(limit)
AND valid_from - INTERVAL '1' DAY <= MAX(same_overlapping_limit.valid_to)
)
Which, outputs:
BP_ID LIMIT VALID_FROM VALID_TO 1 100 1999-05-15 00:00:00 9999-12-31 00:00:00 2 100 2000-01-01 00:00:00 2002-12-31 00:00:00 2 200 2003-01-01 00:00:00 2003-12-31 00:00:00 2 100 2004-01-01 00:00:00 2006-12-31 00:00:00
db<>fiddle here
CodePudding user response:
It seems that your start_dummy
calculation is not returning the correct row that should be returned by the query.
If you want to use CONNECT BY
, here is an example that will return the results you are expecting:
WITH
pretab
AS
(SELECT 1 AS bp_id,
TO_DATE ('15.05.1999', 'dd.mm.yyyy') AS valid_from,
TO_DATE ('01.01.2020', 'dd.mm.yyyy') AS valid_to,
100 AS LIMIT
FROM DUAL
UNION
SELECT 1 AS bp_id,
TO_DATE ('01.01.2000', 'dd.mm.yyyy') AS valid_from,
TO_DATE ('10.01.2000', 'dd.mm.yyyy') AS valid_to,
100 AS LIMIT
FROM DUAL
UNION
SELECT 1 AS bp_id,
TO_DATE ('10.01.2000', 'dd.mm.yyyy') AS valid_from,
TO_DATE ('31.12.9999', 'dd.mm.yyyy') AS valid_to,
100 AS LIMIT
FROM DUAL)
SELECT bp_id,
CONNECT_BY_ROOT (valid_from),
valid_to,
LIMIT
FROM pretab
CONNECT BY PRIOR bp_id = bp_id
AND PRIOR TRUNC (valid_to) = TRUNC (valid_from)
AND PRIOR LIMIT = LIMIT
ORDER BY LEVEL DESC
FETCH FIRST 1 ROWS ONLY;
Another solution that will work is to use GROUP BY
. This will only work if you are confident that you will not have any gaps in the dates.
WITH
pretab
AS
(SELECT 1 AS bp_id,
TO_DATE ('15.05.1999', 'dd.mm.yyyy') AS valid_from,
TO_DATE ('01.01.2020', 'dd.mm.yyyy') AS valid_to,
100 AS LIMIT
FROM DUAL
UNION
SELECT 1 AS bp_id,
TO_DATE ('01.01.2000', 'dd.mm.yyyy') AS valid_from,
TO_DATE ('10.01.2000', 'dd.mm.yyyy') AS valid_to,
100 AS LIMIT
FROM DUAL
UNION
SELECT 1 AS bp_id,
TO_DATE ('10.01.2000', 'dd.mm.yyyy') AS valid_from,
TO_DATE ('31.12.9999', 'dd.mm.yyyy') AS valid_to,
100 AS LIMIT
FROM DUAL)
SELECT bp_id,
MIN (valid_from),
MAX (valid_to),
LIMIT
FROM pretab
GROUP BY bp_id, LIMIT;
CodePudding user response:
Again - if your data have no gaps and everything is consecutive as it should be, you could try "sessionization": add a session id that is incremented every time that "limit" (I renamed it to lim as it can become a reserved word) changes.
First, you add a counter that changes if the lim
differs from the previous lim
- using the OLAP LAG()
function.
Second, you use the counter in an outer query to perform a running sum on it. That gives you the session id.
Finally, select from the query with the session id, and get the minimal from date and the maximal to date, grouping by bp_id
, lim
, and session_id
.
WITH
indata(bp_id,valid_from,valid_to,lim) as (
SELECT 1,DATE '1999-05-15',DATE '2020-01-01',100 FROM dual
UNION ALL SELECT 1,DATE '2020-01-01',DATE '2020-01-10',100 FROM dual
UNION ALL SELECT 1,DATE '2020-01-10',DATE '2020-05-15',100 FROM dual
UNION ALL SELECT 1,DATE '2020-05-15',DATE '2021-01-01',200 FROM dual
UNION ALL SELECT 1,DATE '2021-01-01',DATE '2021-01-10',200 FROM dual
UNION ALL SELECT 1,DATE '2021-01-10',DATE '9999-12-31',200 FROM dual
)
,
with_counter AS (
SELECT
bp_id
, valid_from
, valid_to
, lim
, CASE WHEN lim <> LAG(lim) OVER(PARTITION BY bp_id ORDER BY valid_from)
THEN 1
ELSE 0
END AS counter
FROM indata
)
,
with_session_id AS (
SELECT
bp_id
, valid_from
, valid_to
, lim
, SUM(counter) OVER(PARTITION BY bp_id ORDER BY valid_from) AS session_id
FROM with_counter
)
SELECT
bp_id
, MIN(valid_from) AS valid_from
, MAX(valid_to) AS valid_to
, lim
FROM with_session_id
GROUP BY
bp_id
, lim
, session_id;
-- out bp_id | valid_from | valid_to | lim
-- out ------- ------------ ------------ -----
-- out 1 | 1999-05-15 | 2020-05-15 | 100
-- out 1 | 2020-05-15 | 9999-12-31 | 200
CodePudding user response:
Correct CONNECT BY
version could look like
with pretab as (
select 1 as bp_id,
to_date('15.05.1999', 'dd.mm.yyyy') as valid_from,
to_date('01.01.2000', 'dd.mm.yyyy') as valid_to,
100 as limit
from dual
union
select 1 as bp_id,
to_date('01.01.2000', 'dd.mm.yyyy') as valid_from,
to_date('10.01.2000', 'dd.mm.yyyy') as valid_to,
100 as limit
from dual
union
select 1 as bp_id,
to_date('10.01.2000', 'dd.mm.yyyy') as valid_from,
to_date('31.12.9999', 'dd.mm.yyyy') as valid_to,
100 as limit
from dual
union
select 2 as bp_id,
to_date('15.05.1999', 'dd.mm.yyyy') as valid_from,
to_date('01.01.2002', 'dd.mm.yyyy') as valid_to,
200 as limit
from dual
union
select 2 as bp_id,
to_date('01.01.2002', 'dd.mm.yyyy') as valid_from,
to_date('10.01.2002', 'dd.mm.yyyy') as valid_to,
200 as limit
from dual
union
select 2 as bp_id,
to_date('10.01.2002', 'dd.mm.yyyy') as valid_from,
to_date('31.12.9999', 'dd.mm.yyyy') as valid_to,
200 as limit
from dual
),
pretab2 as (
select t1.*, case when
valid_from != lag(valid_to, 1, valid_from - 1) over (partition by bp_id order by valid_from)
or limit != lag(limit, 1, limit - 1) over (partition by bp_id order by valid_from)
then 1
else 0 end as start_dummy
from pretab t1
)
select bp_id, valid_from, valid_to, limit
from (
select bp_id, connect_by_root(valid_from) valid_from, valid_to, limit, row_number() over(partition by bp_id order by level desc) rn
from pretab2
CONNECT BY PRIOR bp_id = bp_id
and prior trunc(valid_to) = trunc(valid_from)
and prior limit = limit
start with start_dummy = 1
) where rn = 1;
Returns
BP_ID VALID_FROM VALID_TO LIMIT
1 1999-05-15 00:00:00 9999-12-31 00:00:00 100
2 1999-05-15 00:00:00 9999-12-31 00:00:00 200