I need an unique ID for consecutive dates
source table
SNAPSHOT_DATE | CHANNEL | CASE_ID |
---|---|---|
2022-10-18 | web | 521nzT3HQA |
2022-10-19 | web | 521nzT3HQA |
2022-10-20 | web | 521nzT3HQA |
2022-10-23 | web | 521nzT3HQA |
2022-10-24 | web | 521nzT3HQA |
2022-10-25 | web | 521nzT3HQA |
2022-10-18 | phone | 521nzT3HQA |
2022-10-19 | phone | 521nzT3HQA |
2022-10-21 | phone | 521nzT3HQA |
2022-10-22 | phone | 521nzT3HQA |
2022-10-18 | phone | 52LnlJQAS |
2022-10-26 | phone | 52LnlJQAS |
2022-10-20 | phone | 521nzT3HQA |
2022-10-24 | phone | 521nzT3HQA |
2022-10-25 | phone | 521nzT3HQA |
I tried this query
Select snapshot_date, channel,case_id
,case_id||channel||Dateadd('day', -(row_number() over (partition by case_id, channel order by snapshot_date)), snapshot_date 1) as ID
From test
got output
SNAPSHOT_DATE | CHANNEL | CASE_ID | ID |
---|---|---|---|
2022-10-18 | phone | 521nzT3HQA | 521nzT3HQAphone2022-10-18 |
2022-10-19 | phone | 521nzT3HQA | 521nzT3HQAphone2022-10-18 |
2022-10-20 | phone | 521nzT3HQA | 521nzT3HQAphone2022-10-18 |
2022-10-21 | phone | 521nzT3HQA | 521nzT3HQAphone2022-10-18 |
2022-10-22 | phone | 521nzT3HQA | 521nzT3HQAphone2022-10-18 |
2022-10-24 | phone | 521nzT3HQA | 521nzT3HQAphone2022-10-19 |
2022-10-25 | phone | 521nzT3HQA | 521nzT3HQAphone2022-10-19 |
2022-10-18 | web | 521nzT3HQA | 521nzT3HQAweb2022-10-18 |
2022-10-19 | web | 521nzT3HQA | 521nzT3HQAweb2022-10-18 |
2022-10-20 | web | 521nzT3HQA | 521nzT3HQAweb2022-10-18 |
2022-10-23 | web | 521nzT3HQA | 521nzT3HQAweb2022-10-20 |
2022-10-24 | web | 521nzT3HQA | 521nzT3HQAweb2022-10-20 |
2022-10-25 | web | 521nzT3HQA | 521nzT3HQAweb2022-10-20 |
2022-10-18 | phone | 52LnlJQAS | 52LnlJQASphone2022-10-18 |
2022-10-26 | phone | 52LnlJQAS | 52LnlJQASphone2022-10-25 |
expected output
SNAPSHOT_DATE | CHANNEL | CASE_ID | ID |
---|---|---|---|
2022-10-18 | phone | 521nzT3HQA | 521nzT3HQAphone2022-10-18 |
2022-10-19 | phone | 521nzT3HQA | 521nzT3HQAphone2022-10-18 |
2022-10-20 | phone | 521nzT3HQA | 521nzT3HQAphone2022-10-18 |
2022-10-21 | phone | 521nzT3HQA | 521nzT3HQAphone2022-10-18 |
2022-10-22 | phone | 521nzT3HQA | 521nzT3HQAphone2022-10-18 |
2022-10-24 | phone | 521nzT3HQA | 521nzT3HQAphone2022-10-24 |
2022-10-25 | phone | 521nzT3HQA | 521nzT3HQAphone2022-10-24 |
2022-10-18 | web | 521nzT3HQA | 521nzT3HQAweb2022-10-18 |
2022-10-19 | web | 521nzT3HQA | 521nzT3HQAweb2022-10-18 |
2022-10-20 | web | 521nzT3HQA | 521nzT3HQAweb2022-10-18 |
2022-10-23 | web | 521nzT3HQA | 521nzT3HQAweb2022-10-23 |
2022-10-24 | web | 521nzT3HQA | 521nzT3HQAweb2022-10-23 |
2022-10-25 | web | 521nzT3HQA | 521nzT3HQAweb2022-10-23 |
2022-10-18 | phone | 52LnlJQAS | 52LnlJQASphone2022-10-18 |
2022-10-26 | phone | 52LnlJQAS | 52LnlJQASphone2022-10-26 |
CodePudding user response:
This is the typical problem of searching for gaps.
Something like this should work:
WITH sample_data AS (
SELECT SNAPSHOT_DATE::date AS SNAPSHOT_DATE, CHANNEL, CASE_ID
FROM (VALUES ('2022-10-18', 'web', '521nzT3HQA'),
('2022-10-19', 'web', '521nzT3HQA'),
('2022-10-20', 'web', '521nzT3HQA'),
('2022-10-23', 'web', '521nzT3HQA'),
('2022-10-24', 'web', '521nzT3HQA'),
('2022-10-25', 'web', '521nzT3HQA'),
('2022-10-18', 'phone', '521nzT3HQA'),
('2022-10-19', 'phone', '521nzT3HQA'),
('2022-10-21', 'phone', '521nzT3HQA'),
('2022-10-22', 'phone', '521nzT3HQA'),
('2022-10-18', 'phone', '52LnlJQAS' ),
('2022-10-26', 'phone', '52LnlJQAS' ),
('2022-10-20', 'phone', '521nzT3HQA'),
('2022-10-24', 'phone', '521nzT3HQA'),
('2022-10-25', 'phone', '521nzT3HQA')) T(SNAPSHOT_DATE, CHANNEL, CASE_ID)
), grp AS (
SELECT SNAPSHOT_DATE, CHANNEL, CASE_ID
, MIN(SNAPSHOT_DATE) OVER(PARTITION BY CASE_ID, CHANNEL) AS MIN_SNAPSHOT_DATE
, DENSE_RANK() OVER(PARTITION BY CASE_ID, CHANNEL ORDER BY SNAPSHOT_DATE)
DATEDIFF(DAY, SNAPSHOT_DATE, MIN_SNAPSHOT_DATE) AS DIFF
FROM sample_data
)
SELECT SNAPSHOT_DATE, CHANNEL, CASE_ID
, CASE_ID || CHANNEL || MIN(SNAPSHOT_DATE) OVER(PARTITION BY CASE_ID, CHANNEL, DIFF) AS ID
FROM grp
ORDER BY CASE_ID, CHANNEL, SNAPSHOT_DATE;
CodePudding user response:
The following query solves your issue. It utilizes lag for detecting non consecutive dates within a group of channel and case_id. Further a rolling sum is employed to create a flag, which is used for grouping in order to get the oldest snapshot date within a consecutive date range.
select snapshot_date
, channel
, case_id
-- get the minimum date over channel, case_id and the group flag
, case_id ||channel || min(snapshot_date) over (partition by channel, case_id, grp_flag) as compound_id
from (
select sum(non_cons_dt) over( partition by channel, case_id order by snapshot_date rows between unbounded preceding and current row) as grp_flag --rolling sum as group flag, will increment on non consecutive date
, x.*
from (
select
iff(dateadd('day', 1, lag(snapshot_date) over(partition by channel, case_id order by snapshot_date)) = snapshot_date, 0, 1) as non_cons_dt -- set 1 if there is a non consecutive date within the group
, snapshot_date
, channel
, case_id
from test
) x
) y