I'm trying to fill nulls in a table with data from the most recent record within a 10-day lookback window in Snowflake SQL.
CREATE TABLE activities
(
activity_id NUMBER,
activity_datetime DATE,
offer_id VARCHAR,
member_id NUMBER
);
INSERT INTO activities (activity_id, activity_datetime, offer_id, member_id)
VALUES (1, '2022-10-01', '1111', 10001)
, (2, '2022-10-05', '5555', 10001)
, (3, '2022-10-09', NULL, 10001)
, (4, '2022-10-09', NULL, 10001)
, (5, '2022-10-13', NULL, 10001)
, (6, '2022-10-13', NULL, 10001)
, (7, '2022-10-17', '18887', 10001)
, (8, '2022-10-21', '23331', 10001)
, (9, '2022-10-25', '27775', 10001)
, (10, '2022-10-29', '32219', 10001)
, (11, '2022-10-01', '1111', 20001)
, (12, '2022-10-05', '5555', 20001)
, (13, '2022-10-09', NULL, 20001)
, (14, '2022-10-09', NULL, 20001)
, (15, '2022-10-13', NULL, 20001)
, (16, '2022-10-13', NULL, 20001)
, (17, '2022-10-17', '18887', 20001)
, (18, '2022-10-21', '23331', 20001)
, (19, '2022-10-25', '27775', 20001)
, (20, '2022-10-29', '32219', 20001);
TABLE:
ACTIVITY_ID | ACTIVITY_DATETIME | OFFER_ID | MEMBER_ID |
---|---|---|---|
1 | 2022-10-01 | 1111 | 10001 |
2 | 2022-10-05 | 5555 | 10001 |
3 | 2022-10-09 | null | 10001 |
4 | 2022-10-09 | null | 10001 |
5 | 2022-10-17 | null | 10001 |
6 | 2022-10-17 | null | 10001 |
7 | 2022-10-19 | 18887 | 10001 |
8 | 2022-10-21 | 23331 | 10001 |
9 | 2022-10-25 | 27775 | 10001 |
10 | 2022-10-29 | 32219 | 10001 |
11 | 2022-10-01 | 1111 | 20001 |
12 | 2022-10-05 | 5555 | 20001 |
13 | 2022-10-09 | null | 20001 |
14 | 2022-10-09 | null | 20001 |
15 | 2022-10-17 | null | 20001 |
16 | 2022-10-17 | null | 20001 |
17 | 2022-10-19 | 18887 | 20001 |
18 | 2022-10-21 | 23331 | 20001 |
19 | 2022-10-25 | 27775 | 20001 |
20 | 2022-10-29 | 32219 | 20001 |
DESIRED RESULT:
ACTIVITY_ID | ACTIVITY_DATETIME | OFFER_ID | MEMBER_ID |
---|---|---|---|
1 | 2022-10-01 | 1111 | 10001 |
2 | 2022-10-05 | 5555 | 10001 |
3 | 2022-10-09 | 5555 | 10001 |
4 | 2022-10-09 | 5555 | 10001 |
5 | 2022-10-17 | null | 10001 |
6 | 2022-10-17 | null | 10001 |
7 | 2022-10-19 | 18887 | 10001 |
8 | 2022-10-21 | 23331 | 10001 |
9 | 2022-10-25 | 27775 | 10001 |
10 | 2022-10-29 | 32219 | 10001 |
11 | 2022-10-01 | 1111 | 20001 |
12 | 2022-10-05 | 5555 | 20001 |
13 | 2022-10-09 | 5555 | 20001 |
14 | 2022-10-09 | 5555 | 20001 |
15 | 2022-10-17 | null | 20001 |
16 | 2022-10-17 | null | 20001 |
17 | 2022-10-19 | 18887 | 20001 |
18 | 2022-10-21 | 23331 | 20001 |
19 | 2022-10-25 | 27775 | 20001 |
20 | 2022-10-29 | 32219 | 20001 |
The query below seems to be close but I cannot figure out how to produce the results in an efficient way. The query isn't great because it duplicates results for each of the null rows instead of producing the most recent record within the 10 day lookback window.
WITH activity_nulls AS (SELECT *
FROM activities
WHERE offer_id IS NULL)
, activity_non_null AS (SELECT *
FROM activities
WHERE offer_id IS NOT NULL)
SELECT activity_nulls.activity_id actvity_id_nulls
, activity_nulls.activity_datetime dt_nulls
, activity_non_null.offer_id
, activity_non_null.activity_datetime
FROM activity_nulls
INNER JOIN activity_non_null
ON activity_non_null.member_id = activity_nulls.member_id
WHERE activity_non_null.activity_datetime BETWEEN DATEADD(DAY, -14, activity_non_null.activity_datetime)
AND activity_non_null.activity_datetime;
RESULT:
ACTVITY_ID_NULLS | DT_NULLS | OFFER_ID | ACTIVITY_DATETIME |
---|---|---|---|
3 | 2022-10-09 | 1111 | 2022-10-01 |
3 | 2022-10-09 | 5555 | 2022-10-05 |
4 | 2022-10-09 | 1111 | 2022-10-01 |
4 | 2022-10-09 | 5555 | 2022-10-05 |
5 | 2022-10-13 | 1111 | 2022-10-01 |
5 | 2022-10-13 | 5555 | 2022-10-05 |
6 | 2022-10-13 | 1111 | 2022-10-01 |
6 | 2022-10-13 | 5555 | 2022-10-05 |
13 | 2022-10-09 | 5555 | 2022-10-05 |
13 | 2022-10-09 | 1111 | 2022-10-01 |
14 | 2022-10-09 | 5555 | 2022-10-05 |
14 | 2022-10-09 | 1111 | 2022-10-01 |
15 | 2022-10-13 | 5555 | 2022-10-05 |
15 | 2022-10-13 | 1111 | 2022-10-01 |
16 | 2022-10-13 | 5555 | 2022-10-05 |
16 | 2022-10-13 | 1111 | 2022-10-01 |
The query should only be producing a single row with the '5555' activity_id
since it is the most recent record within the 10 day lookback window.
CodePudding user response:
Joins alone are probably not going to get you there. This is a problem that calls out for window functions. The following CTE is in two parts. Part 1 - lag() back to the last non-null offer ID. Part 2 - Find out if the lagged non-null value is within 10 days.
with FILLED_DATES as
(
select activity_id
,activity_datetime
,offer_id
,lag(offer_id, 1, offer_id) ignore nulls over (partition by member_id order by activity_id) as LAGGED_OFFER_ID
,conditional_true_event(OFFER_ID is not null) over (partition by member_id order by activity_id) as DATE_GROUP
,member_id
from ACTIVITIES
)
select activity_id
,activity_datetime
,case
when offer_id is null and
activity_datetime - min(activity_datetime) over (partition by MEMBER_ID, DATE_GROUP order by activity_id) <= 10
then lagged_offer_id
else offer_id
end as offer_id
,MEMBER_ID
from FILLED_DATES order by ACTIVITY_ID
;
ACTIVITY_ID | ACTIVITY_DATETIME | OFFER_ID | MEMBER_ID |
---|---|---|---|
1 | 2022-10-01 00:00:00 | 1111 | 10001 |
2 | 2022-10-05 00:00:00 | 5555 | 10001 |
3 | 2022-10-09 00:00:00 | 5555 | 10001 |
4 | 2022-10-09 00:00:00 | 5555 | 10001 |
5 | 2022-10-17 00:00:00 | null | 10001 |
6 | 2022-10-17 00:00:00 | null | 10001 |
7 | 2022-10-19 00:00:00 | 18887 | 10001 |
8 | 2022-10-21 00:00:00 | 23331 | 10001 |
9 | 2022-10-25 00:00:00 | 27775 | 10001 |
10 | 2022-10-29 00:00:00 | 32219 | 10001 |
11 | 2022-10-01 00:00:00 | 1111 | 20001 |
12 | 2022-10-05 00:00:00 | 5555 | 20001 |
13 | 2022-10-09 00:00:00 | 5555 | 20001 |
14 | 2022-10-09 00:00:00 | 5555 | 20001 |
15 | 2022-10-17 00:00:00 | null | 20001 |
16 | 2022-10-17 00:00:00 | null | 20001 |
17 | 2022-10-19 00:00:00 | 18887 | 20001 |
18 | 2022-10-21 00:00:00 | 23331 | 20001 |
19 | 2022-10-25 00:00:00 | 27775 | 20001 |
20 | 2022-10-29 00:00:00 | 32219 | 20001 |