Home > database >  Fill nulls with data from previous record within a 10-day lookback window - Snowflake SQL
Fill nulls with data from previous record within a 10-day lookback window - Snowflake SQL

Time:11-11

I'm trying to fill nulls in a table with data from the most recent record within a 10-day lookback window in Snowflake SQL.

CREATE TABLE activities
(
    activity_id       NUMBER,
    activity_datetime DATE,
    offer_id          VARCHAR,
    member_id         NUMBER
);


INSERT INTO activities (activity_id, activity_datetime, offer_id, member_id)
VALUES (1, '2022-10-01', '1111', 10001)
     , (2, '2022-10-05', '5555', 10001)
     , (3, '2022-10-09', NULL, 10001)
     , (4, '2022-10-09', NULL, 10001)
     , (5, '2022-10-13', NULL, 10001)
     , (6, '2022-10-13', NULL, 10001)
     , (7, '2022-10-17', '18887', 10001)
     , (8, '2022-10-21', '23331', 10001)
     , (9, '2022-10-25', '27775', 10001)
     , (10, '2022-10-29', '32219', 10001)
     , (11, '2022-10-01', '1111', 20001)
     , (12, '2022-10-05', '5555', 20001)
     , (13, '2022-10-09', NULL, 20001)
     , (14, '2022-10-09', NULL, 20001)
     , (15, '2022-10-13', NULL, 20001)
     , (16, '2022-10-13', NULL, 20001)
     , (17, '2022-10-17', '18887', 20001)
     , (18, '2022-10-21', '23331', 20001)
     , (19, '2022-10-25', '27775', 20001)
     , (20, '2022-10-29', '32219', 20001);

TABLE:

ACTIVITY_ID ACTIVITY_DATETIME OFFER_ID MEMBER_ID
1 2022-10-01 1111 10001
2 2022-10-05 5555 10001
3 2022-10-09 null 10001
4 2022-10-09 null 10001
5 2022-10-17 null 10001
6 2022-10-17 null 10001
7 2022-10-19 18887 10001
8 2022-10-21 23331 10001
9 2022-10-25 27775 10001
10 2022-10-29 32219 10001
11 2022-10-01 1111 20001
12 2022-10-05 5555 20001
13 2022-10-09 null 20001
14 2022-10-09 null 20001
15 2022-10-17 null 20001
16 2022-10-17 null 20001
17 2022-10-19 18887 20001
18 2022-10-21 23331 20001
19 2022-10-25 27775 20001
20 2022-10-29 32219 20001

DESIRED RESULT:

ACTIVITY_ID ACTIVITY_DATETIME OFFER_ID MEMBER_ID
1 2022-10-01 1111 10001
2 2022-10-05 5555 10001
3 2022-10-09 5555 10001
4 2022-10-09 5555 10001
5 2022-10-17 null 10001
6 2022-10-17 null 10001
7 2022-10-19 18887 10001
8 2022-10-21 23331 10001
9 2022-10-25 27775 10001
10 2022-10-29 32219 10001
11 2022-10-01 1111 20001
12 2022-10-05 5555 20001
13 2022-10-09 5555 20001
14 2022-10-09 5555 20001
15 2022-10-17 null 20001
16 2022-10-17 null 20001
17 2022-10-19 18887 20001
18 2022-10-21 23331 20001
19 2022-10-25 27775 20001
20 2022-10-29 32219 20001

The query below seems to be close but I cannot figure out how to produce the results in an efficient way. The query isn't great because it duplicates results for each of the null rows instead of producing the most recent record within the 10 day lookback window.

WITH activity_nulls AS (SELECT *
                   FROM activities
                   WHERE offer_id IS NULL)
   , activity_non_null AS (SELECT *
                  FROM activities
                  WHERE offer_id IS NOT NULL)
SELECT activity_nulls.activity_id       actvity_id_nulls
     , activity_nulls.activity_datetime dt_nulls
     , activity_non_null.offer_id
     , activity_non_null.activity_datetime
FROM activity_nulls
         INNER JOIN activity_non_null
                    ON activity_non_null.member_id = activity_nulls.member_id
WHERE activity_non_null.activity_datetime BETWEEN DATEADD(DAY, -14, activity_non_null.activity_datetime)
          AND activity_non_null.activity_datetime;

RESULT:

ACTVITY_ID_NULLS DT_NULLS OFFER_ID ACTIVITY_DATETIME
3 2022-10-09 1111 2022-10-01
3 2022-10-09 5555 2022-10-05
4 2022-10-09 1111 2022-10-01
4 2022-10-09 5555 2022-10-05
5 2022-10-13 1111 2022-10-01
5 2022-10-13 5555 2022-10-05
6 2022-10-13 1111 2022-10-01
6 2022-10-13 5555 2022-10-05
13 2022-10-09 5555 2022-10-05
13 2022-10-09 1111 2022-10-01
14 2022-10-09 5555 2022-10-05
14 2022-10-09 1111 2022-10-01
15 2022-10-13 5555 2022-10-05
15 2022-10-13 1111 2022-10-01
16 2022-10-13 5555 2022-10-05
16 2022-10-13 1111 2022-10-01

The query should only be producing a single row with the '5555' activity_id since it is the most recent record within the 10 day lookback window.

CodePudding user response:

Joins alone are probably not going to get you there. This is a problem that calls out for window functions. The following CTE is in two parts. Part 1 - lag() back to the last non-null offer ID. Part 2 - Find out if the lagged non-null value is within 10 days.

with FILLED_DATES as
(
select   activity_id
        ,activity_datetime
        ,offer_id
        ,lag(offer_id, 1, offer_id) ignore nulls over (partition by member_id order by activity_id) as LAGGED_OFFER_ID
        ,conditional_true_event(OFFER_ID is not null) over (partition by member_id order by activity_id) as DATE_GROUP
        ,member_id
from     ACTIVITIES
)
select   activity_id
        ,activity_datetime
        ,case
            when offer_id is null and
                 activity_datetime - min(activity_datetime) over (partition by MEMBER_ID, DATE_GROUP order by activity_id) <= 10 
            then lagged_offer_id
            else offer_id
         end as offer_id
        ,MEMBER_ID
from FILLED_DATES order by ACTIVITY_ID
;
ACTIVITY_ID ACTIVITY_DATETIME OFFER_ID MEMBER_ID
1 2022-10-01 00:00:00 1111 10001
2 2022-10-05 00:00:00 5555 10001
3 2022-10-09 00:00:00 5555 10001
4 2022-10-09 00:00:00 5555 10001
5 2022-10-17 00:00:00 null 10001
6 2022-10-17 00:00:00 null 10001
7 2022-10-19 00:00:00 18887 10001
8 2022-10-21 00:00:00 23331 10001
9 2022-10-25 00:00:00 27775 10001
10 2022-10-29 00:00:00 32219 10001
11 2022-10-01 00:00:00 1111 20001
12 2022-10-05 00:00:00 5555 20001
13 2022-10-09 00:00:00 5555 20001
14 2022-10-09 00:00:00 5555 20001
15 2022-10-17 00:00:00 null 20001
16 2022-10-17 00:00:00 null 20001
17 2022-10-19 00:00:00 18887 20001
18 2022-10-21 00:00:00 23331 20001
19 2022-10-25 00:00:00 27775 20001
20 2022-10-29 00:00:00 32219 20001
  • Related