I have data with the following structure in BigQuery:
select
1 as user_id, date('2021-01-01') as transaction_date, 1 as value
union all (select 1, '2021-01-02', 2)
union all (select 1, '2021-01-05', 2)
union all (select 1, '2021-02-01', 2)
union all (select 1, '2021-02-03', 2)
union all (select 2, '2021-01-02', 2)
union all (select 2, '2021-02-01', 2)
union all (select 2, '2021-02-03', 3)
I want, for every user and month, to fill in the missing values between the first and last days of the month that we have data in. The output for this pair of users should be:
1,2021-01-01,1
1,2021-01-02,2
1,2021-01-03,null
1,2021-01-04,null
1,2021-01-05,2
1,2021-02-01,2
1,2021-02-02,null
1,2021-02-03,2
2,2021-01-02,2
2,2021-02-01,2
2,2021-02-02,null
2,2021-02-03,3
What's the simplest way to do this in bigquery?
CodePudding user response:
One way of doing it would be using GENERATE_ARRAY()
and UNNEST
to make a template, and LEFT OUTER JOIN
ing the template with the original data.
query
WITH org_data AS (
select 1 as user_id, date('2021-01-01') as transaction_date, 1 as value
union all (select 1, '2021-01-02', 2) union all (select 1, '2021-01-05', 2)
union all (select 1, '2021-02-01', 2) union all (select 1, '2021-02-03', 2)
union all (select 2, '2021-01-02', 2) union all (select 2, '2021-02-01', 2)
union all (select 2, '2021-02-03', 3)
),
candidate_user_month AS (
SELECT
user_id,
EXTRACT(YEAR FROM transaction_date) AS transaction_year,
EXTRACT(MONTH FROM transaction_date) AS transaction_month,
COUNT(1) as date_count,
MIN(EXTRACT(DAY FROM transaction_date)) as min_day,
MAX(EXTRACT(DAY FROM transaction_date)) as max_day,
FROM org_data
GROUP BY user_id, transaction_year, transaction_month
HAVING date_count >= 2
),
template AS (
WITH
date_array AS (
SELECT
user_id, transaction_year, transaction_month,
GENERATE_ARRAY(min_day, max_day, 1) as dates,
FROM candidate_user_month
)
SELECT
user_id,
DATE(transaction_year, transaction_month, dates) as transaction_date,
NULL as value_with_null,
FROM date_array, UNNEST(dates) as dates
)
SELECT user_id, transaction_date, value,
FROM template
FULL OUTER JOIN org_data USING (user_id, transaction_date)
ORDER BY user_id, transaction_date
;