Input
session_id |user_id|session_time
s1 | u1 |2021-01-01 00:00:00
s1 | u1 |2021-01-01 00:00:01
s1 | u1 |2021-01-01 00:00:02
s1 | u1 |2021-01-01 00:00:03
s1 | u2 |2021-01-01 00:00:04
s1 | u2 |2021-01-01 00:00:05
s1 | u2 |2021-01-01 00:00:06
s1 | u2 |2021-01-01 00:00:07
s1 | u1 |2021-01-01 00:00:08
s1 | u1 |2021-01-01 00:00:09
s1 | u1 |2021-01-01 00:00:10
s1 | u1 |2021-01-01 00:00:11
Output
session_id|user_id|session_start_time|session_end_time
s1 |u1 |2021-01-01 0:00:00|2021-01-01 0:00:03
s1 |u2 |2021-01-01 0:00:04|2021-01-01 0:00:07
s1 |u1 |2021-01-01 0:00:08|2021-01-01 0:00:11
We need to write a SQL query to get the above output from the input data without using LEAD() and LAG().
CodePudding user response:
I tried to use running count distinct user session
with mydata as(--test data, use your table instead of this CTE
select 's1' session_id,'u1' user_id,'2021-01-01 0:00:00' session_time union all
select 's1','u1','2021-01-01 0:00:01' union all
select 's1','u1','2021-01-01 0:00:02' union all
select 's1','u1','2021-01-01 0:00:03' union all
select 's1','u2','2021-01-01 0:00:04' union all
select 's1','u2','2021-01-01 0:00:05' union all
select 's1','u2','2021-01-01 0:00:06' union all
select 's1','u2','2021-01-01 0:00:07' union all
select 's1','u1','2021-01-01 0:00:08' union all
select 's1','u1','2021-01-01 0:00:09' union all
select 's1','u1','2021-01-01 0:00:10' union all
select 's1','u1','2021-01-01 0:00:11')
select session_id, user_id, min(session_time) session_start_time, max(session_time)session_end_time
from
(
select session_id, user_id, session_time,
count(distinct session_id, user_id) over (order by session_time) cnt_user_session
from mydata
)s group by session_id, user_id, cnt_user_session
order by session_start_time
Result:
session_id user_id session_start_time session_end_time
s1 u1 2021-01-01 0:00:00 2021-01-01 0:00:03
s1 u2 2021-01-01 0:00:04 2021-01-01 0:00:07
s1 u1 2021-01-01 0:00:08 2021-01-01 0:00:11
It will not wok if the same user repeats again: adding more u2 s1 records will produce the same running count and it will not work, so, the LEAD or LAG is necessary.
Using LAG we can calculate flag when session changes and use running sum of flags to calculate new_session_id (to be used for grouping):
with mydata as(
select 's1' session_id,'u1' user_id,'2021-01-01 0:00:00' session_time union all
select 's1','u1','2021-01-01 0:00:01' union all
select 's1','u1','2021-01-01 0:00:02' union all
select 's1','u1','2021-01-01 0:00:03' union all
select 's1','u2','2021-01-01 0:00:04' union all
select 's1','u2','2021-01-01 0:00:05' union all
select 's1','u2','2021-01-01 0:00:06' union all
select 's1','u2','2021-01-01 0:00:07' union all
select 's1','u1','2021-01-01 0:00:08' union all
select 's1','u1','2021-01-01 0:00:09' union all
select 's1','u1','2021-01-01 0:00:10' union all
select 's1','u1','2021-01-01 0:00:11' union all
select 's1','u2','2021-01-01 0:00:12' --added this record
)
select session_id, user_id, min(session_time) session_start_time, max(session_time)session_end_time
from
(
select session_id, user_id, session_time,
sum(changed_flag) over (order by session_time) new_session_id
from
(
select session_id, user_id, session_time,
case when concat(session_id,'_', user_id) <>
lag(concat(session_id,'_', user_id)) over (order by session_time)
then 1
else 0
end changed_flag
from mydata
)s
) s
group by new_session_id, user_id, session_id
order by session_start_time
Result:
session_id user_id session_start_time session_end_time
s1 u1 2021-01-01 0:00:00 2021-01-01 0:00:03
s1 u2 2021-01-01 0:00:04 2021-01-01 0:00:07
s1 u1 2021-01-01 0:00:08 2021-01-01 0:00:11
s1 u2 2021-01-01 0:00:12 2021-01-01 0:00:12
CodePudding user response:
Here is a way to get a grouping id out of nothing. I usually name the grouping ID obtained in this way with session_id
- but, as you have already used that column name, I name it session_key
.
Two nested OLAP queries:
The first adds a counter that is at 1 when any two columns session_id
or user_id
change over time, otherwise at 0.
Tracking changes over time require either LAG()
or LEAD()
, I'm afraid ...
The second, selecting from the first, creates a running sum of that counter, returning the session_key
.
Finally, I group by session_key
, session_id
and user_id
, and get MAX()
and MIN()
WITH
indata(session_id,user_id,session_time) AS (
SELECT 's1','u1',DATE '2021-01-01 00:00:00'
UNION ALL SELECT 's1','u1',TIMESTAMP '2021-01-01 00:00:01'
UNION ALL SELECT 's1','u1',TIMESTAMP '2021-01-01 00:00:02'
UNION ALL SELECT 's1','u1',TIMESTAMP '2021-01-01 00:00:03'
UNION ALL SELECT 's1','u2',TIMESTAMP '2021-01-01 00:00:04'
UNION ALL SELECT 's1','u2',TIMESTAMP '2021-01-01 00:00:05'
UNION ALL SELECT 's1','u2',TIMESTAMP '2021-01-01 00:00:06'
UNION ALL SELECT 's1','u2',TIMESTAMP '2021-01-01 00:00:07'
UNION ALL SELECT 's1','u1',TIMESTAMP '2021-01-01 00:00:08'
UNION ALL SELECT 's1','u1',TIMESTAMP '2021-01-01 00:00:09'
UNION ALL SELECT 's1','u1',TIMESTAMP '2021-01-01 00:00:10'
)
,
with_counter AS (
SELECT
*
, CASE WHEN (
session_id <> LAG(session_id) OVER(ORDER BY session_time)
OR user_id <> LAG(user_id ) OVER(ORDER BY session_time)
)
THEN 1
ELSE 0
END AS counter
FROM indata
)
,
with_session_key AS (
SELECT
session_id
, user_id
, session_time
, SUM(counter) OVER(ORDER BY session_time) AS session_key
FROM with_counter
)
SELECT
session_key
, session_id
, user_id
, MIN(session_time) AS session_start_time
, MAX(session_time) AS session_end_time
FROM with_session_key
GROUP BY
session_key
, session_id
, user_id
ORDER BY
session_key
, session_id
, user_id
;
-- out session_key | session_id | user_id | session_start_time | session_end_time
-- out ------------- ------------ --------- --------------------- ---------------------
-- out 0 | s1 | u1 | 2021-01-01 00:00:00 | 2021-01-01 00:00:03
-- out 1 | s1 | u2 | 2021-01-01 00:00:04 | 2021-01-01 00:00:07
-- out 2 | s1 | u1 | 2021-01-01 00:00:08 | 2021-01-01 00:00:10