Suppose you have a table with an user id and date time (for simplicity in steps of 1 hour)
The table here is ordered by agent and time stamp.
Usr Date Comment
1 2022-11-29 12:00 <- Start of a sequence
1 2022-11-29 13:00
1 2022-11-29 14:00
1 2022-11-30 12:00 <- Start of a sequence
1 2022-11-30 16:00 <- Start of a sequence
2 2022-11-29 22:00 <- Start of a sequence
2 2022-11-29 23:00
2 2022-11-30 00:00 <- Start of a sequence
2 2022-11-30 01:00
3 2022-11-29 13:00 <- Start of a sequence
3 2022-11-29 14:00
3 2022-11-30 12:00 <- Start of a sequence
3 2022-11-30 13:00
3 2022-11-30 14:00
4 2022-11-30 12:00 <- Start of a sequence
4 2022-11-30 13:00
4 2022-11-30 14:00
5 2022-11-30 16:00 <- Start of a sequence
- Expected result is the start of a sequence and its length.
- For simplicity each gap is 1 hour.
- The start of a new day (00:00) always starts a new sequence
Usr Date Length
1 2022-11-29 12:00 3
1 2022-11-30 12:00 1
1 2022-11-30 16:00 1
2 2022-11-29 22:00 2
2 2022-11-30 00:00 2
3 2022-11-29 13:00 2
3 2022-11-30 12:00 3
4 2022-11-30 12:00 3
5 2022-11-30 16:00 1
I found some code samples with dense_rank
and row_number
but didn't got a result that was expected.
I have a solution running over each record in the source table and and creating the result table, but it is slow.
The query has to run on a SQL 2012 or later.
CodePudding user response:
This problem has many solutions. It is a question of the required query performance. I would recommend to calculate the duration at the moment of recording, if there is a question of speed.
Example of a query that returns the required result:
DROP TABLE IF EXISTS dbo.test;
CREATE TABLE dbo.test
(
Usr INT, [Date] DATETIME);
DECLARE @gap INT = 1;
INSERT INTO dbo.test (Usr, [Date])
VALUES (1, '2022-11-29T12:00:00')
, (1, '2022-11-29T13:00:00')
, (1, '2022-11-29T14:00:00')
, (1, '2022-11-30T12:00:00')
, (1, '2022-11-30T16:00:00')
, (2, '2022-11-29T22:00:00')
, (2, '2022-11-29T23:00:00')
, (2, '2022-11-30T00:00:00')
, (2, '2022-11-30T01:00:00')
, (3, '2022-11-29T13:00:00')
, (3, '2022-11-29T14:00:00')
, (3, '2022-11-30T12:00:00')
, (3, '2022-11-30T13:00:00')
, (3, '2022-11-30T14:00:00')
, (4, '2022-11-30T12:00:00')
, (4, '2022-11-30T13:00:00')
, (4, '2022-11-30T14:00:00')
, (5, '2022-11-30T16:00:00');
WITH lag_cte AS
(
SELECT *
, LAG([Date], 1, [Date]) OVER (PARTITION BY Usr, CAST([Date] AS DATE) ORDER BY [Date]) lead_date --previous time by usr and date
, DATEADD(HOUR, -@gap, [Date]) gap_date --calc same group time for comprassion
, ROW_NUMBER() OVER (ORDER BY Usr, [Date]) rn --sort of identity
FROM dbo.test
)
SELECT lc.Usr
, MIN(lc.[Date]) AS [date]
, COUNT(1) AS [length]
FROM lag_cte lc
OUTER APPLY (--previous start of sequence
SELECT TOP 1 rn AS grouping_rn
FROM lag_cte li
WHERE li.Usr = lc.Usr
AND li.[Date] <= lc.[Date]
AND li.lead_date != li.gap_date --sequence staert marker
ORDER BY li.[Date] DESC
) g
GROUP BY lc.usr, CAST(lc.[Date] AS DATE), g.grouping_rn
CodePudding user response:
As @Arzanis mentioned, there are many solutions that provide the desired result. The example below should perform reasonably well with a composite primary key on Usr
and Date
.
WITH time_sequences AS (
SELECT
Usr
,Date
,LAG(Usr) OVER(PARTITION BY Usr ORDER BY Date) AS PrevUsr
,LAG(Date) OVER(PARTITION BY Usr ORDER BY Date) AS PrevDate
,LEAD(Usr) OVER(PARTITION BY Usr ORDER BY Date) AS NextUsr
,LEAD(Date) OVER(PARTITION BY Usr ORDER BY Date) AS NextDate
FROM dbo.test
)
,start_sequences AS (
SELECT
Usr
,Date
,'start' AS comment
,ROW_NUMBER() OVER(PARTITION BY Usr ORDER BY Date) AS seq
FROM time_sequences
WHERE PrevUsr IS NULL OR PrevDate <> DATEADD(hour, -1, Date) OR CAST(Date AS date) <> CAST(PrevDate AS date)
)
,end_sequences AS (
SELECT
Usr
,Date
,'end' AS comment
,ROW_NUMBER() OVER(PARTITION BY Usr ORDER BY Date) AS seq
FROM time_sequences
WHERE NextUsr IS NULL OR NextDate <> DATEADD(hour, 1, Date) OR CAST(Date AS date) <> CAST(NextDate AS date)
)
SELECT ss.Usr, ss.Date, DATEDIFF(hour, ss.Date, es.Date) 1 AS SeqLength
FROM start_sequences AS ss
JOIN end_sequences AS es ON es.Usr = ss.Usr AND es.seq = ss.seq
ORDER BY
es.Usr
, es.Date;
CodePudding user response:
your data
CREATE TABLE mytable(
Usr INTEGER NOT NULL
,Date DATEtime NOT NULL
,Comment VARCHAR(100)
);
INSERT INTO mytable(Usr,Date,Comment) VALUES
(1,'2022-11-29 12:00','<- Start of a sequence'),
(1,'2022-11-29 13:00',NULL),
(1,'2022-11-29 14:00',NULL),
(1,'2022-11-30 12:00','<- Start of a sequence'),
(1,'2022-11-30 16:00','<- Start of a sequence'),
(2,'2022-11-29 22:00','<- Start of a sequence'),
(2,'2022-11-29 23:00',NULL),
(2,'2022-11-30 00:00','<- Start of a sequence'),
(2,'2022-11-30 01:00',NULL),
(3,'2022-11-29 13:00','<- Start of a sequence'),
(3,'2022-11-29 14:00',NULL),
(3,'2022-11-30 12:00','<- Start of a sequence'),
(3,'2022-11-30 13:00',NULL),
(3,'2022-11-30 14:00',NULL),
(4,'2022-11-30 12:00','<- Start of a sequence'),
(4,'2022-11-30 13:00',NULL),
(4,'2022-11-30 14:00',NULL),
(5,'2022-11-30 16:00','<- Start of a sequence');
your query
select usr,
comment,
count(comment) Length
From (
SELECT
usr
,CASE
WHEN Comment IS NULL THEN (
SELECT TOP 1
cast(inner_table.Date as varchar(100))
FROM
mytable as inner_table
WHERE
inner_table.Usr = mytable.Usr
AND inner_table.Date < mytable.Date
AND inner_table.Comment IS NOT NULL
ORDER BY
inner_table.Date DESC
)
ELSE
Date
END as Comment
FROM
mytable) a
group by usr,comment
order by usr,comment