Somehow, duplicate documents are getting created in the database. But...before I can look at HOW they are getting created...I need to understand how VAST the issue is (or isn't). Below is a screen shot of my starting-point
- 3 Documents got created (for the same GlobalId) within milliseconds of each other
- I want to COMPARE DocumentCreateDate's (within an INTERVAL) across a related-series of GROUPING's
- The goal would be to get a list of DocumentId's whose interval is LESS THAN something (say 5 minutes)
- I know I need to do this recursively
- I know I am getting closer & closer
...can someone help me finish this?
MY CURRENT SQL:
;WITH CTE_All_Documents AS
(SELECT
ROW_NUMBER() OVER(PARTITION BY EntityDefaultSearchTerm ORDER BY DocumentId, EntityDefaultSearchTerm) AS RowNumber
, COUNT(participation.DocumentId) OVER(PARTITION BY EntityDefaultSearchTerm ORDER BY EntityDefaultSearchTerm) AS SubTotal
, participation.DocumentId
, participation.EntityDefaultSearchTerm
FROM [dbo].[vDocumentParticipation] participation
WHERE
participation.EntityCategory = 'Device'
AND participation.DocumentTypeShortName = 'SCADA'
GROUP BY
participation.DocumentId
, participation.EntityDefaultSearchTerm)
SELECT
RowNumber
, SubTotal
, DocumentId
, EntityDefaultSearchTerm
, document.CreateDate
FROM CTE_All_Documents orderedDocuments
JOIN [dbo].[Document] document ON document.Id = orderedDocuments.DocumentId
WHERE
SubTotal > 1
--AND EntityDefaultSearchTerm = 'Jackdaw 2-10H'
TEST DATA:
In case this helps, I tried to create a quick set of test-data...
DECLARE @TestData TABLE (DocumentId INT, EntityDefaultSearchTerm VARCHAR(100), CreateDate DATETIME2(7))
INSERT INTO @TestData SELECT 5481, '093-80126', CAST('2020-11-10T07:18:03.8766667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 9487, '093-80126', CAST('2021-09-17T09:21:45.9733333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 9488, '093-80126', CAST('2021-09-17T09:48:57.8766667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5177, '094-60069', CAST('2020-10-07T12:50:09.5700000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5178, '094-60069', CAST('2020-10-07T12:50:11.2566667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5179, '094-60069', CAST('2020-10-07T12:50:12.5700000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5180, '094-60069', CAST('2020-10-07T12:50:13.5400000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5615, '095-07365', CAST('2020-11-18T10:21:44.1400000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6818, '095-07365', CAST('2021-04-22T08:58:37.6233333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 10782, '095-07365', CAST('2021-10-08T08:18:25.5766667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5769, '099-8014BEN', CAST('2020-11-30T10:51:27.0900000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 9257, '099-8014BEN', CAST('2021-09-01T10:23:39.4333333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 9270, '099-8014BEN', CAST('2021-09-01T10:52:10.2466667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7535, '107759', CAST('2021-06-21T15:36:38.5933333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7568, '107759', CAST('2021-06-25T08:01:08.1133333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7616, '107759', CAST('2021-06-30T08:01:30.5566667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6500, '90879', CAST('2021-03-14T21:03:49.0933333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 8118, '90879', CAST('2021-08-23T08:03:22.8933333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 8123, '90879', CAST('2021-08-23T14:15:22.8633333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6700, '930930018 SITTER B', CAST('2021-04-07T15:23:30.2133333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6926, '930930018 SITTER B', CAST('2021-05-03T14:41:38.4833333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6927, '930930018 SITTER B', CAST('2021-05-03T14:41:38.9700000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6431, '98120601', CAST('2021-03-04T10:37:20.6533333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6904, '98120601', CAST('2021-04-29T13:38:02.6300000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6905, '98120601', CAST('2021-04-29T13:38:03.4733333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6906, '98120601', CAST('2021-04-29T13:38:04.2333333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6182, 'ButterFinger', CAST('2021-01-26T14:27:43.6366667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6936, 'ButterFinger', CAST('2021-05-04T10:07:44.1300000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7869, 'ButterFinger', CAST('2021-07-29T11:40:09.7000000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6183, 'Butterfinger 22-1HGL', CAST('2021-01-26T14:35:34.0966667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6937, 'Butterfinger 22-1HGL', CAST('2021-05-04T10:12:42.9800000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7870, 'Butterfinger 22-1HGL', CAST('2021-07-29T11:42:40'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6117, 'Champion 334', CAST('2021-01-18T12:20:40.9833333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6118, 'Champion 334', CAST('2021-01-18T12:20:41.9166667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 10804, 'Champion 334', CAST('2021-10-11T12:21:02.7500000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 4824, 'EDITH JACKSON 2', CAST('2020-09-03T15:14:10.5133333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5601, 'EDITH JACKSON 2', CAST('2020-11-18T09:54:57.9166667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7574, 'EDITH JACKSON 2', CAST('2021-06-28T09:00:23.2033333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6350, 'Gibbs CDP', CAST('2021-02-23T08:04:00.1433333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6351, 'Gibbs CDP', CAST('2021-02-23T08:11:06.6066667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6352, 'Gibbs CDP', CAST('2021-02-23T08:16:57.6000000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5517, 'Isaacs B-191-4', CAST('2020-11-11T13:39:07.0233333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6489, 'Isaacs B-191-4', CAST('2021-03-11T11:28:55.4500000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7375, 'Isaacs B-191-4', CAST('2021-06-07T11:57:43.4733333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 11003, 'Jackdaw 2-10H', CAST('2021-10-28T08:01:30.4633333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 11004, 'Jackdaw 2-10H', CAST('2021-10-28T08:01:30.6366667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 11005, 'Jackdaw 2-10H', CAST('2021-10-28T08:01:32.9300000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 3923, 'MARSHALL DILLON 25 #1 SALES', CAST('2020-05-21T10:18:33.2733333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 3925, 'MARSHALL DILLON 25 #1 SALES', CAST('2020-05-21T10:49:02.1200000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 4298, 'MARSHALL DILLON 25 #1 SALES', CAST('2020-07-01T09:21:23.9200000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5535, 'MOORE 12-1', CAST('2020-11-12T14:22:18.1200000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6991, 'MOORE 12-1', CAST('2021-05-06T12:33:24.3533333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 10744, 'MOORE 12-1', CAST('2021-10-06T13:04:16.8833333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5605, 'NORTH PT LAVACA LOOP CG', CAST('2020-11-18T10:02:23.4900000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6923, 'NORTH PT LAVACA LOOP CG', CAST('2021-05-03T12:40:16.6800000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 8010, 'NORTH PT LAVACA LOOP CG', CAST('2021-08-09T15:24:11.8233333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 4985, 'PARSELL, WARREN B F-7', CAST('2020-09-24T14:40:53.3766667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6471, 'PARSELL, WARREN B F-7', CAST('2021-03-08T12:25:08.3933333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 8164, 'PARSELL, WARREN B F-7', CAST('2021-08-25T11:37:05.4366667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5389, 'PHP Waha Oasis', CAST('2020-10-28T08:00:46.0666667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5471, 'PHP Waha Oasis', CAST('2020-11-06T10:04:12.6900000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5508, 'PHP Waha Oasis', CAST('2020-11-11T09:15:44.1333333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7497, 'PHP Waha Oasis', CAST('2021-06-16T13:29:31.3466667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6440, 'Rigs Star West', CAST('2021-03-04T14:14:05.2033333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6505, 'Rigs Star West', CAST('2021-03-16T12:44:07.1700000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7920, 'Rigs Star West', CAST('2021-07-30T14:26:31.4533333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5297, 'Shelton North Offload ', CAST('2020-10-15T07:54:51.3366667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6629, 'Shelton North Offload ', CAST('2021-03-31T10:34:01.4200000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6705, 'Shelton North Offload ', CAST('2021-04-08T11:13:58.7900000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6329, 'Spider Enterprice IC', CAST('2021-02-12T10:48:04.4300000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6335, 'Spider Enterprice IC', CAST('2021-02-16T12:41:03.2366667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6484, 'Spider Enterprice IC', CAST('2021-03-10T14:43:01.0966667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 8042, 'stn-050226', CAST('2021-08-14T21:24:05.7266667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 8043, 'stn-050226', CAST('2021-08-14T21:24:09.0233333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 8044, 'stn-050226', CAST('2021-08-14T21:24:09.8833333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7394, 'stn-091-45018', CAST('2021-06-09T08:17:29.9233333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7998, 'stn-091-45018', CAST('2021-08-09T11:18:53.4900000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 8030, 'stn-091-45018', CAST('2021-08-12T10:24:27.1033333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 4370, 'stn-093-08280', CAST('2020-07-08T12:24:50.0733333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 4371, 'stn-093-08280', CAST('2020-07-08T12:29:30.8100000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 4372, 'stn-093-08280', CAST('2020-07-08T12:38:41.2966667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7288, 'stn-095-70565', CAST('2021-05-28T08:46:07.9700000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7739, 'stn-095-70565', CAST('2021-07-15T13:17:59.4066667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 9534, 'stn-095-70565', CAST('2021-09-20T13:57:54.3200000'AS DATETIME2(7))
SELECT * FROM @TestData
CodePudding user response:
You can use lead()
and lag()
to show all records that have a CreateDate
within 5 minutes (or whatever interval you like) of a neighboring record:
with delta as (
select *
,prevDt = lag(CreateDate) over (partition by EntityDefaultSearchTerm order by CreateDate)
,nextDt = lead(CreateDate) over (partition by EntityDefaultSearchTerm order by CreateDate)
from @TestData
)
select DocumentId, EntityDefaultSearchTerm, CreateDate
from delta
cross apply (values (5 * 60)) span(sec)
where datediff(second,prevDt ,CreateDate) <= span.sec
or datediff(second,CreateDate,nextDt ) <= span.sec
order by EntityDefaultSearchTerm,CreateDate
;
Results from your test data set:
| DocumentId | EntityDefaultSearchTerm | CreateDate |
------------ ------------------------- -----------------------------
| 5177 | 094-60069 | 2020-10-07 12:50:09.5700000 |
| 5178 | 094-60069 | 2020-10-07 12:50:11.2566667 |
| 5179 | 094-60069 | 2020-10-07 12:50:12.5700000 |
| 5180 | 094-60069 | 2020-10-07 12:50:13.5400000 |
| 6926 | 930930018 SITTER B | 2021-05-03 14:41:38.4833333 |
| 6927 | 930930018 SITTER B | 2021-05-03 14:41:38.9700000 |
| 6904 | 98120601 | 2021-04-29 13:38:02.6300000 |
| 6905 | 98120601 | 2021-04-29 13:38:03.4733333 |
| 6906 | 98120601 | 2021-04-29 13:38:04.2333333 |
| 6117 | Champion 334 | 2021-01-18 12:20:40.9833333 |
| 6118 | Champion 334 | 2021-01-18 12:20:41.9166667 |
| 11003 | Jackdaw 2-10H | 2021-10-28 08:01:30.4633333 |
| 11004 | Jackdaw 2-10H | 2021-10-28 08:01:30.6366667 |
| 11005 | Jackdaw 2-10H | 2021-10-28 08:01:32.9300000 |
| 11019 | Jackdaw 2-10H | 2021-10-28 08:06:25.6700000 | <-- the extra one
| 8042 | stn-050226 | 2021-08-14 21:24:05.7266667 |
| 8043 | stn-050226 | 2021-08-14 21:24:09.0233333 |
| 8044 | stn-050226 | 2021-08-14 21:24:09.8833333 |
| 4370 | stn-093-08280 | 2020-07-08 12:24:50.0733333 |
| 4371 | stn-093-08280 | 2020-07-08 12:29:30.8100000 |
NOTE: I threw in an extra "Jackdaw 2-10H" to illustrate that the notion of a group of records is somewhat ambiguous. With the addition of document 11019, document 11005 belongs to two groups:
- 11003 through 11005
- 11005 through 11019
CodePudding user response:
I hope I understood your request, and the following code helps. Try this:
DECLARE @MaxTargetOffset int = 5 * 60 -- 5 minutes by 60 seconds. comparison will be in seconds)
;
WITH EntityCTE1 AS
(
SELECT
RowNumber = ROW_NUMBER() OVER(PARTITION BY EntityDefaultSearchTerm ORDER BY DocumentId, EntityDefaultSearchTerm)
, *
FROM @TestData
)
, EntityCTE2 AS
(
SELECT
RowNumber = ROW_NUMBER() OVER(PARTITION BY EntityDefaultSearchTerm ORDER BY DocumentId, EntityDefaultSearchTerm)
, *
FROM @TestData
)
, EntityOffset AS
(
SELECT
DocumentId1 = E1.DocumentId
, DocumentId2 = E2.DocumentId
, EntityName = E1.EntityDefaultSearchTerm
, CreateDate = E1.CreateDate
, TimeOffset = DATEDIFF(second, E1.CreateDate, E2.CreateDate)
FROM EntityCTE1 E1
JOIN EntityCTE2 E2 ON E2.EntityDefaultSearchTerm = E1.EntityDefaultSearchTerm
AND CONVERT(date, E2.CreateDate) = CONVERT(date, E1.CreateDate)
WHERE E1.RowNumber < E2.RowNumber
)
SELECT *
FROM EntityOffset
WHERE TimeOffset > = @MaxTargetOffset
ORDER BY EntityName, DocumentId1