Home > other >  Find Duplicate Records Whose CreateDate are Off by a Time-Interval
Find Duplicate Records Whose CreateDate are Off by a Time-Interval

Time:11-02

Somehow, duplicate documents are getting created in the database. But...before I can look at HOW they are getting created...I need to understand how VAST the issue is (or isn't). Below is a screen shot of my starting-point

  • 3 Documents got created (for the same GlobalId) within milliseconds of each other
  • I want to COMPARE DocumentCreateDate's (within an INTERVAL) across a related-series of GROUPING's
  • The goal would be to get a list of DocumentId's whose interval is LESS THAN something (say 5 minutes)

enter image description here

  • I know I need to do this recursively
  • I know I am getting closer & closer

...can someone help me finish this?

MY CURRENT SQL:

;WITH CTE_All_Documents AS
(SELECT
    ROW_NUMBER() OVER(PARTITION BY EntityDefaultSearchTerm ORDER BY DocumentId, EntityDefaultSearchTerm) AS RowNumber
    , COUNT(participation.DocumentId) OVER(PARTITION BY EntityDefaultSearchTerm ORDER BY EntityDefaultSearchTerm) AS SubTotal
    , participation.DocumentId
    , participation.EntityDefaultSearchTerm
FROM [dbo].[vDocumentParticipation] participation
WHERE
    participation.EntityCategory = 'Device'
    AND participation.DocumentTypeShortName = 'SCADA'
GROUP BY
    participation.DocumentId
    , participation.EntityDefaultSearchTerm)

SELECT
    RowNumber
    , SubTotal
    , DocumentId
    , EntityDefaultSearchTerm
    , document.CreateDate
FROM CTE_All_Documents orderedDocuments 
JOIN [dbo].[Document] document ON document.Id = orderedDocuments.DocumentId
WHERE 
    SubTotal > 1
    --AND EntityDefaultSearchTerm = 'Jackdaw 2-10H'

enter image description here

TEST DATA:
In case this helps, I tried to create a quick set of test-data...

DECLARE @TestData TABLE (DocumentId INT, EntityDefaultSearchTerm VARCHAR(100), CreateDate DATETIME2(7))


INSERT INTO @TestData SELECT 5481, '093-80126', CAST('2020-11-10T07:18:03.8766667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 9487, '093-80126', CAST('2021-09-17T09:21:45.9733333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 9488, '093-80126', CAST('2021-09-17T09:48:57.8766667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5177, '094-60069', CAST('2020-10-07T12:50:09.5700000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5178, '094-60069', CAST('2020-10-07T12:50:11.2566667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5179, '094-60069', CAST('2020-10-07T12:50:12.5700000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5180, '094-60069', CAST('2020-10-07T12:50:13.5400000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5615, '095-07365', CAST('2020-11-18T10:21:44.1400000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6818, '095-07365', CAST('2021-04-22T08:58:37.6233333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 10782, '095-07365', CAST('2021-10-08T08:18:25.5766667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5769, '099-8014BEN', CAST('2020-11-30T10:51:27.0900000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 9257, '099-8014BEN', CAST('2021-09-01T10:23:39.4333333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 9270, '099-8014BEN', CAST('2021-09-01T10:52:10.2466667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7535, '107759', CAST('2021-06-21T15:36:38.5933333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7568, '107759', CAST('2021-06-25T08:01:08.1133333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7616, '107759', CAST('2021-06-30T08:01:30.5566667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6500, '90879', CAST('2021-03-14T21:03:49.0933333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 8118, '90879', CAST('2021-08-23T08:03:22.8933333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 8123, '90879', CAST('2021-08-23T14:15:22.8633333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6700, '930930018 SITTER B', CAST('2021-04-07T15:23:30.2133333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6926, '930930018 SITTER B', CAST('2021-05-03T14:41:38.4833333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6927, '930930018 SITTER B', CAST('2021-05-03T14:41:38.9700000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6431, '98120601', CAST('2021-03-04T10:37:20.6533333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6904, '98120601', CAST('2021-04-29T13:38:02.6300000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6905, '98120601', CAST('2021-04-29T13:38:03.4733333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6906, '98120601', CAST('2021-04-29T13:38:04.2333333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6182, 'ButterFinger', CAST('2021-01-26T14:27:43.6366667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6936, 'ButterFinger', CAST('2021-05-04T10:07:44.1300000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7869, 'ButterFinger', CAST('2021-07-29T11:40:09.7000000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6183, 'Butterfinger 22-1HGL', CAST('2021-01-26T14:35:34.0966667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6937, 'Butterfinger 22-1HGL', CAST('2021-05-04T10:12:42.9800000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7870, 'Butterfinger 22-1HGL', CAST('2021-07-29T11:42:40'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6117, 'Champion 334', CAST('2021-01-18T12:20:40.9833333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6118, 'Champion 334', CAST('2021-01-18T12:20:41.9166667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 10804, 'Champion 334', CAST('2021-10-11T12:21:02.7500000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 4824, 'EDITH JACKSON 2', CAST('2020-09-03T15:14:10.5133333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5601, 'EDITH JACKSON 2', CAST('2020-11-18T09:54:57.9166667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7574, 'EDITH JACKSON 2', CAST('2021-06-28T09:00:23.2033333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6350, 'Gibbs CDP', CAST('2021-02-23T08:04:00.1433333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6351, 'Gibbs CDP', CAST('2021-02-23T08:11:06.6066667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6352, 'Gibbs CDP', CAST('2021-02-23T08:16:57.6000000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5517, 'Isaacs B-191-4', CAST('2020-11-11T13:39:07.0233333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6489, 'Isaacs B-191-4', CAST('2021-03-11T11:28:55.4500000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7375, 'Isaacs B-191-4', CAST('2021-06-07T11:57:43.4733333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 11003, 'Jackdaw 2-10H', CAST('2021-10-28T08:01:30.4633333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 11004, 'Jackdaw 2-10H', CAST('2021-10-28T08:01:30.6366667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 11005, 'Jackdaw 2-10H', CAST('2021-10-28T08:01:32.9300000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 3923, 'MARSHALL DILLON 25 #1 SALES', CAST('2020-05-21T10:18:33.2733333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 3925, 'MARSHALL DILLON 25 #1 SALES', CAST('2020-05-21T10:49:02.1200000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 4298, 'MARSHALL DILLON 25 #1 SALES', CAST('2020-07-01T09:21:23.9200000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5535, 'MOORE 12-1', CAST('2020-11-12T14:22:18.1200000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6991, 'MOORE 12-1', CAST('2021-05-06T12:33:24.3533333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 10744, 'MOORE 12-1', CAST('2021-10-06T13:04:16.8833333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5605, 'NORTH PT LAVACA LOOP CG', CAST('2020-11-18T10:02:23.4900000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6923, 'NORTH PT LAVACA LOOP CG', CAST('2021-05-03T12:40:16.6800000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 8010, 'NORTH PT LAVACA LOOP CG', CAST('2021-08-09T15:24:11.8233333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 4985, 'PARSELL, WARREN B F-7', CAST('2020-09-24T14:40:53.3766667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6471, 'PARSELL, WARREN B F-7', CAST('2021-03-08T12:25:08.3933333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 8164, 'PARSELL, WARREN B F-7', CAST('2021-08-25T11:37:05.4366667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5389, 'PHP Waha Oasis', CAST('2020-10-28T08:00:46.0666667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5471, 'PHP Waha Oasis', CAST('2020-11-06T10:04:12.6900000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5508, 'PHP Waha Oasis', CAST('2020-11-11T09:15:44.1333333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7497, 'PHP Waha Oasis', CAST('2021-06-16T13:29:31.3466667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6440, 'Rigs Star West', CAST('2021-03-04T14:14:05.2033333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6505, 'Rigs Star West', CAST('2021-03-16T12:44:07.1700000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7920, 'Rigs Star West', CAST('2021-07-30T14:26:31.4533333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 5297, 'Shelton North Offload ', CAST('2020-10-15T07:54:51.3366667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6629, 'Shelton North Offload ', CAST('2021-03-31T10:34:01.4200000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6705, 'Shelton North Offload ', CAST('2021-04-08T11:13:58.7900000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6329, 'Spider Enterprice IC', CAST('2021-02-12T10:48:04.4300000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6335, 'Spider Enterprice IC', CAST('2021-02-16T12:41:03.2366667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 6484, 'Spider Enterprice IC', CAST('2021-03-10T14:43:01.0966667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 8042, 'stn-050226', CAST('2021-08-14T21:24:05.7266667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 8043, 'stn-050226', CAST('2021-08-14T21:24:09.0233333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 8044, 'stn-050226', CAST('2021-08-14T21:24:09.8833333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7394, 'stn-091-45018', CAST('2021-06-09T08:17:29.9233333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7998, 'stn-091-45018', CAST('2021-08-09T11:18:53.4900000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 8030, 'stn-091-45018', CAST('2021-08-12T10:24:27.1033333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 4370, 'stn-093-08280', CAST('2020-07-08T12:24:50.0733333'AS DATETIME2(7))
INSERT INTO @TestData SELECT 4371, 'stn-093-08280', CAST('2020-07-08T12:29:30.8100000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 4372, 'stn-093-08280', CAST('2020-07-08T12:38:41.2966667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7288, 'stn-095-70565', CAST('2021-05-28T08:46:07.9700000'AS DATETIME2(7))
INSERT INTO @TestData SELECT 7739, 'stn-095-70565', CAST('2021-07-15T13:17:59.4066667'AS DATETIME2(7))
INSERT INTO @TestData SELECT 9534, 'stn-095-70565', CAST('2021-09-20T13:57:54.3200000'AS DATETIME2(7))


SELECT * FROM @TestData

CodePudding user response:

You can use lead() and lag() to show all records that have a CreateDate within 5 minutes (or whatever interval you like) of a neighboring record:

with delta as (
    select *
          ,prevDt =  lag(CreateDate) over (partition by EntityDefaultSearchTerm order by CreateDate)
          ,nextDt = lead(CreateDate) over (partition by EntityDefaultSearchTerm order by CreateDate)
    from @TestData
)
select DocumentId, EntityDefaultSearchTerm, CreateDate
from delta
     cross apply (values (5 * 60)) span(sec)
where datediff(second,prevDt    ,CreateDate) <= span.sec
   or datediff(second,CreateDate,nextDt    ) <= span.sec
order by EntityDefaultSearchTerm,CreateDate
;

Results from your test data set:

| DocumentId | EntityDefaultSearchTerm | CreateDate                  |
 ------------ ------------------------- ----------------------------- 
|       5177 | 094-60069               | 2020-10-07 12:50:09.5700000 |
|       5178 | 094-60069               | 2020-10-07 12:50:11.2566667 |
|       5179 | 094-60069               | 2020-10-07 12:50:12.5700000 |
|       5180 | 094-60069               | 2020-10-07 12:50:13.5400000 |
|       6926 | 930930018 SITTER B      | 2021-05-03 14:41:38.4833333 |
|       6927 | 930930018 SITTER B      | 2021-05-03 14:41:38.9700000 |
|       6904 | 98120601                | 2021-04-29 13:38:02.6300000 |
|       6905 | 98120601                | 2021-04-29 13:38:03.4733333 |
|       6906 | 98120601                | 2021-04-29 13:38:04.2333333 |
|       6117 | Champion 334            | 2021-01-18 12:20:40.9833333 |
|       6118 | Champion 334            | 2021-01-18 12:20:41.9166667 |
|      11003 | Jackdaw 2-10H           | 2021-10-28 08:01:30.4633333 |
|      11004 | Jackdaw 2-10H           | 2021-10-28 08:01:30.6366667 |
|      11005 | Jackdaw 2-10H           | 2021-10-28 08:01:32.9300000 |
|      11019 | Jackdaw 2-10H           | 2021-10-28 08:06:25.6700000 | <-- the extra one
|       8042 | stn-050226              | 2021-08-14 21:24:05.7266667 |
|       8043 | stn-050226              | 2021-08-14 21:24:09.0233333 |
|       8044 | stn-050226              | 2021-08-14 21:24:09.8833333 |
|       4370 | stn-093-08280           | 2020-07-08 12:24:50.0733333 |
|       4371 | stn-093-08280           | 2020-07-08 12:29:30.8100000 |

NOTE: I threw in an extra "Jackdaw 2-10H" to illustrate that the notion of a group of records is somewhat ambiguous. With the addition of document 11019, document 11005 belongs to two groups:

  • 11003 through 11005
  • 11005 through 11019

CodePudding user response:

I hope I understood your request, and the following code helps. Try this:

DECLARE @MaxTargetOffset    int =   5 * 60 -- 5 minutes by 60 seconds. comparison will be in seconds)
;
WITH    EntityCTE1  AS
    (
            SELECT 
                    RowNumber   =   ROW_NUMBER() OVER(PARTITION BY EntityDefaultSearchTerm ORDER BY DocumentId, EntityDefaultSearchTerm)
                ,   * 
            FROM @TestData
    )
    ,   EntityCTE2  AS  
    (
            SELECT 
                    RowNumber   =   ROW_NUMBER() OVER(PARTITION BY EntityDefaultSearchTerm ORDER BY DocumentId, EntityDefaultSearchTerm)
                ,   * 
            FROM @TestData
    )
    , EntityOffset  AS
    (
        SELECT  
                DocumentId1 =   E1.DocumentId
            ,   DocumentId2 =   E2.DocumentId
            ,   EntityName  =   E1.EntityDefaultSearchTerm
            ,   CreateDate  =   E1.CreateDate
            ,   TimeOffset  =   DATEDIFF(second, E1.CreateDate, E2.CreateDate)
        FROM    EntityCTE1  E1
        JOIN    EntityCTE2  E2  ON  E2.EntityDefaultSearchTerm      =   E1.EntityDefaultSearchTerm
                                AND CONVERT(date, E2.CreateDate)    =   CONVERT(date, E1.CreateDate)
        WHERE   E1.RowNumber    <   E2.RowNumber
    )

SELECT *
FROM EntityOffset
WHERE   TimeOffset > = @MaxTargetOffset
ORDER BY EntityName, DocumentId1
  • Related