Home > database >  SQL Dynamically update duplicate row values to be unique
SQL Dynamically update duplicate row values to be unique

Time:10-22

The Problem

I need to update a table so that any duplicate rows are updated to have unique values.

The Catch

I need to dynamically ensure that the value I am updating the duplicate row to is also unique.

My Solution So Far (with test case)

CREATE TABLE #temp (name nvarchar(100), ID uniqueidentifier)

INSERT INTO #temp (Name, ID)
    VALUES ('Duplicate', '32208C09-C0C3-408C-AB60-273811722194')
INSERT INTO #temp (Name, ID)
    VALUES ('Duplicate', '32208C09-C0C3-408C-AB60-273811722194')
INSERT INTO #temp (Name, ID)
    VALUES ('Duplicate (2)', '32208C09-C0C3-408C-AB60-273811722194')

;WITH cte AS (
    SELECT Name
        , ROW_NUMBER() OVER (PARTITION BY Name, ID ORDER BY Name) RowNum
    FROM #temp
)

UPDATE cte
SET Name = CONCAT(Name, ' (', RowNum, ')')
WHERE RowNum > 1

SELECT * FROM #temp

DROP TABLE #temp

As you can tell, this will update the table so there is only one row with the name 'Duplicate' but two rows with the name 'Duplicate (2)'. How can I check and account for duplicates in the value I am updating to?

CodePudding user response:

You could use another CTe which gets you the highest Number and then use that to generate the "next" number.

for 2 or more digits you need to adapt it

CREATE TABLE #temp (name nvarchar(100), AssetMakeID uniqueidentifier)

INSERT INTO #temp (Name, AssetMakeID)
    VALUES ('Duplicate', '32208C09-C0C3-408C-AB60-273811722194')
INSERT INTO #temp (Name, AssetMakeID)
    VALUES ('Duplicate', '32208C09-C0C3-408C-AB60-273811722194')
INSERT INTO #temp (Name, AssetMakeID)
    VALUES ('Duplicate (2)', '32208C09-C0C3-408C-AB60-273811722194')

;WITH CTE1 AS (SELECT
  MAX(      COALESCE(REPLACE(REPLACE(SUBSTRING(name, PATINDEX('%([0-9])%', name), PATINDEX('%)%', name   't') - PATINDEX('%(%', 
                    name)   1),'(','') ,')','') ,0)) hinum
,SUBSTRING(name,1, PATINDEX('% ([0-9])%', name) ) name
FROM #temp
  WHERE SUBSTRING(name,1, PATINDEX('% ([0-9])%', name) )  IS NOT NULL
GROUP BY SUBSTRING(name,1, PATINDEX('% ([0-9])%', name) ) ),
  cte AS (
    SELECT #temp.Name
        , CASE WHEN ROW_NUMBER() OVER (PARTITION BY #temp.Name, AssetMakeID ORDER BY #temp.Name) > 1 THEN
 ROW_NUMBER() OVER (PARTITION BY #temp.Name, AssetMakeID ORDER BY #temp.Name)   hinum -1 
  ELSe ROW_NUMBER() OVER (PARTITION BY #temp.Name, AssetMakeID ORDER BY #temp.Name) END RowNum
    FROM #temp LEFT JOIN CTE1 ON #temp.name = CTE1.name
)

UPDATE cte
SET Name = CONCAT(Name, ' (', RowNum, ')')
WHERE RowNum > 1 


SELECT * FROM #temp

name AssetMakeID
Duplicate 32208c09-c0c3-408c-ab60-273811722194
Duplicate (3) 32208c09-c0c3-408c-ab60-273811722194
Duplicate (2) 32208c09-c0c3-408c-ab60-273811722194
 

fiddle

CodePudding user response:

Well the easy way is to use a unique string in the update so there is no way your update can cause a duplicate. The current timestamp (with milliseconds) works well. Like this:

UPDATE cte
  SET Name = CONCAT(Name, ' (', RowNum, ') at ',convert(varchar(22),getdate(),126))
WHERE RowNum > 1

CodePudding user response:

This will cope with one level of duplication e.g. 'Duplicate (2)' but not two e.g. 'Duplicate (2) (2)'.

Essentially just apply the same logic again in a second cte. In fact you should be able to do this using a recursive CTE to get it to work for all levels.

That said you could use a more unique method of de-duplicating names e.g. just add a guid and it will be unique.

WITH cte1 AS (
    SELECT Name, Id
        , ROW_NUMBER() OVER (PARTITION BY Name, ID ORDER BY Name) RowNum
        -- You should already have one, but if not generate it
        , ROW_NUMBER() OVER (ORDER BY Name) UniqueId
    FROM #temp
), cte2 as (
    SELECT NewName Name, RowNum, UniqueId
        , ROW_NUMBER() OVER (PARTITION BY NewName, ID ORDER BY NewName) RowNum2
    FROM cte1
    CROSS APPLY (
        VALUES (CASE WHEN RowNum = 1 THEN Name ELSE CONCAT(Name, ' (', RowNum, ')') END)  
    ) n (NewName)
)
UPDATE c1 SET
    Name = CASE WHEN RowNum2 = 1 THEN c2.Name ELSE CONCAT(c2.Name, ' (', RowNum2, ')') END
FROM cte1 c1
INNER JOIN cte2 c2 on c2.UniqueId = c1.UniqueId
WHERE c1.RowNum > 1 or RowNum2 > 1;

CodePudding user response:

I am going to choose another answer as the correct answer since I personally prefer it, but I thought I'd post what I ended up doing myself.

DROP TABLE IF EXISTS #temp

CREATE TABLE #temp (name nvarchar(100), ID uniqueidentifier)

INSERT INTO #temp (Name, ID)
    VALUES ('Duplicate', '32208C09-C0C3-408C-AB60-273811722194')
INSERT INTO #temp (Name, ID)
    VALUES ('Duplicate', '32208C09-C0C3-408C-AB60-273811722194')
INSERT INTO #temp (Name, ID)
    VALUES ('Duplicate (2)', '32208C09-C0C3-408C-AB60-273811722194')

DECLARE @doWhileTrueFlag bit = 1
WHILE (@doWhileTrueFlag = 1)
BEGIN
    ;WITH cte AS (
        SELECT 
            Name, 
            ROW_NUMBER() OVER (PARTITION BY Name, ID ORDER BY Name) RowNum
        FROM #temp
    )

    UPDATE cte
        SET Name = CONCAT(Name, ' (', RowNum, ')')
        WHERE RowNum > 1

    SET @doWhileTrueFlag = CASE
        WHEN @@ROWCOUNT > 0 THEN 1
        ELSE 0
    END
END

SELECT * FROM #temp

DROP TABLE #temp

This performs the update I was already doing in a loop until no more updates are done. A rather inelegant solution, but the names created are prettier for the clients.

  • Related