Home > Software design >  Select the top N exclusive values of a group in SQL?
Select the top N exclusive values of a group in SQL?

Time:12-23

Is there a way in SQL (preferably SQL Server) to select the top N records in a group that are exclusive of the other groups?

For example:

DROP TABLE IF EXISTS #DISTANCE

CREATE TABLE #DISTANCE
(
    GNAME VARCHAR(3)
,   CNAME VARCHAR(3)
,   DIST NUMERIC(5,3)
)

INSERT INTO #DISTANCE
VALUES ('E1', 'C1', 1), ('E1','C2',2),
       ('E2', 'C1', 1.5), ('E2','C2',2.5)

If I was looking for the first exclusive CNAME for each ENAME by distance ASC, I would expect an output like this:

Ename Cname Dist
E1 C1 1
E2 C2 2.5

Note that E1|C2 and E2|C1 are omitted because they would be the 2nd value in a rank result for the group.

I have come up with a few SQL methods to try and pull this correctly, but my work breaks down when I add additional groups on ENAME and if I change my Top N value.

If I add to the complexity:

TRUNCATE TABLE #DISTNACE

INSERT INTO #DISTANCE
VALUES ('E1', 'C1', 1), ('E1', 'C2', 2), 
       ('E1', 'C3', 3), ('E1', 'C4', 5),
       ('E2', 'C1', 2.5), ('E2', 'C2', 4), 
       ('E2', 'C3', 3.5), ('E2', 'C4', 6),
       ('E3', 'C4', 7), ('E3', 'C5', 6), 
       ('E3', 'C6', 4)

The SQL output I am trying to get to would look like this:

GNAME CNAME DIST
E1 C1 1.000
E1 C2 2.000
E1 C3 3.000
E2 C4 6.000
E3 C6 4.000
E3 C5 6.000

I am able to get it to work in this particular instance, with this code:

WITH X AS  
(
    SELECT * 
    --, RNK             = DENSE_RANK() OVER (ORDER BY DIST ASC) 
    , CNAME_RNK_BY_DIST = DENSE_RANK() OVER (PARTITION BY CNAME ORDER BY DIST ASC)
    , CNAME_RNK_BY_DIST = DENSE_RANK() OVER (PARTITION BY CNAME ORDER BY DIST ASC) 
    FROM #DISTANCE
    )
,MINDIST AS ( -- FIRST OCCURANCE OF CNAME VALUE
    SELECT 
        CNAME
    ,   MIN(DIST) MINDIST
    FROM X GROUP BY CNAME
)
            -- SELECT * , CALC = SUM(CNAME_RNK_BY_DIST / 4) OVER (PARTITION BY CNAME ORDER BY DIST ASC) FROM X order by CNAME, DIST
, X2 AS (

    SELECT *, CALC = SUM(FLOOR(CNAME_RNK_BY_DIST / 4)) OVER (PARTITION BY CNAME ORDER BY DIST ASC) FROM X
)
    --SELECT * FROM X2 order by CNAME, dist
, CALC AS (
    SELECT CNAME, MAXINC = MAX(CALC) FROM X2 GROUP BY CNAME
)
    --SELECT * FROM CALC
, FIRST_OCCURANCE_PAIRS AS (
    SELECT A.*
        ,OCCURANCE = RANK() OVER (PARTITION BY CNAME ORDER BY DIST)
    FROM X A
    JOIN MINDIST B ON A.CNAME = B.CNAME AND A.DIST = B.MINDIST
)
    --SELECT * FROM FIRST_OCCURANCE_PAIRS

,ISO AS 
(
    SELECT * fROM FIRST_OCCURANCE_PAIRS WHERE OCCURANCE > 3 
)
--select * from FIRST_OCCURANCE_PAIRS
--  SELECT * FROM ISO
, NEXT_OCCURANCE AS (
    SELECT A.*
    FROM  X AS A
    JOIN CALC ON A.CNAME = CALC.CNAME
    JOIN ISO B ON A.CNAME_RNK_BY_DIST  =  CALC.MAXINC               and A.CNAME = B.CNAME
)
    --select * from NEXT_OCCURANCE
, FRAME AS (

SELECT 
    CNAME
,   CNAME
,   DIST

FROM FIRST_OCCURANCE_PAIRS
--WHERE OCCURANCE <=3
UNION
SELECT
    CNAME
,   CNAME
,   DIST
FROM NEXT_OCCURANCE
)
--select * from FRAME
, FINAL AS (
    SELECT * ,FINALRNK = ROW_NUMBER() OVER (PARTITION BY CNAME ORDER BY DIST)
    FROM
    FRAME
)
    SELECT * FROM FINAL WHERE FINALRNK <4

but the logic fails as more records are added. Is there a way to clean up this SQL and achieve the results for any number of combinations?

CodePudding user response:

It seems you want the row with the least dist per cname. That is simply:

select gname, cname, dist
from
(
  select
    gname, cname, dist, rank() over (partition by cname order by dist) as rnk
  from mytable
)as r
where rnk = 1;

I am using RANK here in order to deal with ties. So, when there are two rows with the same minimum dist for a cname, you'll get those two rows for one cname. If you want to allow only one row per cname, you must use ROW_NUMBER instead of RANK, but then you must also decide which of the tying gname to display.

CodePudding user response:

After looking at my problem some more, I don't think I provided enough information to my question to be useful. After some adjustments, I think I am able to achieve what I want. Instead of being able to do this in a single query, I think I have to run a batch execution step to pass over my data to pull what I need. Perhaps there is a way for me to reduce/optimize this code below?

DROP TABLE IF EXISTS #DISTANCE
CREATE TABLE #DISTANCE
(
    GNAME       VARCHAR(3)
,   CNAME       VARCHAR(3)
,   DIST        NUMERIC(5,3)
)

DROP TABLE IF EXISTS #RESULT
CREATE TABLE #RESULT
(
    GNAME       VARCHAR(3)
,   CNAME       VARCHAR(3)
,   DIST        NUMERIC(5,3)
--, FINRNK      INT
)
--INSERT INTO #DISTANCE
--VALUES 
--  ('E1', 'C1', 1  )       
--, ('E1', 'C2', 2  )
--, ('E1', 'C3', 3  )
--, ('E1', 'C4', 5  )
--, ('E2', 'C1', 2.5)   
--, ('E2', 'C2', 4  )
--, ('E2', 'C3', 3.5)
--, ('E2', 'C4', 6  )
--, ('E3', 'C4', 7  )       
--, ('E3', 'C5', 6  )
--, ('E3', 'C6', 4  )
---- CONCLUSIONG; SCENARIO WORKS, VALUES SHIFT.


--INSERT INTO #DISTANCE
--VALUES 
--  ('E1', 'C1', 1  )       
--, ('E1', 'C2', 2  )
--, ('E1', 'C3', 3  )
--, ('E1', 'C4', 5  )
--, ('E2', 'C7', 2.5)   
--, ('E2', 'C8', 4  )
--, ('E2', 'C9', 3.5)
--, ('E2', 'C4', 6  )
--, ('E3', 'C4', 7  )       
--, ('E3', 'C5', 6  )
--, ('E3', 'C6', 4  )
----,   ('E3', 'C3', 1  )
-- CONCLUSIONG; SCENARIO WORKS, VALUES SHIF

INSERT INTO #DISTANCE
VALUES 
    ('E1', 'C1', 1  )       
,   ('E1', 'C2', 2  )
,   ('E1', 'C3', 3  )
,   ('E1', 'C4', 4  ) -- switch with E2 dist to switch groups in results
,   ('E2', 'C1', 2.5)   
,   ('E2', 'C2', 4  )
,   ('E2', 'C3', 3.5)
,   ('E2', 'C4', 6  ) -- switch dist with e1 to move groups in results
,   ('E3', 'C4', 7  )       
,   ('E3', 'C5', 6  )
,   ('E3', 'C6', 4  )
,   ('E3', 'C3', 1  )

--INSERT INTO #DISTANCE
--VALUES 
--  ('E1', 'C1', 1  )       
--, ('E1', 'C2', 2  )
--, ('E1', 'C3', 3  )
--, ('E1', 'C4', 5  )
--, ('E2', 'C7', 2.5)   -- CHANGE CVAL TO UNIQUE TO PULL THROUGH    
--, ('E2', 'C8', 4  )   -- CHANGE CVAL TO UNIQUE TO PULL THROUGH
--, ('E2', 'C11', 3.5)  -- CHANGE CVAL TO UNIQUE TO PULL THROUGH
--, ('E2', 'C4', 6  )
--, ('E3', 'C9', 3  ) -- NEW
--, ('E3', 'C4', 7  )       
--, ('E3', 'C5', 6  )
--, ('E3', 'C6', 4  )
-- SCENARIO CORRECT. C4 SHOULD NOT BE PULLED IN.
GO

BEGIN
    ;
    WITH 
     NOROOM AS (
        SELECT GNAME FROM #RESULT GROUP BY GNAME HAVING COUNT(1) >= 3 -- HAS TO BE NOROOM BECAUSE 'HASROOM' WOULD not make sense in first pass.
    )
    ,X AS (
        SELECT 
            A.CNAME
        ,   A.DIST
        ,   A.GNAME
        ,   RNK = RANK()        OVER (PARTITION BY A.GNAME ORDER BY A.DIST ASC)
        ,   RNK2 = ROW_NUMBER() OVER (ORDER BY A.DIST ASC)
        ,   #RESULT.CNAME AS RSLT
        FROM #DISTANCE A
        LEFT JOIN #RESULT ON A.CNAME = #RESULT.CNAME
        LEFT JOIN NOROOM ON A.GNAME = NOROOM.GNAME
        WHERE NOROOM.GNAME IS NULL AND #RESULT.CNAME IS NULL --ORDER BY RNK
        )
        --SELECT * FROM X
    , Y AS (
        SELECT X.CNAME, MINRNK2 = MIN(X.RNK2) 
        FROM X 
        --LEFT JOIN #RESULT B ON X.CNAME = B.CNAME
        GROUP BY  X.CNAME
    )
    --  select * from y
    --  SELECT * FROM x JOIN Y ON X.CNAME = Y.CNAME AND X.RNK2 = Y.MINRNK2
    INSERT INTO #RESULT
    SELECT X.GNAME, X.CNAME, X.DIST FROM X 
     JOIN Y AS SQ ON X.CNAME = SQ.CNAME AND X.RNK2 = SQ.MINRNK2
    WHERE X.RNK <=3 --AND   #RESULT.GNAME IS NULL

     PRINT 'LOOP'
END
GO 10
     SELECT * FROM #RESULT ORDER BY GNAME
  • Related