Home > Blockchain >  Group rows based on value aggregation
Group rows based on value aggregation

Time:11-10

I am trying to write a query whose purpose is to group multiple sequential rows for future processing. The rules for such grouping are:

  • Each row has a segment identifier and a corresponding weight.
  • As many contiguous segments as possible must be grouped together, provided that their total weight does not exceed the specified threshold.
  • If a segment has weight exceeding the specified threshold, the resulting group will only represent that segment.

Here is an example:

SET NOCOUNT ON;

DROP TABLE IF EXISTS #tmpIncoming;

CREATE TABLE #tmpIncoming
(
  [Segment] int NOT NULL,
  [Weight]  int NOT NULL
);

INSERT INTO #tmpIncoming VALUES
( 1,  25),
( 2,  45),
( 3,  20),
( 4,  30),
( 5,  50),
( 6,  21),
( 7, 110);

DECLARE @nMaxChunkSize int = 100;

-- BEGIN: suboptimal
DROP TABLE IF EXISTS #tmpResult;

CREATE TABLE #tmpResult
(
    [MinSegment] int NOT NULL,
    [MaxSegment] int NOT NULL,
    [Weight]   int NOT NULL
);

DECLARE cur CURSOR LOCAL READ_ONLY FORWARD_ONLY STATIC FOR
SELECT
    [Segment],
    [Weight]
FROM
    #tmpIncoming
ORDER BY
    [Segment];

OPEN cur;

DECLARE @nMinSegment int = 0, @nMaxSegment int = 0;
DECLARE @nWeightSoFar int = 0;

WHILE (1=1)
BEGIN
    DECLARE @nSegment int, @nWeight int;
    FETCH NEXT FROM cur INTO @nSegment, @nWeight;

    IF (@@FETCH_STATUS <> 0)
        BREAK;

    IF (@nWeightSoFar   @nWeight > @nMaxChunkSize)
    BEGIN
        INSERT INTO #tmpResult ([MinSegment], [MaxSegment], [Weight])
        VALUES (@nMinSegment, @nMaxSegment, @nWeightSoFar);

        SET @nMinSegment = @nSegment;
        SET @nMaxSegment = @nSegment;
        SET @nWeightSoFar = @nWeight;
    END
    ELSE
    BEGIN
        IF (@nMinSegment = 0)
            SET @nMinSegment = @nSegment;
        SET @nMaxSegment = @nSegment;
        SET @nWeightSoFar = @nWeightSoFar   @nWeight;
    END;
END;

CLOSE cur;
DEALLOCATE cur;

IF (@nWeightSoFar > 0)
    INSERT INTO #tmpResult ([MinSegment], [MaxSegment], [Weight])
    VALUES (@nMinSegment, @nMaxSegment, @nWeightSoFar);

SELECT * FROM #tmpResult;

DROP TABLE IF EXISTS #tmpResult;
-- END: suboptimal

DROP TABLE IF EXISTS #tmpIncoming;

I was only able to think of a suboptimal implementation which uses a cursor variable. Can anyone recommend a better approach, preferably with only one SELECT and maybe some CTE?

CodePudding user response:

You can use recursion to cycle through the weights in order and, as soon as you exceed your chunk size, reset.

DECLARE @nMaxChunkSize int = 100;

;WITH x AS 
(
  SELECT Segment, 
         Weight, 
         rn = ROW_NUMBER() OVER (ORDER BY Segment)
    FROM #tmpIncoming
),
cte AS 
(
  SELECT Segment, Weight, rn, total = Weight, flip = 0 
  FROM x 
  WHERE rn = 1
  
  UNION ALL

  SELECT x.Segment, x.Weight, x.rn, total = CASE 
    WHEN x.Weight   cte.Total > @nMaxChunkSize 
    THEN x.Weight ELSE x.Weight   cte.Total END,
    flip = flip   CASE 
    WHEN x.Weight   cte.Total > @nMaxChunkSize 
    THEN 1 ELSE 0 END
  FROM x JOIN cte  
  ON x.rn = cte.rn   1
)
SELECT MinSegment = MIN(Segment), 
       MaxSegment = MAX(Segment),
       Weight     = MAX(total)
FROM cte
GROUP BY flip
ORDER BY MinSegment
OPTION (MAXRECURSION 0);

Results:

MinSegment MaxSegment Weight
1 3 90
4 5 80
6 6 21
7 7 110

Another way that yields the same results but may be a little easier to break apart / follow (though arguably the code quickly becomes as verbose as the original):

DECLARE @nMaxChunkSize int = 100;

;WITH x AS 
(
  SELECT Segment, 
         Weight, 
         rn = ROW_NUMBER() OVER (ORDER BY Segment)
    FROM #tmpIncoming
),
cte AS 
(
  SELECT Segment, Weight, rn, Total = Weight
  FROM x 
  WHERE rn = 1
  
  UNION ALL

  SELECT x.Segment, x.Weight, x.rn, Total = CASE 
    WHEN x.Weight   cte.Total > @nMaxChunkSize 
    THEN x.Weight ELSE x.Weight   cte.Total END
  FROM x JOIN cte ON x.rn = cte.rn   1
)
SELECT MinSegment = MIN(Segment), 
       MaxSegment = MAX(Segment),
       Weight     = MAX(Total)
FROM 
(
  SELECT Segment, Total, 
    NewGroup = SUM(CASE WHEN Weight = Total THEN 1 ELSE 0 END) 
    OVER (ORDER BY Segment ROWS UNBOUNDED PRECEDING) FROM cte
) AS y
GROUP BY NewGroup
ORDER BY MinSegment
OPTION (MAXRECURSION 0);

CodePudding user response:

Due to the calculations, I do not think this can be done with windowed functions. I would be inclined to use a .NET DataReader either on the application server or using the CLR. If you really want to use SQL you could try the Quirky Update:

https://www.sqlservercentral.com/articles/solving-the-running-total-and-ordinal-rank-problems-rewritten

but be aware it is non-relational and could be broken by a Microsoft patch.

DROP TABLE IF EXISTS #t;

CREATE TABLE #t
(
    Segment int NOT NULL PRIMARY KEY
    ,[Weight] int NOT NULL
    ,MinSegment int NULL
    ,WeightSoFar int NULL
);
INSERT INTO #t (Segment, [Weight])
SELECT Segment, [Weight]
FROM #tmpIncoming;

DECLARE @nMaxChunkSize int = 100
    ,@WeightSoFar int = 0
    ,@Break int = 0
    ,@MinSegment int = 1
    ,@Check int
    ,@Anchor int;

UPDATE #t
SET @Break =
        CASE
            WHEN [Weight]   @WeightSoFar > @nMaxChunkSize
            THEN 1
            ELSE 0
        END
    ,@WeightSoFar =
        CASE
            WHEN @Break = 1
            THEN [Weight]
            ELSE [Weight]   @WeightSoFar
        END
    ,@MinSegment =
        CASE
            WHEN @Break = 1
            THEN Segment
            ELSE @MinSegment
        END
    ,WeightSoFar = @WeightSoFar
    ,MinSegment = @MinSegment
    -- Double check running in segment order
    ,@check = CASE WHEN Segment > ISNULL(@Anchor, -1) THEN 1 ELSE 1/0 END
    ,@Anchor = Segment
FROM #t WITH (TABLOCKX)
OPTION (MAXDOP 1);

SELECT MinSegment
    ,MAX(Segment) AS MaxSegment
    ,MAX(WeightSoFar) AS [Weight]
FROM #t
GROUP BY MinSegment;

--DROP TABLE IF EXISTS #t;
  • Related