I am trying to write a query whose purpose is to group multiple sequential rows for future processing. The rules for such grouping are:
- Each row has a segment identifier and a corresponding weight.
- As many contiguous segments as possible must be grouped together, provided that their total weight does not exceed the specified threshold.
- If a segment has weight exceeding the specified threshold, the resulting group will only represent that segment.
Here is an example:
SET NOCOUNT ON;
DROP TABLE IF EXISTS #tmpIncoming;
CREATE TABLE #tmpIncoming
(
[Segment] int NOT NULL,
[Weight] int NOT NULL
);
INSERT INTO #tmpIncoming VALUES
( 1, 25),
( 2, 45),
( 3, 20),
( 4, 30),
( 5, 50),
( 6, 21),
( 7, 110);
DECLARE @nMaxChunkSize int = 100;
-- BEGIN: suboptimal
DROP TABLE IF EXISTS #tmpResult;
CREATE TABLE #tmpResult
(
[MinSegment] int NOT NULL,
[MaxSegment] int NOT NULL,
[Weight] int NOT NULL
);
DECLARE cur CURSOR LOCAL READ_ONLY FORWARD_ONLY STATIC FOR
SELECT
[Segment],
[Weight]
FROM
#tmpIncoming
ORDER BY
[Segment];
OPEN cur;
DECLARE @nMinSegment int = 0, @nMaxSegment int = 0;
DECLARE @nWeightSoFar int = 0;
WHILE (1=1)
BEGIN
DECLARE @nSegment int, @nWeight int;
FETCH NEXT FROM cur INTO @nSegment, @nWeight;
IF (@@FETCH_STATUS <> 0)
BREAK;
IF (@nWeightSoFar @nWeight > @nMaxChunkSize)
BEGIN
INSERT INTO #tmpResult ([MinSegment], [MaxSegment], [Weight])
VALUES (@nMinSegment, @nMaxSegment, @nWeightSoFar);
SET @nMinSegment = @nSegment;
SET @nMaxSegment = @nSegment;
SET @nWeightSoFar = @nWeight;
END
ELSE
BEGIN
IF (@nMinSegment = 0)
SET @nMinSegment = @nSegment;
SET @nMaxSegment = @nSegment;
SET @nWeightSoFar = @nWeightSoFar @nWeight;
END;
END;
CLOSE cur;
DEALLOCATE cur;
IF (@nWeightSoFar > 0)
INSERT INTO #tmpResult ([MinSegment], [MaxSegment], [Weight])
VALUES (@nMinSegment, @nMaxSegment, @nWeightSoFar);
SELECT * FROM #tmpResult;
DROP TABLE IF EXISTS #tmpResult;
-- END: suboptimal
DROP TABLE IF EXISTS #tmpIncoming;
I was only able to think of a suboptimal implementation which uses a cursor variable. Can anyone recommend a better approach, preferably with only one SELECT and maybe some CTE?
CodePudding user response:
You can use recursion to cycle through the weights in order and, as soon as you exceed your chunk size, reset.
DECLARE @nMaxChunkSize int = 100;
;WITH x AS
(
SELECT Segment,
Weight,
rn = ROW_NUMBER() OVER (ORDER BY Segment)
FROM #tmpIncoming
),
cte AS
(
SELECT Segment, Weight, rn, total = Weight, flip = 0
FROM x
WHERE rn = 1
UNION ALL
SELECT x.Segment, x.Weight, x.rn, total = CASE
WHEN x.Weight cte.Total > @nMaxChunkSize
THEN x.Weight ELSE x.Weight cte.Total END,
flip = flip CASE
WHEN x.Weight cte.Total > @nMaxChunkSize
THEN 1 ELSE 0 END
FROM x JOIN cte
ON x.rn = cte.rn 1
)
SELECT MinSegment = MIN(Segment),
MaxSegment = MAX(Segment),
Weight = MAX(total)
FROM cte
GROUP BY flip
ORDER BY MinSegment
OPTION (MAXRECURSION 0);
Results:
MinSegment | MaxSegment | Weight |
---|---|---|
1 | 3 | 90 |
4 | 5 | 80 |
6 | 6 | 21 |
7 | 7 | 110 |
- Example db<>fiddle
Another way that yields the same results but may be a little easier to break apart / follow (though arguably the code quickly becomes as verbose as the original):
DECLARE @nMaxChunkSize int = 100;
;WITH x AS
(
SELECT Segment,
Weight,
rn = ROW_NUMBER() OVER (ORDER BY Segment)
FROM #tmpIncoming
),
cte AS
(
SELECT Segment, Weight, rn, Total = Weight
FROM x
WHERE rn = 1
UNION ALL
SELECT x.Segment, x.Weight, x.rn, Total = CASE
WHEN x.Weight cte.Total > @nMaxChunkSize
THEN x.Weight ELSE x.Weight cte.Total END
FROM x JOIN cte ON x.rn = cte.rn 1
)
SELECT MinSegment = MIN(Segment),
MaxSegment = MAX(Segment),
Weight = MAX(Total)
FROM
(
SELECT Segment, Total,
NewGroup = SUM(CASE WHEN Weight = Total THEN 1 ELSE 0 END)
OVER (ORDER BY Segment ROWS UNBOUNDED PRECEDING) FROM cte
) AS y
GROUP BY NewGroup
ORDER BY MinSegment
OPTION (MAXRECURSION 0);
- That fiddle is here: db<>fiddle
CodePudding user response:
Due to the calculations, I do not think this can be done with windowed functions. I would be inclined to use a .NET DataReader either on the application server or using the CLR. If you really want to use SQL you could try the Quirky Update:
but be aware it is non-relational and could be broken by a Microsoft patch.
DROP TABLE IF EXISTS #t;
CREATE TABLE #t
(
Segment int NOT NULL PRIMARY KEY
,[Weight] int NOT NULL
,MinSegment int NULL
,WeightSoFar int NULL
);
INSERT INTO #t (Segment, [Weight])
SELECT Segment, [Weight]
FROM #tmpIncoming;
DECLARE @nMaxChunkSize int = 100
,@WeightSoFar int = 0
,@Break int = 0
,@MinSegment int = 1
,@Check int
,@Anchor int;
UPDATE #t
SET @Break =
CASE
WHEN [Weight] @WeightSoFar > @nMaxChunkSize
THEN 1
ELSE 0
END
,@WeightSoFar =
CASE
WHEN @Break = 1
THEN [Weight]
ELSE [Weight] @WeightSoFar
END
,@MinSegment =
CASE
WHEN @Break = 1
THEN Segment
ELSE @MinSegment
END
,WeightSoFar = @WeightSoFar
,MinSegment = @MinSegment
-- Double check running in segment order
,@check = CASE WHEN Segment > ISNULL(@Anchor, -1) THEN 1 ELSE 1/0 END
,@Anchor = Segment
FROM #t WITH (TABLOCKX)
OPTION (MAXDOP 1);
SELECT MinSegment
,MAX(Segment) AS MaxSegment
,MAX(WeightSoFar) AS [Weight]
FROM #t
GROUP BY MinSegment;
--DROP TABLE IF EXISTS #t;