I've been trying to find a solution to this problem for a day now.
So, I have a table (raffle_tickets
), from which I want to pick N distinct users, with their probability of being picked based on the sum of the number of tickets they bought, as the winners of a raffle and insert the winners into raffle_winners
.
Now, I've found a solution on SO to pick 1 winner, but not N (And also it has a slight issue, where if there's, let's say, exactly 1 entry it is totally random whenever it is picked or not, which is not acceptable, obviously).
In that same answer (and others of other questions) I saw cross join
being used with generate_series
, but from what it looks like it would pick with replacement (e.g. with duplicates, not distinct), and that's not what I want.
I'm using Postgres/PSQL 14.5.
Here's some of the table structure:
/* Table with raffle tickets. Each user might have multiple entries in it for the same raffle */
CREATE TABLE IF NOT EXISTS raffle_tickets (
id SERIAL PRIMARY KEY,
raffle_id BIGINT REFERENCES raffles(id),
user_id BIGINT NOT NULL,
num_tickets INT NOT NULL,
date TIMESTAMP NOT NULL DEFAULT NOW()
);
/* Winners of raffles. Selected based on distinct users and weights from `raffle_tickets` */
CREATE TABLE IF NOT EXISTS raffle_winners (
id SERIAL PRIMARY KEY,
raffle_id BIGINT REFERENCES raffles(id),
user_id BIGINT NOT NULL,
probability FLOAT NOT NULL
CONSTRAINT user_winner_once_per_raffle UNIQUE(raffle_id, user_id) /* One user might not be picked more than once as a winner of a raffle */
);
/* Simplified table, in reality it has more fields */
CREATE TABLE IF NOT EXISTS raffles (
id SERIAL PRIMARY KEY,
num_max_winners INT NOT NULL
);
The code I wrote (below) is based on this answer if anyone is interested.
WITH users_and_weights AS (
SELECT
DISTINCT(user_id),
SUM(num_tickets) AS weight
FROM
raffle_tickets
WHERE
raffle_id=$1
GROUP BY
user_id
), p AS ( /* probability */
SELECT
*,
(weight / SUM(weight) OVER ()) AS probability
FROM
users_and_weights
), cp AS ( /* cumulative probability */
SELECT
*,
SUM(p.probability) OVER (
ORDER BY probability DESC
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS cum_probability
FROM
p
), fp AS ( /* final probability */
SELECT
*,
cum_probability - probability AS start_probability,
cum_probability AS end_probability
FROM
cp
)
INSERT INTO
raffle_winners (user_id, raffle_id, probability)
SELECT
user_id,
$1 AS raffle_id,
probability
FROM
fp
WHERE
random() BETWEEN start_probability AND end_probability
LIMIT
(SELECT num_max_winners FROM raffle_data)
CodePudding user response:
You are making this more complicated than necessary.
This is simplified for a single raffle:
with gen_tickets as (
-- Use `generate_series()` to create a row for each ticket
select user_id
from raffle_tickets
cross join lateral generate_series(1, num_tickets)
), shuffle as (
select user_id, row_number() over (order by random()) as rn
from gen_tickets
), min_row as (
-- Limit to one win per user
select user_id, min(rn)
from shuffle
group by user_id
), winner_order as (
select user_id, row_number() over (order by rn) as rn
from min_row
)
select *
from winner_order
where rn <= <num_max_winners>
CodePudding user response:
To those interested, this here's what I ended up using.
(It's not perfect, but I already spent way too much time on it, so if anyone feels like fixing it up, please do.)
(As a side-note, is there a good way to pass query parameters to a function block like this? I'm using asyncpg
(python))
DO
$do$
DECLARE
num_uniq_participants INTEGER;
num_max_winners_to_select INTEGER;
BEGIN
num_max_winners_to_select := (
SELECT
num_max_winners
FROM
raffles
WHERE
id={raffle_id}
);
num_uniq_participants := (
SELECT
COUNT(*)
FROM (
SELECT
DISTINCT(user_id)
FROM
raffle_tickets
WHERE
raffle_id={raffle_id}
) AS q
);
IF (num_max_winners_to_select >= num_uniq_participants) THEN
/* There are less participants than the required amount of winners, so everyone is a winner */
INSERT INTO
raffle_winners(user_id, raffle_id, probability)
SELECT
DISTINCT(user_id),
$1 AS raffle_id,
1 AS probability
FROM
raffle_tickets
WHERE
raffle_id={raffle_id};
ELSE
/**
* Pick winners.
* Each iteration the winners are excluded from the
* newly pickable participant list.
**/
/**
* TODO:
* Right now this isn't super efficient, as we always re-calculate
* the weight of each participant in each iteartion.
* For now it's okay, but something to keep in mind in the future.
* (Though, unless there's hunderds of thousands of participants it shouldn't be too bad)
**/
FOR i IN 1..LEAST(num_max_winners_to_select, num_uniq_participants) LOOP
WITH users_and_weights AS (
SELECT
DISTINCT(user_id),
SUM(num_tickets) AS weight
FROM
raffle_tickets rt
WHERE
NOT EXISTS ( /* Don't re-pick winners */
SELECT
1
FROM
raffle_winners rw
WHERE
rw.user_id=rt.user_id AND rw.raffle_id=rt.raffle_id
) AND raffle_id={raffle_id}
GROUP BY
user_id
), p AS ( /* probability */
SELECT
*,
(weight / SUM(weight) OVER ()) AS probability
FROM
users_and_weights
), cp AS ( /* cumulative probability */
SELECT
*,
SUM(p.probability) OVER (
ORDER BY probability DESC
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS cum_probability
FROM
p
), fp AS ( /* final probability */
SELECT
*,
cum_probability - probability AS start_probability,
cum_probability AS end_probability
FROM
cp
), const_rnd AS (
/* Must put this into a CTE otherwise it's re-evaluated for
* each row and might cause no entry to be selected at all.
**/
SELECT RANDOM() AS RND
)
INSERT INTO
raffle_winners(user_id, raffle_id, probability)
SELECT
user_id,
$1 AS raffle_id,
probability
FROM
cp
WHERE
(SELECT rnd FROM const_rnd) BETWEEN cum_probability - probability AND cum_probability
LIMIT
1; /* Pick 1 winner / interation */
END LOOP;
END IF;
END
$do$;