Randomly pick N distinct winners with weights for a raffle-CodePudding

I've been trying to find a solution to this problem for a day now.

So, I have a table (raffle_tickets), from which I want to pick N distinct users, with their probability of being picked based on the sum of the number of tickets they bought, as the winners of a raffle and insert the winners into raffle_winners.

Now, I've found a solution on SO to pick 1 winner, but not N (And also it has a slight issue, where if there's, let's say, exactly 1 entry it is totally random whenever it is picked or not, which is not acceptable, obviously).

In that same answer (and others of other questions) I saw cross join being used with generate_series, but from what it looks like it would pick with replacement (e.g. with duplicates, not distinct), and that's not what I want.

I'm using Postgres/PSQL 14.5.

Here's some of the table structure:

/* Table with raffle tickets. Each  user might have multiple entries in it for the same raffle */
CREATE TABLE IF NOT EXISTS raffle_tickets (
    id                      SERIAL                  PRIMARY KEY,
    raffle_id               BIGINT                  REFERENCES raffles(id),
    user_id                 BIGINT                  NOT NULL,
    num_tickets             INT                     NOT NULL,
    date                    TIMESTAMP               NOT NULL DEFAULT NOW()
);

/* Winners of  raffles. Selected based on distinct users and weights from `raffle_tickets` */
CREATE TABLE IF NOT EXISTS raffle_winners (
    id                      SERIAL                  PRIMARY KEY,
    raffle_id               BIGINT                  REFERENCES raffles(id),
    user_id                 BIGINT                  NOT NULL,
    probability             FLOAT                   NOT NULL

    CONSTRAINT user_winner_once_per_raffle UNIQUE(raffle_id, user_id) /*  One user might not be picked more than once as a winner of a raffle */
);

/* Simplified table, in reality it has more fields  */
CREATE TABLE IF NOT EXISTS raffles (
    id              SERIAL PRIMARY KEY,
    num_max_winners INT    NOT NULL
);

The code I wrote (below) is based on this answer if anyone is interested.

WITH users_and_weights AS (
    SELECT 
        DISTINCT(user_id), 
        SUM(num_tickets) AS weight
    FROM 
        raffle_tickets
    WHERE
        raffle_id=$1
    GROUP BY
        user_id
), p AS ( /* probability */
    SELECT 
        *, 
        (weight / SUM(weight) OVER ()) AS probability
    FROM
        users_and_weights
), cp AS ( /* cumulative probability */
    SELECT 
        *, 
        SUM(p.probability) OVER (
            ORDER BY probability DESC
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS cum_probability
    FROM
        p
), fp AS ( /* final probability */
    SELECT
        *,
        cum_probability - probability AS start_probability,
        cum_probability AS end_probability
    FROM 
        cp
)
INSERT INTO
    raffle_winners (user_id, raffle_id, probability)
SELECT 
    user_id,
    $1 AS raffle_id,
    probability
FROM
    fp
WHERE
    random() BETWEEN start_probability AND end_probability
LIMIT 
    (SELECT num_max_winners FROM raffle_data)

CodePudding user response：

You are making this more complicated than necessary.

This is simplified for a single raffle:

with gen_tickets as (
  -- Use `generate_series()` to create a row for each ticket
  select user_id
    from raffle_tickets
         cross join lateral generate_series(1, num_tickets)
), shuffle as (
  select user_id, row_number() over (order by random()) as rn
    from gen_tickets
), min_row as (
  -- Limit to one win per user
  select user_id, min(rn) 
    from shuffle
   group by user_id
), winner_order as (
  select user_id, row_number() over (order by rn) as rn
    from min_row
)
select *
  from winner_order
 where rn <= <num_max_winners>

CodePudding user response：

To those interested, this here's what I ended up using.

(It's not perfect, but I already spent way too much time on it, so if anyone feels like fixing it up, please do.)

(As a side-note, is there a good way to pass query parameters to a function block like this? I'm using asyncpg (python))

DO
$do$
DECLARE
    num_uniq_participants INTEGER;
    num_max_winners_to_select INTEGER;
BEGIN 
    num_max_winners_to_select := (
        SELECT 
            num_max_winners
        FROM
            raffles
        WHERE
            id={raffle_id}
    );

    num_uniq_participants := (
        SELECT 
            COUNT(*) 
        FROM (
            SELECT 
                DISTINCT(user_id) 
            FROM 
                raffle_tickets
            WHERE
                raffle_id={raffle_id}
        ) AS q
    );
    
    IF (num_max_winners_to_select >= num_uniq_participants) THEN
        /* There are less participants than the required amount of winners, so everyone is a winner */
        INSERT INTO
            raffle_winners(user_id, raffle_id, probability)
        SELECT 
            DISTINCT(user_id), 
            $1 AS raffle_id,
            1 AS probability
        FROM
            raffle_tickets
        WHERE
            raffle_id={raffle_id};
    ELSE
        /** 
        * Pick winners.
        * Each iteration the winners are excluded from the 
        * newly pickable participant list.
        **/

        /**
        * TODO: 
        * Right now this isn't super efficient, as we always re-calculate
        * the weight of each participant in each iteartion.
        * For now it's okay, but something to keep in mind in the future.
        * (Though, unless there's hunderds of thousands of participants it shouldn't be too bad)
        **/

        FOR i IN 1..LEAST(num_max_winners_to_select, num_uniq_participants) LOOP
            WITH users_and_weights AS (
                SELECT 
                    DISTINCT(user_id), 
                    SUM(num_tickets) AS weight
                FROM 
                    raffle_tickets rt
                WHERE 
                    NOT EXISTS ( /* Don't re-pick winners */
                        SELECT
                            1
                        FROM
                            raffle_winners rw
                        WHERE
                            rw.user_id=rt.user_id AND rw.raffle_id=rt.raffle_id
                    ) AND raffle_id={raffle_id}
            GROUP BY
                    user_id
            ), p AS ( /* probability */
                SELECT 
                    *, 
                    (weight / SUM(weight) OVER ()) AS probability
                FROM
                    users_and_weights
            ), cp AS ( /* cumulative probability */
                SELECT 
                    *, 
                    SUM(p.probability) OVER (
                        ORDER BY probability DESC
                        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
                    ) AS cum_probability
                FROM
                    p
            ), fp AS ( /* final probability */
                SELECT
                    *,
                    cum_probability - probability AS start_probability,
                    cum_probability AS end_probability
                FROM 
                    cp
            ), const_rnd AS (
                /* Must put this into a CTE otherwise it's re-evaluated for 
                * each row and might cause no entry to be selected at all.
                **/
                SELECT RANDOM() AS RND 
            )

            INSERT INTO
                raffle_winners(user_id, raffle_id, probability)
            SELECT 
                user_id, 
                $1 AS raffle_id,
                probability
            FROM
                cp
            WHERE
                (SELECT rnd FROM const_rnd) BETWEEN cum_probability - probability AND cum_probability
            LIMIT
                1; /* Pick 1 winner / interation */
    END LOOP;
END IF;
END
$do$;