I have a table with many millions of rows, and I want to delete the duplicate rows. I don't want to create a second table, and I don't have enough memory to do it all in one go. I tried the code below, but I'm not sure whether it works correctly:
-- NOTE(review): this is pseudocode, not executable SQL. The Python-style
-- 'for' loop cannot appear inside a SQL script, and the loop variable 'i'
-- would not be visible to the OFFSET clause.
BEGIN;
for i in range(0, 80000000, 10000):
-- Intended per-batch statement: for each group of duplicate rows, delete
-- every row except the one with the smallest id.
DELETE FROM table_name t
USING (SELECT MIN(id) as min_id, column1, column2, ...
FROM table_name
GROUP BY column1, column2, ...
HAVING COUNT(*) > 1
-- NOTE(review): LIMIT/OFFSET without ORDER BY returns arbitrary groups, and
-- the OFFSET is unnecessary anyway -- each batch's deletions shrink the
-- duplicate set, so re-running with a plain LIMIT until 0 rows are affected
-- would cover everything.
LIMIT 10000
OFFSET i) as dups
WHERE t.column1 = dups.column1 AND t.column2 = dups.column2 AND ...
AND t.id > dups.min_id;
-- NOTE(review): a single COMMIT at the end makes all batches ONE transaction,
-- which defeats the memory/bloat purpose of batching in the first place.
COMMIT;
Answer:
You can try the approach below:
-- Deletes duplicate rows from test_delete in batches, keeping the row with the
-- lowest pk within each (bk, col1, col2, col3, col4) group.
-- Returns: total number of rows deleted.
-- NOTE(review): a plpgsql FUNCTION always runs inside a single transaction, so
-- batching here bounds per-statement work but NOT transaction size. For true
-- intermediate commits, convert this to a PROCEDURE and COMMIT inside the loop
-- (PostgreSQL 11+).
CREATE OR REPLACE FUNCTION remove_dupes()
RETURNS int language plpgsql AS
$$
declare
    delete_limit  int;  -- duplicate groups targeted per batch; tune after testing
    total_deleted int;  -- running total returned to the caller
    batch_deleted int;  -- rows removed by the most recent DELETE
begin
    delete_limit  := 1; -- make it 10,000 or more (fixed: '//' is not a SQL comment marker)
    total_deleted := 0;
    batch_deleted := 0;
    LOOP
        -- Each pass removes all rows except the min-pk row for up to
        -- delete_limit duplicate groups.
        -- NOTE(review): LIMIT without ORDER BY picks arbitrary groups each
        -- pass; acceptable here only because the loop repeats until no
        -- duplicate groups remain.
        delete from test_delete td
        using (select bk, col1, col2, col3, col4, min(pk) minpk
               from test_delete
               group by bk, col1, col2, col3, col4
               having count(*) > 1
               limit delete_limit) dups
        where td.bk = dups.bk
          and td.col1 = dups.col1
          and td.col2 = dups.col2
          and td.col3 = dups.col3
          and td.col4 = dups.col4
          and td.pk != dups.minpk;
        GET DIAGNOSTICS batch_deleted = ROW_COUNT;
        -- fixed: original read 'total_deleted = total_deleted count;' -- the
        -- '+' operator was missing, which is a syntax error.
        total_deleted := total_deleted + batch_deleted;
        raise notice 'deleted this loop: % ; total: %', batch_deleted, total_deleted;
        exit when batch_deleted = 0;
    end loop;
    return total_deleted;
end;
$$;
Invocation (assuming autocommit is enabled):
-- Run the cleanup; the single returned value is the total rows deleted.
select * from remove_dupes();