I have a table with many millions of rows, and I want to delete the duplicate rows. I don't want to create a second table, and I don't have enough memory to do it all in one go. I tried the code below, but I'm not sure whether it works correctly:
-- NOTE(review): this is pseudocode, not executable SQL. The Python-style
-- 'for' loop cannot appear inside a SQL script, and the loop variable 'i'
-- would not be visible to the OFFSET clause.
BEGIN;
for i in range(0, 80000000, 10000):
-- Intended per-batch statement: for each group of duplicate rows, delete
-- every row except the one with the smallest id.
DELETE FROM table_name t
USING (SELECT MIN(id) as min_id, column1, column2, ...
FROM table_name
GROUP BY column1, column2, ...
HAVING COUNT(*) > 1
-- NOTE(review): LIMIT/OFFSET without ORDER BY returns arbitrary groups, and
-- the OFFSET is unnecessary anyway -- each batch's deletions shrink the
-- duplicate set, so re-running with a plain LIMIT until 0 rows are affected
-- would cover everything.
LIMIT 10000
OFFSET i) as dups
WHERE t.column1 = dups.column1 AND t.column2 = dups.column2 AND ...
AND t.id > dups.min_id;
-- NOTE(review): a single COMMIT at the end makes all batches ONE transaction,
-- which defeats the memory/bloat purpose of batching in the first place.
COMMIT;
Answer:
You can try the approach below:
-- Deletes duplicate rows from test_delete in batches, keeping the row with the
-- lowest pk within each (bk, col1, col2, col3, col4) group.
-- Returns: total number of rows deleted.
-- NOTE(review): a plpgsql FUNCTION always runs inside a single transaction, so
-- batching here bounds per-statement work but NOT transaction size. For true
-- intermediate commits, convert this to a PROCEDURE and COMMIT inside the loop
-- (PostgreSQL 11+).
CREATE OR REPLACE FUNCTION remove_dupes()
RETURNS int language plpgsql AS
$$
declare
    delete_limit  int;  -- duplicate groups targeted per batch; tune after testing
    total_deleted int;  -- running total returned to the caller
    batch_deleted int;  -- rows removed by the most recent DELETE
begin
    delete_limit  := 1; -- make it 10,000 or more (fixed: '//' is not a SQL comment marker)
    total_deleted := 0;
    batch_deleted := 0;
    LOOP
        -- Each pass removes all rows except the min-pk row for up to
        -- delete_limit duplicate groups.
        -- NOTE(review): LIMIT without ORDER BY picks arbitrary groups each
        -- pass; acceptable here only because the loop repeats until no
        -- duplicate groups remain.
        delete from test_delete td
        using (select bk, col1, col2, col3, col4, min(pk) minpk
               from test_delete
               group by bk, col1, col2, col3, col4
               having count(*) > 1
               limit delete_limit) dups
        where td.bk = dups.bk
          and td.col1 = dups.col1
          and td.col2 = dups.col2
          and td.col3 = dups.col3
          and td.col4 = dups.col4
          and td.pk != dups.minpk;
        GET DIAGNOSTICS batch_deleted = ROW_COUNT;
        -- fixed: original read 'total_deleted = total_deleted count;' -- the
        -- '+' operator was missing, which is a syntax error.
        total_deleted := total_deleted + batch_deleted;
        raise notice 'deleted this loop: % ; total: %', batch_deleted, total_deleted;
        exit when batch_deleted = 0;
    end loop;
    return total_deleted;
end;
$$;
Invocation (assuming autocommit is enabled):
-- Run the cleanup; the single returned value is the total rows deleted.
select * from remove_dupes();