Oracle数据库大批量删除无主键约束重复数据的方案

前言:本文针对原表数据量大的场景,对分批删除无主键重复数据的步骤进行梳理归纳。

1.需求详细说明

2.解决方案

2.1确定无重复数据的记录数

2.2整理要筛选的数据

2.2确定要删除的数据

2.3运用分批去提交删除重复数据

2.4确定无重复数据

1.需求详细说明

TEST表情况说明:

按月进行分区的分区表

未定义主键约束或唯一索引

包含COL1,COL2,COL3,INSERTTIME四列

现要删除2018年3月26日当天产生的重复数据

2.解决方案

2.1确定无重复数据的记录数

-- Count the distinct (COL1, COL2, COL3, INSERTTIME) combinations held in
-- partition P201903 for the time window, i.e. the row count expected once
-- every duplicate has been removed.
-- Every non-aggregated column in the select list must also appear in
-- GROUP BY; omitting INSERTTIME raises ORA-00979 (not a GROUP BY expression).
SELECT COUNT(1) FROM (
	SELECT COL1,COL2,COL3,INSERTTIME FROM TEST PARTITION(P201903) A 
		WHERE INSERTTIME >= DATE'2018-02-26' AND INSERTTIME < DATE'2019-04-01'
		GROUP BY COL1,COL2,COL3,INSERTTIME
);

2.2整理要筛选的数据

由于原表TEST数据量非常大,这里新建一张表TEST_TMP,将要处理的数据(连同其在原表中的ROWID)独立存放

-- Stage the rows to be examined in a standalone work table so the large
-- source table is scanned only once. Each row's original ROWID is kept in
-- ROWID_OLD so the matching row in TEST can be addressed directly later.
-- NOLOGGING is requested on the new table for the bulk load.
-- An optimizer hint must start with "/*+" (no space before the plus sign);
-- anything else is parsed as an ordinary comment and silently ignored.
CREATE TABLE TEST_TMP NOLOGGING AS
SELECT /*+ PARALLEL(A, 8) */ A.*,A.ROWID ROWID_OLD FROM TEST PARTITION(P201903) A 
	WHERE INSERTTIME >= DATE'2018-02-26' AND INSERTTIME < DATE'2019-04-01';

2.2确定要删除的数据

要删除和要保留的记录数之和应等于该分区在此时间范围内的总记录数;仅当每条记录都恰好重复一次(即每组两条)时,要删除和要保留的记录数才会相等

-- Number of rows that must be deleted.
-- Inside every group of identical (COL1, COL2, COL3, INSERTTIME) rows the
-- copy with the lowest original ROWID is ranked 1; every copy ranked above 1
-- is redundant. Ranking (instead of taking a single MIN per group) also
-- covers groups that contain more than two copies.
SELECT COUNT(1)
  FROM TEST PARTITION(P201903) A
 WHERE A.ROWID IN (
	SELECT ROWID_OLD
	  FROM (SELECT ROWID_OLD,
	               ROW_NUMBER() OVER (PARTITION BY COL1,COL2,COL3,INSERTTIME
	                                  ORDER BY ROWID_OLD) AS RN
	          FROM TEST_TMP
	         WHERE INSERTTIME >= DATE'2018-02-26' AND INSERTTIME < DATE'2019-04-01')
	 WHERE RN > 1)
   AND A.INSERTTIME >= DATE'2018-02-26' AND A.INSERTTIME < DATE'2019-04-01';

-- Number of rows that must be kept: every row of the partition in the time
-- window that is not a redundant copy.
-- A redundant copy is any row ranked above 1 within its group of identical
-- (COL1, COL2, COL3, INSERTTIME) rows, ordered by original ROWID; ranking
-- also covers groups that contain more than two copies.
-- NOT IN is safe here because ROWID_OLD is populated from ROWID and is
-- therefore never NULL.
SELECT COUNT(1)
  FROM TEST PARTITION(P201903) A
 WHERE A.ROWID NOT IN (
	SELECT ROWID_OLD
	  FROM (SELECT ROWID_OLD,
	               ROW_NUMBER() OVER (PARTITION BY COL1,COL2,COL3,INSERTTIME
	                                  ORDER BY ROWID_OLD) AS RN
	          FROM TEST_TMP
	         WHERE INSERTTIME >= DATE'2018-02-26' AND INSERTTIME < DATE'2019-04-01')
	 WHERE RN > 1)
   AND A.INSERTTIME >= DATE'2018-02-26' AND A.INSERTTIME < DATE'2019-04-01';

2.3运用分批去提交删除重复数据

-- Delete the redundant duplicate rows from TEST in batches, committing after
-- every batch so that undo usage stays bounded on a very large table.
-- The cursor reads the staged copy (TEST_TMP), so committing between fetches
-- does not disturb the open cursor's source data.
DECLARE
      -- Rows fetched, deleted and committed per batch; tune to the environment.
      BATCH_SIZE CONSTANT PLS_INTEGER := 10000;
      TYPE ROWID_LIST IS TABLE OF UROWID INDEX BY BINARY_INTEGER;
      ROWID_INFOS ROWID_LIST;
      -- Original ROWIDs of every redundant copy: within each group of identical
      -- (COL1, COL2, COL3, INSERTTIME) rows, all rows except the one with the
      -- lowest ROWID. Ranking also handles groups with more than two copies,
      -- which a single MIN(ROWID_OLD) per group would leave duplicated.
      CURSOR C_ROWIDS IS
          SELECT ROWID_OLD
            FROM (SELECT ROWID_OLD,
                         ROW_NUMBER() OVER (PARTITION BY COL1,COL2,COL3,INSERTTIME
                                            ORDER BY ROWID_OLD) AS RN
                    FROM TEST_TMP
                   WHERE INSERTTIME >= DATE'2018-02-26' AND INSERTTIME < DATE'2019-04-01')
           WHERE RN > 1;
  BEGIN
      OPEN C_ROWIDS;

      LOOP
       FETCH C_ROWIDS BULK COLLECT INTO ROWID_INFOS LIMIT BATCH_SIZE;

       -- The FORALL index is declared implicitly; no separate variable is needed.
       -- An empty batch (COUNT = 0) simply executes no DELETE.
       FORALL I IN 1..ROWID_INFOS.COUNT
        DELETE FROM TEST WHERE ROWID=ROWID_INFOS(I);

       COMMIT;
       -- A short batch means the cursor is exhausted.
       EXIT WHEN ROWID_INFOS.COUNT<BATCH_SIZE;
    END LOOP;
    CLOSE C_ROWIDS;
 EXCEPTION
    WHEN OTHERS THEN
        -- Release the cursor and discard the uncommitted batch, then re-raise
        -- so the caller still sees the original error.
        IF C_ROWIDS%ISOPEN THEN
            CLOSE C_ROWIDS;
        END IF;
        ROLLBACK;
        RAISE;
 END;
/

--这里LIMIT后的值为分批去提交的记录数,能够根据具体情况调整

FETCH C_ROWIDS BULK COLLECT INTO ROWID_INFOS LIMIT 10000;

FORALL I IN 1..ROWID_INFOS.COUNT

--如下的DELETE语句为分批去提交实际要执行的部分

DELETE FROM TEST WHERE ROWID=ROWID_INFOS(I);

COMMIT;

EXIT WHEN ROWID_INFOS.COUNT<10000;

END LOOP;

CLOSE C_ROWIDS;

END;

2.4确定无重复数据

-- Verification: list every (COL1, COL2, COL3, INSERTTIME) combination that
-- still occurs more than once in partition P201903 within the time window.
-- An empty result means no duplicates remain.
-- The upper bound of the time window is the one used by the other queries on
-- this table and partition.
SELECT * FROM (
    SELECT COL1, COL2, COL3, INSERTTIME
      FROM TEST PARTITION(P201903) A
     WHERE INSERTTIME >= DATE'2018-02-26' AND INSERTTIME < DATE'2019-04-01'
     GROUP BY COL1, COL2, COL3, INSERTTIME
    HAVING COUNT(1) > 1
);