How to group similar rows in SQL Server

I have a table like this:

Date        ConfigID    ItemID    ClientName    Metric1    Metric2
====        ========    ======    ==========    =======    =======
2017-01-01  1           1         A             2.0        2.0
2017-01-01  3           1         A             2.0        2.0
2017-01-01  4           2         B             5.0        5.0
2017-01-02  4           3         A             6.0        6.0
2017-01-01  2           1         A             2.0        2.0
....
(20 million rows here)

I wanted to find duplicates based on Date, ItemID, ClientName, Metric1 and Metric2, so I wrote:

-- Sample table with the same columns as the rows shown above.
CREATE TABLE MyTable (
    [Date]     date,
    ConfigID   int,
    ItemID     int,
    ClientName char(1),
    Metric1    decimal(3,1),
    Metric2    decimal(3,1)
);

-- Explicit column list: the insert no longer depends on the physical column
-- order and keeps working if columns are added to the table later.
INSERT INTO MyTable ([Date], ConfigID, ItemID, ClientName, Metric1, Metric2)
VALUES ('2017-01-01', 1, 1, 'A', 2.0, 2.0),
       ('2017-01-01', 3, 1, 'A', 2.0, 2.0),
       ('2017-01-01', 4, 2, 'B', 5.0, 5.0),
       ('2017-01-02', 4, 3, 'A', 6.0, 6.0),
       ('2017-01-01', 2, 1, 'A', 2.0, 2.0);

-- Number the rows inside each set of duplicates, where "duplicate" means the
-- same Date, ItemID, ClientName, Metric1 and Metric2 (ConfigID is ignored).
WITH Dupes AS (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY [Date], [ItemID], [ClientName], [Metric1], [Metric2]
            -- [Date] is constant within a partition, so the numbering among
            -- the duplicates of one set is arbitrary.
            ORDER BY [Date] DESC
        ) AS RowNum
    -- Same casing as in CREATE TABLE: object names are case-sensitive when the
    -- database uses a case-sensitive collation.
    FROM MyTable
)
-- There is no ORDER BY on the outer query, so the order in which the rows are
-- returned is not guaranteed.
SELECT *
FROM Dupes;

But this returns something like this:

Date        ConfigID    ItemID    ClientName    Metric1    Metric2    RowNum
====        ========    ======    ==========    =======    =======    ======
2017-01-01  1           1         A             2.0        2.0        1
2017-01-01  3           1         A             2.0        2.0        2
2017-01-01  4           2         B             5.0        5.0        1
2017-01-02  4           3         A             6.0        6.0        1
2017-01-01  2           1         A             2.0        2.0        3
....
(20 million rows here)

I would like to group similar rows based on the columns in the PARTITION BY clause. In other words, I would like to see something like this (I don't really need RowNum):

Date        ConfigID    ItemID    ClientName    Metric1    Metric2    RowNum
====        ========    ======    ==========    =======    =======    ======
2017-01-01  1           1         A             2.0        2.0        1
2017-01-01  3           1         A             2.0        2.0        2
2017-01-01  2           1         A             2.0        2.0        3
2017-01-01  4           2         B             5.0        5.0        1
2017-01-02  4           3         A             6.0        6.0        1
....
(20 million rows here)

Which SQL query will help me group duplicate / similar rows in a table? Thank you for your suggestions and answers!

+4
source share
4 answers

Would using DENSE_RANK instead of ROW_NUMBER help?

-- Assign one GroupID per distinct combination of the five columns and return
-- the rows with each group kept together.
;WITH Dupes AS (
    SELECT
        *,
        -- Rows with identical values in all five columns tie in this ordering,
        -- so DENSE_RANK gives every set of duplicates the same GroupID.
        DENSE_RANK() OVER (
            ORDER BY
                [Date],
                [ItemID],
                [ClientName],
                [Metric1],
                [Metric2] DESC   -- DESC applies to [Metric2] only
        ) AS GroupID
    -- Same casing as in CREATE TABLE: object names are case-sensitive when the
    -- database uses a case-sensitive collation.
    FROM MyTable
)
SELECT *
FROM Dupes
-- Without an outer ORDER BY the row order is not guaranteed; sorting by
-- GroupID is what places the duplicates next to each other.
ORDER BY GroupID;

Here is the suggested solution:

-- Return only the rows that have at least one duplicate, with each set of
-- duplicates kept together.
;WITH D1 AS (
    SELECT
        *,
        -- Rows with identical values in all five columns tie in this ordering,
        -- so DENSE_RANK gives every set of duplicates the same GroupID.
        DENSE_RANK() OVER (
            ORDER BY
                [Date],
                [ItemID],
                [ClientName],
                [Metric1],
                [Metric2] DESC   -- DESC applies to [Metric2] only
        ) AS GroupID
    -- Same casing as in CREATE TABLE: object names are case-sensitive when the
    -- database uses a case-sensitive collation.
    FROM MyTable
),
Dupes AS (
    SELECT
        *,
        -- Number of rows sharing this GroupID.
        COUNT(*) OVER (PARTITION BY GroupID) AS GroupItemsCount
    FROM D1
)
SELECT *
FROM Dupes
WHERE GroupItemsCount <> 1
-- Without an outer ORDER BY the row order is not guaranteed; sorting by
-- GroupID is what places the duplicates next to each other.
ORDER BY GroupID;

But the best way could be:

-- Return only the rows that have at least one duplicate on
-- (Date, ItemID, ClientName, Metric1, Metric2), with duplicates kept together.
;WITH Dupes AS (
    SELECT
        *,
        -- Size of the set of rows sharing the same five column values.
        COUNT(*) OVER (
            PARTITION BY [Date], [ItemID], [ClientName], [Metric1], [Metric2]
        ) AS GroupItemsCount
    -- Same casing as in CREATE TABLE: object names are case-sensitive when the
    -- database uses a case-sensitive collation.
    FROM MyTable
)
SELECT *
FROM Dupes
WHERE GroupItemsCount > 1
-- Without an outer ORDER BY the row order is not guaranteed; sorting by the
-- partition columns is what places the duplicates next to each other.
ORDER BY [Date], [ItemID], [ClientName], [Metric1], [Metric2];
+1
source

-- Number the rows inside each set of duplicates and return the sets together,
-- each one sorted by its row number.
;WITH Dupes AS (
    SELECT
        *,
        ROW_NUMBER() OVER (
            PARTITION BY [Date], [ItemID], [ClientName], [Metric1], [Metric2]
            -- [Date] is constant within a partition, so the numbering among
            -- the duplicates of one set is arbitrary.
            ORDER BY [Date] DESC
        ) AS RowNum
    -- Same casing as in CREATE TABLE: object names are case-sensitive when the
    -- database uses a case-sensitive collation.
    FROM MyTable
)
SELECT *
FROM Dupes
-- Sorting by the partition columns places duplicates next to each other;
-- RowNum then orders the rows inside each set.
ORDER BY
    [Date],
    [ItemID],
    [ClientName],
    [Metric1],
    [Metric2],
    RowNum;
+1

You just need an ORDER BY. Keep the CTE as it is and add the ORDER BY to the outer query:

. . .
-- Outer query only; presumably the Dupes CTE from the question precedes it
-- unchanged (it is elided above).
-- Sorting by the same columns used in PARTITION BY places rows with identical
-- values next to each other.
SELECT *
FROM Dupes
ORDER BY [Date], [ItemID], [ClientName], [Metric1], [Metric2];
+1

Based on Luca's suggestion in the comments, using COUNT(*) OVER (PARTITION BY ...) seems to work:

-- Sample table and data used to try out the duplicate-detection query.
CREATE TABLE MyTable (
    [Date]     date,
    ConfigID   int,
    ItemID     int,
    ClientName char(1),
    Metric1    decimal(3,1),
    Metric2    decimal(3,1)
);

-- Explicit column list: the insert no longer depends on the physical column
-- order and keeps working if columns are added to the table later.
INSERT INTO MyTable ([Date], ConfigID, ItemID, ClientName, Metric1, Metric2)
VALUES ('2017-01-01', 1, 1, 'A', 2.0, 2.0),
       ('2017-01-01', 3, 1, 'A', 2.0, 2.0),
       ('2017-01-01', 4, 2, 'B', 5.0, 5.0),
       ('2017-01-02', 4, 3, 'A', 6.0, 6.0),
       ('2017-01-01', 2, 1, 'A', 2.0, 2.0);

-- Return only the rows that have at least one duplicate on
-- (Date, ItemID, ClientName, Metric1, Metric2), with duplicates kept together.
WITH Dupes AS (
    SELECT
        *,
        -- Plain partitioned count, deliberately without ORDER BY in the window:
        -- an ORDER BY there adds a default frame (RANGE UNBOUNDED PRECEDING to
        -- CURRENT ROW) and turns the count into a running count. Ordering by
        -- [Date] only gave the full group size because [Date] is itself a
        -- partition column, so every row in the partition is a peer.
        COUNT(*) OVER (
            PARTITION BY [Date], [ItemID], [ClientName], [Metric1], [Metric2]
        ) AS DupeCount
    -- Same casing as in CREATE TABLE: object names are case-sensitive when the
    -- database uses a case-sensitive collation.
    FROM MyTable
)
SELECT *
FROM Dupes
WHERE DupeCount > 1
-- Without an outer ORDER BY the row order is not guaranteed; sorting by the
-- partition columns is what places the duplicates next to each other.
ORDER BY [Date], [ItemID], [ClientName], [Metric1], [Metric2];
+1
source

Source: https://habr.com/ru/post/1690720/


All Articles