UPDATE using Rank(), Row_Number excluding duplicate values

Question

I've a dataset similar to the one below.

I need to update the base lookup table based on the values provided in the updated_CustomerId column. The base tables is the same as the dataset but it does not have updated_CustomerId column.

The challenge here that the base table has a unique constraint based on combination of three columns below: Current_CustomerID
Order_ID OrderCategory

DESIRED OUTPUT:

After the update either one of Old_customerIds (17360410 - Pk 8, 21044488 - Pk = 9) can be reassigned to the Update_CustomerID PrimaryKey 2 will not updated as that would lead to Unique constraint violation, but it will then be deleted along with one of the PrimaryKeys from the above either 8 or 9, depending on which one was updated (re-assigned to the new id)

After everything is updated on the base table I then delete from the base table all records where Current_CustomerID was not re-assigned to the updated_CustomerId (if different)

    IF OBJECT_ID('tempdb..#DataSet') IS NOT NULL
    DROP TABLE #DataSet


    IF OBJECT_ID('tempdb..#BaseTable') IS NOT NULL
    DROP TABLE #BaseTable

CREATE TABLE #DataSet
(
    PrimaryKey INT NOT NULL CONSTRAINT [PK_dataset_ID] PRIMARY KEY,
    Current_CustomerID INT NOT NULL,
    Order_ID INT NOT NULL,
    OrderCategory VARCHAR(50) NOT NULL,
    Updated_CustomerId INT NOT NULL
)
INSERT INTO #DataSet (PrimaryKey,   Current_CustomerID, Order_ID,   OrderCategory,  updated_CustomerId)
VALUES
(1, 17395001,   4451784,    'Kitchen',  25693110),
(2, 25693110,   4451784,    'Kitchen',  25693110),

(3, 25693110,   2083059,    'Kitchen',  25693110),
(4, 25693110,   2163679,    'Kitchen',  25693110),
(5, 25693110,   2171466,    'Kitchen',  25693110),

(6, 25693110,   2163679,    'Bathroom', 25693110),
(7, 25693110,   2171466,    'Bathroom', 25693110),

(8, 17360410,   3377931,    'Furniture',    16303984),
(9, 21044488,   3377931,    'Furniture',    16303984),

(10,    1534323,    2641714,    'Furniture',    16303984),
(11,    16303984,   2641726,    'Furniture',    16303984),
(12,    16303984,   2641793,    'Furniture',    16303984),
(13,    16303984,   2641816,    'Furniture',    16303984),

(14,    16303345,   2641816,    'Garden',   16301239),
(15,    12345678,   1239065,    'Medicine', 1075432)

CREATE TABLE #BaseTable
(
    PrimaryKey INT NOT NULL CONSTRAINT [PK_baseTable_ID] PRIMARY KEY,
    CustomerID INT NOT NULL,
    Order_ID INT NOT NULL,
    OrderCategory VARCHAR(50) NOT NULL,
)

CREATE UNIQUE NONCLUSTERED INDEX [IDX_LookUp] ON #BaseTable
(
    CustomerID ASC,
    Order_ID ASC,
    OrderCategory ASC
) ON [PRIMARY]


INSERT INTO #BaseTable (PrimaryKey, CustomerID, Order_ID,   OrderCategory)
VALUES
(1, 17395001,   4451784,    'Kitchen'),
(2, 25693110,   4451784,    'Kitchen'),

(3, 25693110,   2083059,    'Kitchen'),
(4, 25693110,   2163679,    'Kitchen'),
(5, 25693110,   2171466,    'Kitchen'),

(6, 25693110,   2163679,    'Bathroom'),
(7, 25693110,   2171466,    'Bathroom'),

(8, 17360410,   3377931,    'Furniture'),
(9, 21044488,   3377931,    'Furniture'),

(10,    1534323,    2641714,    'Furniture'),
(11,    16303984,   2641726,    'Furniture'),
(12,    16303984,   2641793,    'Furniture'),
(13,    16303984,   2641816,    'Furniture'),

(14,    16303345,   2641816,    'Garden'),
(15,    12345678,   1239065,    'Medicine')

-- select * from #BaseTable
-- select * from #DataSet

    ; with CTE AS (
    select a.*
        ,rank() over (partition by a.updated_CustomerId, a.Order_ID, a.OrderCategory
                                                 order by a.Current_CustomerID) as flag
    from #DataSet a
    )

with CTE AS (
select a.*
    ,rank() over (partition by a.updated_CustomerId, a.Order_ID, a.OrderCategory order by a.Current_CustomerID) as flag
from #DataSet a
)

update b
set CustomerID = a.Updated_CustomerId
from #BaseTable b 
inner join CTE a on b.PrimaryKey = a.PrimaryKey
where flag <> 2

Msg 2601, Level 14, State 1, Line 82 Cannot insert duplicate key row in object 'dbo.#BaseTable' with unique index 'IDX_LookUp'. The duplicate key value is (25693110, 4451784, Kitchen). The statement has been terminated.

Have you tried any of the techniques used in your other question here: https://stackoverflow.com/questions/58775282/duplicates-removal-using-group-by-rank-row-number — Antoine Hernandez, Nov 11 '19 at 19:41

score 1 · Answer 1 · answered Nov 11 '19 at 20:45

1

I think you just want to get a row_number for the #DataTable, and then delete where there are more than one based on the unique key:

//...

DELETE  bt
FROM    #BaseTable bt
    INNER JOIN (
                SELECT  a.PrimaryKey,
                    a.Updated_CustomerId,
                    a.Order_ID,
                    a.OrderCategory,
                    row = ROW_NUMBER() OVER (PARTITION BY a.Updated_CustomerId, a.Order_ID, a.OrderCategory ORDER BY a.Current_CustomerID)
                FROM    #BaseTable b
                    INNER JOIN #DataSet a
                        ON b.PrimaryKey = a.PrimaryKey
            ) x
        ON bt.PrimaryKey = x.PrimaryKey
        AND x.row > 1

answered Nov 11 '19 at 20:45

Jordan Ryder

2,336
1
24
29

Thanks for your answer.I've already did it that way but wanted to be more conservative, meaning update first before removing anything. – Data Engineer Nov 11 '19 at 21:32
@enigma6205 Okay now I see. It's kind of a design preference. You'd either have to assign temporary keys to the other records, or you'd have to drop the index while you're making the changes. A merge technically allows you to do all of those in one statement, but I'm not sure it would allow for this scenario or not because of the duplicate keys. – Jordan Ryder Nov 11 '19 at 21:51

UPDATE using Rank(), Row_Number excluding duplicate values

1 Answers1