I am trying to implement a Kimball data mart that uses -1 and -2 rows in a dimension table for late arrival dimensions and for a business key NULL. I have an example code below that creates one staging table for fact and dimension data, two dimension tables and one fact table for data mart. Here is my sample data code in SQL:
--drop table stg_sales --go CREATE TABLE dbo.stg_sales ( stg_sales_id INT IDENTITY(1, 1) NOT NULL, sales_number INT NOT NULL, sales_amt INT NULL, cust_number INT NULL, cust_firstname NVARCHAR(50) NULL, cust_lastname NVARCHAR(100) NULL, cust_address NVARCHAR(500) NULL, salesperson_number INT NULL, CONSTRAINT pk_stg_sales PRIMARY KEY (stg_sales_id) ) go INSERT stg_sales (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number) VALUES (123, 434, 2342, 'Jim', 'Moriaty', 'something', 23) INSERT stg_sales (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number) VALUES (124, 234, 2342, 'Jim', 'Moriaty', 'something', 23) INSERT stg_sales (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number) VALUES (125, 434, 4545, 'Joe', 'Esk', 'someother', 24) INSERT stg_sales (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number) VALUES (126, 434, 5555, 'Daniel', 'Hart', 'Someaddr', NULL) --salesperson_number business key missing here INSERT stg_sales (sales_number, sales_amt, cust_number, cust_firstname, cust_lastname, cust_address, salesperson_number) VALUES (127, 333, 4444, 'Pat', 'Smith', 'Someaddr', 30) SELECT * FROM stg_sales --create a dimension and fact tables --drop table dbo.dim_customer --go CREATE TABLE dbo.dim_customer ( customer_wid INT IDENTITY(1, 1) NOT NULL, cust_number INT NULL, cust_firstname NVARCHAR(50) NULL, cust_lastname NVARCHAR(100) NULL, cust_address NVARCHAR(500) NULL, date_insert DATETIME2 NOT NULL DEFAULT (Getdate()), date_update DATETIME2 NULL, is_current BIT NOT NULL CONSTRAINT pk_dim_customer PRIMARY KEY (customer_wid) CONSTRAINT chk_is_current CHECK (is_current IN (0, 1)) ) go SET IDENTITY_INSERT dbo.dim_customer ON INSERT dbo.dim_customer (customer_wid, cust_number, cust_firstname, cust_lastname, cust_address, date_insert, date_update, is_current) VALUES (-1, -1, 'unknown', 'unknown', 'unknown', Getdate(), Getdate(), 1) INSERT dbo.dim_customer (customer_wid, cust_number, cust_firstname, cust_lastname, cust_address, date_insert, date_update, is_current) VALUES (-2, -2, 'Error', 'Error', 'Error', Getdate(), Getdate(), 1) SET IDENTITY_INSERT dbo.dim_customer OFF --insert data into dimension table INSERT dbo.dim_customer (cust_number, cust_firstname, cust_lastname, cust_address, is_current) SELECT DISTINCT cust_number, cust_firstname, cust_lastname, cust_address, 1 AS is_current FROM dbo.stg_sales WHERE cust_number <> 4444 --left one record off to simulate the situation where you don't have corrensponding row in dim table (late arriving dimension) SELECT * FROM dbo.dim_customer DROP TABLE dbo.dim_salesperson --create salesperson table CREATE TABLE dbo.dim_salesperson ( salesperson_wid INT IDENTITY(1, 1) NOT NULL, salesperson_number INT NULL, salesperson_firstname NVARCHAR(50) NULL, salesperson_lastname NVARCHAR(100) NULL, salesperson_address NVARCHAR(500) NULL, date_insert DATETIME2 NOT NULL DEFAULT (Getdate()), date_update DATETIME2 NULL, is_current BIT NOT NULL CONSTRAINT pk_dim_salesperson PRIMARY KEY (salesperson_wid) CONSTRAINT chk_dim_salesperson_is_current CHECK (is_current IN (0, 1)) ) go SET IDENTITY_INSERT dbo.dim_salesperson ON INSERT dbo.dim_salesperson (salesperson_wid, salesperson_number, salesperson_firstname, salesperson_lastname, salesperson_address, date_insert, date_update, is_current) VALUES (-1, -1, 'Not available', 'Not available', 'Not available', Getdate(), Getdate(), 1) INSERT dbo.dim_salesperson (salesperson_wid, salesperson_number, salesperson_firstname, salesperson_lastname, salesperson_address, date_insert, date_update, is_current) VALUES (-2, -2, 'Error', 'Error', 'Error', Getdate(), Getdate(), 1) SET IDENTITY_INSERT dbo.dim_salesperson OFF --insert data into salesperson INSERT dbo.dim_salesperson (salesperson_number, salesperson_firstname, salesperson_lastname, salesperson_address, is_current) VALUES (23, 'John', 'Fox', 'something', 1) INSERT dbo.dim_salesperson (salesperson_number, salesperson_firstname, salesperson_lastname, salesperson_address, is_current) VALUES (24, 'Hadley', 'Fox', 'something', 1) INSERT dbo.dim_salesperson (salesperson_number, salesperson_firstname, salesperson_lastname, salesperson_address, is_current) VALUES (30, 'Ashley', 'Fox', 'something', 1) SELECT * FROM dbo.dim_salesperson SELECT * FROM dbo.stg_sales --create and populate the fact table --drop table dbo.f_sales --go CREATE TABLE dbo.f_sales ( sales_number INT NOT NULL, customer_wid INT NOT NULL, salesperson_wid INT NOT NULL, sales_amt INT NULL CONSTRAINT pk_f_sales PRIMARY KEY (sales_number) CONSTRAINT fk_customer_wid FOREIGN KEY (customer_wid) REFERENCES dbo.dim_customer(customer_wid), CONSTRAINT fk_salesperson_wid FOREIGN KEY (salesperson_wid) REFERENCES dbo.dim_salesperson(salesperson_wid) ) --populate the fact table INSERT dbo.f_sales (sales_number, customer_wid, salesperson_wid, sales_amt) SELECT stg.sales_number, Isnull(dimcust.customer_wid, -1) AS customer_wid, --this is maybe correct way to assign -1 foreign key when there is no corresponding dimension row in the dim table Isnull(dimsp.salesperson_wid, -2) AS salesperson_wid, --NOT CORRECT, how to assign -2 foreign key when the business key is NULL in the source? stg.sales_amt FROM dbo.stg_sales AS stg LEFT JOIN dbo.dim_customer AS dimcust ON stg.cust_number = dimcust.cust_number LEFT JOIN dbo.dim_salesperson AS dimsp ON stg.salesperson_number = dimsp.salesperson_number SELECT * FROM dbo.f_sales
How can I assign -2 to rows that do not have a business key in the source system. You can learn more about the theory of this implementation from Kimball:
This is basically what I'm trying to achieve:
NULL processing in dimensions and dimensions
Edit:
I think I can use COALESCE or ISNULL in the left join, it seems to give the correct result:
INSERT dbo.f_sales (sales_number, customer_wid, salesperson_wid, sales_amt) SELECT stg.sales_number, Isnull(dimcust.customer_wid, -1) AS customer_wid, --this is maybe correct way to assign -1 foreign key when there is no corresponding dimension row in the dim table dimsp.salesperson_wid, stg.sales_amt FROM dbo.stg_sales AS stg LEFT JOIN dbo.dim_customer AS dimcust ON COALESCE(stg.cust_number, -2) = dimcust.cust_number LEFT JOIN dbo.dim_salesperson AS dimsp ON COALESCE(stg.salesperson_number, -2) = dimsp.salesperson_number