Snowflake Data Vault Modeling
Data Vault 2.0 is a methodology for designing scalable, auditable, and flexible data warehouses that support historical tracking and parallel loading.
Data Vault Components
Hubs (Business Keys)
-- Create Hub table
CREATE OR REPLACE TABLE dv.hub_customer (
hub_customer_hk BINARY(16) PRIMARY KEY, -- Hash key
customer_bk VARCHAR(50), -- Business key
load_date TIMESTAMP_NTZ, -- Load timestamp
record_source VARCHAR(100) -- Source system
);
-- Load Hub table
INSERT INTO dv.hub_customer (
hub_customer_hk,
customer_bk,
load_date,
record_source
)
SELECT
SHA2(customer_id::STRING, 256) as hub_customer_hk,
customer_id::STRING as customer_bk,
CURRENT_TIMESTAMP() as load_date,
'ERP_SYSTEM' as record_source
FROM staging.customers
WHERE customer_id IS NOT NULL
AND customer_id NOT IN (SELECT customer_bk FROM dv.hub_customer);
Links (Relationships)
-- Create Link table
CREATE OR REPLACE TABLE dv.link_order_customer (
link_order_customer_hk BINARY(16) PRIMARY KEY,
hub_order_hk BINARY(16),
hub_customer_hk BINARY(16),
load_date TIMESTAMP_NTZ,
record_source VARCHAR(100)
);
-- Load Link table
INSERT INTO dv.link_order_customer (
link_order_customer_hk,
hub_order_hk,
hub_customer_hk,
load_date,
record_source
)
SELECT
SHA2(CONCAT(o.hub_order_hk, c.hub_customer_hk), 256) as link_order_customer_hk,
o.hub_order_hk,
c.hub_customer_hk,
CURRENT_TIMESTAMP() as load_date,
'ERP_SYSTEM' as record_source
FROM dv.hub_order o
JOIN staging.order_details od ON o.order_bk = od.order_id::STRING
JOIN dv.hub_customer c ON od.customer_id::STRING = c.customer_bk;
Satellites (Attributes)
-- Create Satellite table
CREATE OR REPLACE TABLE dv.sat_customer_details (
hub_customer_hk BINARY(16),
load_date TIMESTAMP_NTZ,
load_end_date TIMESTAMP_NTZ,
hash_diff BINARY(16), -- Hash of attributes
first_name VARCHAR(100),
last_name VARCHAR(100),
email VARCHAR(255),
phone VARCHAR(50),
address VARCHAR(500),
record_source VARCHAR(100),
PRIMARY KEY (hub_customer_hk, load_date)
);
-- Load Satellite table
INSERT INTO dv.sat_customer_details (
hub_customer_hk,
load_date,
load_end_date,
hash_diff,
first_name,
last_name,
email,
phone,
address,
record_source
)
SELECT
c.hub_customer_hk,
CURRENT_TIMESTAMP() as load_date,
NULL as load_end_date,
SHA2(CONCAT(
COALESCE(s.first_name, ''),
COALESCE(s.last_name, ''),
COALESCE(s.email, ''),
COALESCE(s.phone, ''),
COALESCE(s.address, '')
), 256) as hash_diff,
s.first_name,
s.last_name,
s.email,
s.phone,
s.address,
'ERP_SYSTEM' as record_source
FROM dv.hub_customer c
JOIN staging.customers s ON c.customer_bk = s.customer_id::STRING;
Hash Key Generation
-- Create hash key generation functions
CREATE OR REPLACE FUNCTION dv.hash_business_key(business_key VARCHAR)
RETURNS BINARY(16)
LANGUAGE SQL
AS
$$
SHA2(business_key, 256)
$$;
-- Create hash diff function
CREATE OR REPLACE FUNCTION dv.hash_diff(attributes ARRAY)
RETURNS BINARY(16)
LANGUAGE SQL
AS
$$
SHA2(CONCAT_WS(',', attributes), 256)
$$;
-- Use in loading process
SELECT
dv.hash_business_key(customer_id::STRING) as hub_customer_hk,
dv.hash_diff(ARRAY_CONSTRUCT(first_name, last_name, email)) as hash_diff
FROM staging.customers;
Loading Patterns
Full Load
-- Full load procedure
CREATE OR REPLACE PROCEDURE dv.load_full_customer()
RETURNS STRING
LANGUAGE SQL
AS
$$
BEGIN
-- Truncate staging
TRUNCATE TABLE staging.customers;
-- Load from source
INSERT INTO staging.customers
SELECT * FROM source_system.customers;
-- Load Hub
INSERT INTO dv.hub_customer (
hub_customer_hk, customer_bk, load_date, record_source
)
SELECT
SHA2(customer_id::STRING, 256),
customer_id::STRING,
CURRENT_TIMESTAMP(),
'SOURCE_SYSTEM'
FROM staging.customers
WHERE customer_id IS NOT NULL;
-- Load Satellite
INSERT INTO dv.sat_customer_details (
hub_customer_hk, load_date, load_end_date, hash_diff,
first_name, last_name, email, phone, address, record_source
)
SELECT
SHA2(c.customer_id::STRING, 256),
CURRENT_TIMESTAMP(),
NULL,
SHA2(CONCAT(c.first_name, c.last_name, c.email), 256),
c.first_name, c.last_name, c.email, c.phone, c.address,
'SOURCE_SYSTEM'
FROM staging.customers c;
RETURN 'SUCCESS: Customer data loaded';
END;
$$;
Incremental Load
-- Incremental load using Streams
CREATE STREAM customer_stream
ON TABLE staging.customers
APPEND_ONLY = FALSE;
-- Load Hub incrementally
INSERT INTO dv.hub_customer (
hub_customer_hk, customer_bk, load_date, record_source
)
SELECT
SHA2(customer_id::STRING, 256),
customer_id::STRING,
CURRENT_TIMESTAMP(),
'SOURCE_SYSTEM'
FROM customer_stream
WHERE METADATA$ACTION = 'INSERT'
AND customer_id NOT IN (SELECT customer_bk FROM dv.hub_customer);
-- Load Satellite incrementally
INSERT INTO dv.sat_customer_details (
hub_customer_hk, load_date, load_end_date, hash_diff,
first_name, last_name, email, phone, address, record_source
)
SELECT
SHA2(c.customer_id::STRING, 256),
CURRENT_TIMESTAMP(),
NULL,
SHA2(CONCAT(c.first_name, c.last_name, c.email), 256),
c.first_name, c.last_name, c.email, c.phone, c.address,
'SOURCE_SYSTEM'
FROM customer_stream c
WHERE METADATA$ACTION = 'INSERT';
Data Vault loading is designed for parallel processing. Hubs, Links, and Satellites can be loaded independently, enabling high-throughput data ingestion.
Querying Data Vault
Point-in-Time Queries
-- Query as of specific timestamp
SELECT
c.customer_bk,
s.first_name,
s.last_name,
s.email
FROM dv.hub_customer c
JOIN dv.sat_customer_details s ON c.hub_customer_hk = s.hub_customer_hk
WHERE s.load_date <= '2024-01-15'::TIMESTAMP
AND (s.load_end_date IS NULL OR s.load_end_date > '2024-01-15'::TIMESTAMP);
-- Query current state
SELECT
c.customer_bk,
s.first_name,
s.last_name,
s.email
FROM dv.hub_customer c
JOIN dv.sat_customer_details s ON c.hub_customer_hk = s.hub_customer_hk
WHERE s.load_end_date IS NULL;
Building Business Marts
-- Create business mart
CREATE OR REPLACE VIEW mart.customer_orders AS
SELECT
c.customer_bk,
c.first_name,
c.last_name,
c.email,
COUNT(DISTINCT o.order_bk) as total_orders,
SUM(o.order_amount) as total_spend,
MIN(o.order_date) as first_order_date,
MAX(o.order_date) as last_order_date
FROM dv.hub_customer c
JOIN dv.link_order_customer loc ON c.hub_customer_hk = loc.hub_customer_hk
JOIN dv.hub_order o ON loc.hub_order_hk = o.hub_order_hk
JOIN dv.sat_order_details s ON o.hub_order_hk = s.hub_order_hk
GROUP BY c.customer_bk, c.first_name, c.last_name, c.email;
Data Vault Benefits
| Benefit | Description |
|---|---|
| Auditability | Complete history of all changes |
| Parallel Loading | Independent Hub/Link/Satellite loading |
| Flexibility | Easy to add new sources |
| Scalability | Supports massive data volumes |
| Point-in-Time | Historical query capability |
Summary
Key Takeaways
Data Vault 2.0 uses Hubs, Links, and Satellites for scalable, auditable modeling.
Hubs store business keys, Links store relationships, Satellites store attributes.
Hash keys enable efficient joins and support parallel loading operations.
Business Marts provide analytics-ready denormalized views from Data Vault tables.
Data Vault Components
| Component | Purpose | Contains |
|---|---|---|
| Hub | Business entity | Hash key, business key, metadata |
| Link | Relationships | Hash key, parent hub keys, metadata |
| Satellite | Attributes | Parent hash key, load date, attributes |
| Business Mart | Analytics | Denormalized view of Hub + Link + Satellite |
Implementation Steps
- Create Hub tables for business entities
- Create Link tables for relationships
- Create Satellite tables for attributes
- Implement hash key generation functions
- Build incremental loading with Streams
- Create Business Marts for analytics consumption