Temporal Queries & SCD
Difficulty: Hard | Companies: Google, Amazon, Meta, Netflix, Uber
SCD Type 2 Implementation
-- Slowly Changing Dimension Type 2
CREATE TABLE dim_customer (
customer_key SERIAL PRIMARY KEY,
customer_id INT,
customer_name VARCHAR(255),
email VARCHAR(255),
city VARCHAR(100),
effective_date DATE,
expiry_date DATE,
is_current BOOLEAN DEFAULT true
);
-- Insert new version on change
INSERT INTO dim_customer (
customer_id, customer_name, email, city,
effective_date, expiry_date, is_current
)
VALUES (
101, 'John Updated', 'john.new@email.com', 'New York',
CURRENT_DATE, '9999-12-31', true
);
-- Expire old record
UPDATE dim_customer
SET expiry_date = CURRENT_DATE - INTERVAL '1' DAY,
is_current = false
WHERE customer_id = 101
AND is_current = true;
βΉοΈ
Key Insight: SCD Type 2 preserves full history by creating new records for each change. Query with is_current = true for latest state, or use date range for point-in-time analysis.
Temporal Query with Date Range
-- Find active records for a specific date
SELECT
c.customer_id,
c.customer_name,
c.city,
c.effective_date,
c.expiry_date
FROM dim_customer c
WHERE c.customer_id = 101
AND '2024-06-15'::DATE >= c.effective_date
AND '2024-06-15'::DATE < c.expiry_date;
BigQuery Temporal Tables
-- BigQuery SYSTEM_TIME temporal table
CREATE TABLE `project.dataset.orders` (
order_id INT64,
customer_id INT64,
total_amount NUMERIC,
status STRING,
valid_from TIMESTAMP,
valid_to TIMESTAMP
)
OPTIONS (
enable_logical_deletion = false
);
-- Query as of specific timestamp
SELECT *
FROM `project.dataset.orders`
FOR SYSTEM_TIME AS OF '2024-06-15 10:00:00'
WHERE order_id = 12345;
-- Query between two timestamps
SELECT *
FROM `project.dataset.orders`
FOR SYSTEM_TIME BETWEEN '2024-01-01' AND '2024-06-30'
WHERE customer_id = 101;
Gap Filling for Time Series
-- Fill missing dates in time series
WITH date_range AS (
SELECT generate_series(
'2024-01-01'::DATE,
'2024-12-31'::DATE,
'1 day'::INTERVAL
)::DATE AS sale_date
)
SELECT
dr.sale_date,
COALESCE(daily.total_sales, 0) AS total_sales,
COALESCE(daily.order_count, 0) AS order_count
FROM date_range dr
LEFT JOIN (
SELECT
sale_date,
SUM(amount) AS total_sales,
COUNT(*) AS order_count
FROM sales
WHERE sale_date >= '2024-01-01'
AND sale_date <= '2024-12-31'
GROUP BY sale_date
) daily ON dr.sale_date = daily.sale_date
ORDER BY dr.sale_date;
Window Functions for Temporal Analysis
-- Calculate month-over-month growth
WITH monthly_sales AS (
SELECT
DATE_TRUNC('month', sale_date)::DATE AS month,
SUM(amount) AS total_sales
FROM sales
GROUP BY 1
)
SELECT
month,
total_sales,
LAG(total_sales) OVER (ORDER BY month) AS prev_month_sales,
total_sales - LAG(total_sales) OVER (ORDER BY month) AS absolute_growth,
ROUND(
(total_sales - LAG(total_sales) OVER (ORDER BY month)) * 100.0 /
NULLIF(LAG(total_sales) OVER (ORDER BY month), 0),
2
) AS growth_rate_pct
FROM monthly_sales
ORDER BY month;
SCD Type 1 vs Type 2 Comparison
-- SCD Type 1: Overwrite (no history)
UPDATE dim_customer
SET city = 'San Francisco',
last_modified = CURRENT_TIMESTAMP
WHERE customer_id = 101;
-- SCD Type 2: Preserve history
-- Step 1: Expire current
UPDATE dim_customer
SET expiry_date = CURRENT_DATE,
is_current = false
WHERE customer_id = 101
AND is_current = true;
-- Step 2: Insert new version
INSERT INTO dim_customer (
customer_id, customer_name, email, city,
effective_date, expiry_date, is_current
)
SELECT
customer_id, customer_name, email, 'San Francisco',
CURRENT_DATE, '9999-12-31', true
FROM dim_customer
WHERE customer_id = 101
AND is_current = false
ORDER BY effective_date DESC
LIMIT 1;
Event Sequencing with Gaps
-- Find gaps in event sequences
WITH numbered_events AS (
SELECT
event_id,
event_type,
event_time,
LAG(event_time) OVER (
PARTITION BY event_type ORDER BY event_time
) AS prev_event_time
FROM events
)
SELECT
event_type,
prev_event_time AS gap_start,
event_time AS gap_end,
event_time - prev_event_time AS gap_duration,
CASE
WHEN event_time - prev_event_time > INTERVAL '1 hour'
THEN 'LARGE_GAP'
ELSE 'NORMAL'
END AS gap_type
FROM numbered_events
WHERE prev_event_time IS NOT NULL
AND event_time - prev_event_time > INTERVAL '30 minutes';
Temporal Joins
-- Join two temporal tables for point-in-time analysis
SELECT
o.order_id,
o.order_date,
c.customer_name,
c.city,
p.product_name,
p.category
FROM orders o
INNER JOIN dim_customer c
ON o.customer_id = c.customer_id
AND o.order_date >= c.effective_date
AND o.order_date < c.expiry_date
INNER JOIN dim_product p
ON o.product_id = p.product_id
AND o.order_date >= p.effective_date
AND o.order_date < p.expiry_date;
Sessionization
-- Group events into sessions (30-min inactivity = new session)
WITH events_with_gap AS (
SELECT
user_id,
event_time,
event_type,
LAG(event_time) OVER (
PARTITION BY user_id ORDER BY event_time
) AS prev_event_time
FROM events
),
session_assignments AS (
SELECT
*,
SUM(
CASE
WHEN event_time - prev_event_time > INTERVAL '30 minutes'
OR prev_event_time IS NULL
THEN 1
ELSE 0
END
) OVER (
PARTITION BY user_id ORDER BY event_time
) AS session_id
FROM events_with_gap
)
SELECT
user_id,
session_id,
MIN(event_time) AS session_start,
MAX(event_time) AS session_end,
MAX(event_time) - MIN(event_time) AS session_duration,
COUNT(*) AS event_count
FROM session_assignments
GROUP BY user_id, session_id;
Time Bucketing
-- Bucket time intervals into fixed-size windows
SELECT
DATE_TRUNC('hour', event_time) AS hour_bucket,
COUNT(*) AS event_count,
COUNT(DISTINCT user_id) AS unique_users
FROM events
WHERE event_time >= '2024-06-01'
AND event_time < '2024-07-01'
GROUP BY DATE_TRUNC('hour', event_time)
ORDER BY hour_bucket;
β οΈ
Common Mistake: When using DATE_TRUNC, ensure your timezone handling is consistent. Use AT TIME ZONE to convert timestamps to the desired timezone before truncating.
Follow-Up Questions
- How do you handle timezone conversions in temporal queries?
- What's the difference between SCD Type 1, 2, and 3?
- How would you implement a Type 6 SCD (hybrid approach)?
- Explain the concept of bitemporal data and how to query it.
- How do you efficiently gap-fill time series data with irregular intervals?
- What's the best approach for handling daylight saving time transitions in temporal queries?