Materialized Views & Caching
Difficulty: Hard | Companies: Google, Amazon, Meta, Netflix, Uber
Materialized View Fundamentals
-- Create materialized view
CREATE MATERIALIZED VIEW mv_monthly_sales AS
SELECT
DATE_TRUNC('month', order_date)::DATE AS month,
region,
product_category,
SUM(amount) AS total_sales,
COUNT(*) AS order_count,
COUNT(DISTINCT customer_id) AS unique_customers
FROM orders
INNER JOIN products ON orders.product_id = products.product_id
GROUP BY 1, 2, 3;
-- Query the materialized view (fast!)
SELECT * FROM mv_monthly_sales
WHERE month = '2024-01-01'
AND region = 'US';
βΉοΈ
Key Insight: Materialized views store the result of a query physically. They're excellent for expensive aggregations that are queried frequently. Unlike regular views, they don't update automatically.
Refresh Strategies
-- Full refresh (blocking)
REFRESH MATERIALIZED VIEW mv_monthly_sales;
-- Concurrent refresh (non-blocking, requires unique index)
REFRESH MATERIALIZED VIEW CONCURRENTLY mv_monthly_sales;
-- Refresh with data until refresh completes
CREATE MATERIALIZED VIEW mv_sales WITH DATA AS
SELECT ...;
-- Create without data, populate later
CREATE MATERIALIZED VIEW mv_sales WITHOUT DATA AS
SELECT ...;
REFRESH MATERIALIZED VIEW mv_sales;
Concurrent Refresh with Unique Index
-- Create unique index for concurrent refresh
CREATE UNIQUE INDEX idx_mv_monthly_sales
ON mv_monthly_sales (month, region, product_category);
-- Now concurrent refresh is possible
REFRESH MATERIALIZED VIEW CONCURRENTLY mv_monthly_sales;
β οΈ
Important: Concurrent refresh requires a UNIQUE index. Without it, you must use non-concurrent refresh which locks the view.
BigQuery Materialized Views
-- BigQuery materialized view
CREATE MATERIALIZED VIEW `project.dataset.mv_daily_sales`
OPTIONS (
enable_refresh = true,
refresh_interval_minutes = 60
)
AS
SELECT
DATE(order_date) AS sale_date,
product_category,
SUM(amount) AS total_sales,
COUNT(*) AS order_count
FROM `project.dataset.orders`
GROUP BY 1, 2;
-- Check refresh status
SELECT * FROM `project.dataset.INFORMATION_SCHEMA.MATERIALIZED_VIEWS`;
Incremental Refresh Pattern
-- Track changes for incremental refresh
CREATE TABLE mv_refresh_log (
mv_name VARCHAR(100),
last_refresh TIMESTAMP,
PRIMARY KEY (mv_name)
);
-- Incremental refresh procedure
CREATE OR REPLACE PROCEDURE refresh_mv_incremental(mv_name TEXT)
LANGUAGE plpgsql
AS $$
DECLARE
last_refresh_time TIMESTAMP;
BEGIN
-- Get last refresh time
SELECT last_refresh INTO last_refresh_time
FROM mv_refresh_log
WHERE mv_name = $1;
-- Refresh materialized view
REFRESH MATERIALIZED VIEW CONCURRENTLY mv_monthly_sales;
-- Update refresh log
INSERT INTO mv_refresh_log (mv_name, last_refresh)
VALUES ($1, CURRENT_TIMESTAMP)
ON CONFLICT (mv_name)
DO UPDATE SET last_refresh = CURRENT_TIMESTAMP;
END;
$$;
Indexing Materialized Views
-- Create indexes on materialized views
CREATE INDEX idx_mv_sales_month ON mv_monthly_sales (month);
CREATE INDEX idx_mv_sales_region ON mv_monthly_sales (region, month);
CREATE INDEX idx_mv_sales_category ON mv_monthly_sales (product_category);
-- Check index sizes
SELECT
indexname,
pg_size_pretty(pg_relation_size(indexname::regclass)) AS size
FROM pg_indexes
WHERE tablename = 'mv_monthly_sales';
Query Caching Patterns
-- Cache expensive computations
CREATE MATERIALIZED VIEW mv_user_stats AS
SELECT
user_id,
COUNT(*) AS total_orders,
SUM(amount) AS total_spent,
AVG(amount) AS avg_order_value,
MIN(order_date) AS first_order,
MAX(order_date) AS last_order,
AVG(EXTRACT(DAY FROM order_date - LAG(order_date)
OVER (ORDER BY order_date))) AS avg_days_between_orders
FROM orders
GROUP BY user_id;
-- Refresh schedule (e.g., daily)
REFRESH MATERIALIZED VIEW CONCURRENTLY mv_user_stats;
Partitioned Materialized Views
-- Partition materialized view by date
CREATE MATERIALIZED VIEW mv_orders_daily
PARTITION BY RANGE (sale_date) AS
SELECT
DATE(order_date) AS sale_date,
product_id,
SUM(amount) AS daily_sales
FROM orders
GROUP BY 1, 2;
-- Create partitions
CREATE TABLE mv_orders_daily_2024_q1
PARTITION OF mv_orders_daily
FOR VALUES FROM ('2024-01-01') TO ('2024-04-01');
Conditional Materialization
-- Create materialized view with filter
CREATE MATERIALIZED VIEW mv_high_value_orders AS
SELECT
order_id,
customer_id,
total_amount,
order_date
FROM orders
WHERE total_amount > 10000 -- Only high-value orders
AND order_date > CURRENT_DATE - INTERVAL '1 year';
-- Index for fast lookups
CREATE INDEX idx_mv_hvo_customer ON mv_high_value_orders (customer_id);
Materialized View Refresh Monitoring
-- Monitor refresh status
SELECT
schemaname,
matviewname,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||matviewname)) AS size,
last_refresh
FROM pg_stat_user_matviews
ORDER BY pg_total_relation_size(schemaname||'.'||matviewname) DESC;
-- Check refresh frequency
SELECT
mv_name,
last_refresh,
CURRENT_TIMESTAMP - last_refresh AS time_since_refresh
FROM mv_refresh_log;
Auto-Refresh with pg_cron
-- Schedule automatic refresh using pg_cron
CREATE EXTENSION IF NOT EXISTS pg_cron;
-- Refresh daily at 2 AM
SELECT cron.schedule(
'refresh-mv-monthly',
'0 2 * * *',
'REFRESH MATERIALIZED VIEW CONCURRENTLY mv_monthly_sales'
);
-- Check scheduled jobs
SELECT * FROM cron.job;
Data Warehouse Optimization
-- Star schema materialized views
CREATE MATERIALIZED VIEW mv_fact_sales_summary AS
SELECT
d.date_key,
p.product_key,
c.customer_key,
s.store_key,
SUM(f.sales_amount) AS total_sales,
SUM(f.quantity) AS total_quantity,
COUNT(*) AS transaction_count
FROM fact_sales f
INNER JOIN dim_date d ON f.date_id = d.date_id
INNER JOIN dim_product p ON f.product_id = p.product_id
INNER JOIN dim_customer c ON f.customer_id = c.customer_id
INNER JOIN dim_store s ON f.store_id = s.store_id
GROUP BY 1, 2, 3, 4;
-- Create composite index
CREATE INDEX idx_mv_fss_composite
ON mv_fact_sales_summary (date_key, product_key, customer_key, store_key);
Nested Materialized Views
-- Create base materialized view
CREATE MATERIALIZED VIEW mv_base_sales AS
SELECT
order_date,
product_id,
SUM(amount) AS daily_sales
FROM orders
GROUP BY 1, 2;
-- Create derived materialized view
CREATE MATERIALIZED VIEW mv_weekly_sales AS
SELECT
DATE_TRUNC('week', order_date)::DATE AS week,
product_id,
SUM(daily_sales) AS weekly_sales
FROM mv_base_sales
GROUP BY 1, 2;
-- Refresh base first, then derived
REFRESH MATERIALIZED VIEW mv_base_sales;
REFRESH MATERIALIZED VIEW mv_weekly_sales;
Follow-Up Questions
- When should you use materialized views vs regular views?
- How do you handle stale data in materialized views?
- What's the performance impact of concurrent vs non-concurrent refresh?
- How would you implement incremental refresh for large materialized views?
- Explain the trade-offs between refresh frequency and query performance.
- How do you manage materialized views in a data warehouse environment?