Partition Pruning & Elimination

Difficulty: Hard | Companies: Google, Amazon, Meta, Netflix, Uber

Range Partitioning

-- PostgreSQL declarative partitioning
CREATE TABLE orders (
  order_id BIGINT,
  customer_id INT,
  order_date DATE,
  total_amount DECIMAL(10,2)
) PARTITION BY RANGE (order_date);

-- Create partitions
CREATE TABLE orders_2023_q1 PARTITION OF orders
  FOR VALUES FROM ('2023-01-01') TO ('2023-04-01');

CREATE TABLE orders_2023_q2 PARTITION OF orders
  FOR VALUES FROM ('2023-04-01') TO ('2023-07-01');

CREATE TABLE orders_2023_q3 PARTITION OF orders
  FOR VALUES FROM ('2023-07-01') TO ('2023-10-01');

CREATE TABLE orders_2023_q4 PARTITION OF orders
  FOR VALUES FROM ('2023-10-01') TO ('2024-01-01');

ℹ️

Key Insight: Partition pruning automatically eliminates partitions that don't match the query's WHERE clause. This dramatically reduces I/O by only scanning relevant partitions.

Partition Pruning Verification

-- Verify partition pruning is working
EXPLAIN (ANALYZE, BUFFERS)
SELECT * FROM orders
WHERE order_date >= '2023-04-01'
  AND order_date < '2023-07-01';

-- Check pruning statistics
SELECT
  schemaname,
  tablename,
  pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS size
FROM pg_tables
WHERE tablename LIKE 'orders_%'
ORDER BY tablename;

Hash Partitioning

-- Hash partitioning for even distribution
CREATE TABLE users (
  user_id BIGINT,
  username VARCHAR(100),
  email VARCHAR(255)
) PARTITION BY HASH (user_id);

-- Create hash partitions
CREATE TABLE users_p0 PARTITION OF users
  FOR VALUES WITH (MODULUS 4, REMAINDER 0);

CREATE TABLE users_p1 PARTITION OF users
  FOR VALUES WITH (MODULUS 4, REMAINDER 1);

CREATE TABLE users_p2 PARTITION OF users
  FOR VALUES WITH (MODULUS 4, REMAINDER 2);

CREATE TABLE users_p3 PARTITION OF users
  FOR VALUES WITH (MODULUS 4, REMAINDER 3);

List Partitioning

-- List partitioning for categorical data
CREATE TABLE sales (
  sale_id BIGINT,
  region VARCHAR(50),
  amount DECIMAL(10,2),
  sale_date DATE
) PARTITION BY LIST (region);

CREATE TABLE sales_us PARTITION OF sales
  FOR VALUES IN ('US_EAST', 'US_WEST', 'US_CENTRAL');

CREATE TABLE sales_eu PARTITION OF sales
  FOR VALUES IN ('EU_WEST', 'EU_EAST', 'EU_NORTH');

CREATE TABLE sales_apac PARTITION OF sales
  FOR VALUES IN ('APAC_EAST', 'APAC_WEST');

BigQuery Partitioning

-- BigQuery time-unit partitioning
CREATE TABLE `project.dataset.events` (
  event_id STRING,
  event_time TIMESTAMP,
  user_id STRING,
  event_type STRING
)
PARTITION BY DATE(event_time)
OPTIONS (
  require_partition_filter = true
);

-- BigQuery ingestion-time partitioning
CREATE TABLE `project.dataset.logs` (
  log_id STRING,
  message STRING
)
PARTITION BY DATE(_PARTITIONTIME);

-- Query with partition filter
SELECT *
FROM `project.dataset.events`
WHERE event_time >= '2024-01-01'
  AND event_time < '2024-02-01';

⚠️

Important: BigQuery's require_partition_filter = true option forces queries to include a partition filter, preventing full-table scans and reducing costs.

Partition Pruning Patterns

-- Good: Pruning works
SELECT * FROM orders
WHERE order_date = '2023-06-15';

-- Good: Range pruning
SELECT * FROM orders
WHERE order_date BETWEEN '2023-04-01' AND '2023-06-30';

-- Bad: Pruning fails (function on partition column)
SELECT * FROM orders
WHERE EXTRACT(YEAR FROM order_date) = 2023;

-- Good: Rewrite for pruning
SELECT * FROM orders
WHERE order_date >= '2023-01-01'
  AND order_date < '2024-01-01';

Composite Partitioning

-- Subpartitioning (partition of partition)
CREATE TABLE orders (
  order_id BIGINT,
  customer_id INT,
  order_date DATE,
  region VARCHAR(50)
) PARTITION BY RANGE (order_date);

-- First level: by date
CREATE TABLE orders_2023 PARTITION OF orders
  FOR VALUES FROM ('2023-01-01') TO ('2024-01-01')
  PARTITION BY LIST (region);

-- Second level: by region within year
CREATE TABLE orders_2023_us PARTITION OF orders_2023
  FOR VALUES IN ('US_EAST', 'US_WEST');

Partition Maintenance

-- Create new partition in advance
CREATE TABLE orders_2024_q1 PARTITION OF orders
  FOR VALUES FROM ('2024-01-01') TO ('2024-04-01');

-- Detach old partition
ALTER TABLE orders DETACH PARTITION orders_2022_q1;

-- Drop old partition
DROP TABLE orders_2022_q1;

-- Attach existing table as partition
CREATE TABLE orders_old (
  LIKE orders INCLUDING ALL
);

ALTER TABLE orders ATTACH PARTITION orders_old
  FOR VALUES FROM ('2022-01-01') TO ('2023-01-01');

Default Partition

-- Default partition catches all unmatched rows
CREATE TABLE orders_default PARTITION OF orders DEFAULT;

-- This ensures no rows are lost
INSERT INTO orders (order_id, order_date)
VALUES (1, '2025-01-01');  -- Goes to default partition

Partition-wise JOIN

-- Enable partition-wise join
SET enable_partitionwise_join = on;

-- Query joins partitioned tables
SELECT
  o.order_id,
  c.customer_name
FROM orders o
INNER JOIN customers c ON o.customer_id = c.customer_id
WHERE o.order_date >= '2023-01-01'
  AND o.order_date < '2024-01-01';

Partition Statistics

-- Analyze partition statistics
SELECT
  schemaname,
  tablename,
  n_live_tup,
  n_dead_tup,
  last_vacuum,
  last_autovacuum
FROM pg_stat_user_tables
WHERE tablename LIKE 'orders_%'
ORDER BY tablename;

-- Check partition sizes
SELECT
  partition,
  pg_size_pretty(pg_total_relation_size partition) AS size
FROM (
  SELECT
    schemaname||'.'||tablename AS partition
  FROM pg_tables
  WHERE tablename LIKE 'orders_%'
) sub;

Follow-Up Questions

When would you choose hash partitioning over range partitioning?
How does partition pruning interact with index usage?
What's the impact of partitioning on INSERT performance?
How do you handle cross-partition queries efficiently?
Explain the concept of partition elimination in distributed databases.
How do you manage partition lifecycle in a data warehouse?