BigQuery Deep Dive
BigQuery Architecture
BigQuery is a serverless, highly scalable, and cost-effective multi-cloud data warehouse. It separates storage and compute, allowing independent scaling.
Core Components
SQL Interface:
- Parses SQL queries and creates execution plans
- Supports standard SQL (ANSI 2011 compliant)
- Provides query validation and optimization
Dremel Execution Engine:
- Distributed query execution system
- Converts queries into execution trees
- Processes petabytes of data in seconds
- Uses tree-architecture for parallel processing
Capacitor Storage Format:
- Columnar storage format optimized for analytics
- Supports automatic compression and encoding
- Enables fast data scanning and filtering
- Handles nested and repeated fields
Colossus Distributed Storage:
- Google's distributed file system
- Provides automatic replication and fault tolerance
- Handles data placement and load balancing
- Supports SSD and HDD storage classes
Jupiter Network Fabric:
- High-bandwidth, low-latency network
- Connects compute and storage resources
- Provides petabits of bandwidth
- Enables fast data movement between zones
Dataset and Table Creation
Creating Datasets
-- Create a dataset with options
CREATE SCHEMA analytics
OPTIONS (
location = 'US',
description = 'Analytics dataset for sales data',
default_table_expiration_ms = 7776000000, -- 90 days
default_partition_expiration_ms = 7776000000
);
-- Create a dataset with access controls
CREATE SCHEMA restricted_data
OPTIONS (
location = 'US',
description = 'Restricted access dataset'
);
-- Grant access to the dataset
GRANT `roles/bigquery.dataViewer`
ON SCHEMA restricted_data
TO 'user:analyst@company.com';
Creating Tables
-- Create a basic table
CREATE TABLE analytics.sales (
sale_id INT64 NOT NULL,
product_id STRING,
customer_id STRING,
quantity INT64,
amount FLOAT64,
sale_timestamp TIMESTAMP,
sale_date DATE,
region STRING
)
OPTIONS (
description = 'Sales transaction data',
expiration_timestamp = TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 365 DAY)
);
-- Create a table from a query
CREATE TABLE analytics.daily_sales_summary
AS
SELECT
sale_date,
region,
COUNT(*) as transaction_count,
SUM(amount) as total_amount,
AVG(amount) as avg_amount
FROM analytics.sales
GROUP BY sale_date, region;
Standard SQL Features
Window Functions
-- Running total and moving averages
SELECT
sale_date,
region,
amount,
SUM(amount) OVER (
PARTITION BY region
ORDER BY sale_date
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) as running_total,
AVG(amount) OVER (
PARTITION BY region
ORDER BY sale_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) as moving_avg_7day,
LAG(amount, 1) OVER (
PARTITION BY region
ORDER BY sale_date
) as prev_day_amount
FROM analytics.sales
ORDER BY region, sale_date;
Array and Struct Functions
-- Working with nested data
CREATE TABLE analytics.customer_purchases (
customer_id STRING,
purchase_date DATE,
items ARRAY<STRUCT<
product_id STRING,
quantity INT64,
price FLOAT64
>>,
total_amount FLOAT64
);
-- Query nested data
SELECT
customer_id,
purchase_date,
item.product_id,
item.quantity,
item.price
FROM analytics.customer_purchases,
UNNEST(items) AS item
WHERE item.quantity > 1;
JSON Functions
-- Query JSON data
CREATE TABLE analytics.events (
event_id STRING,
event_timestamp TIMESTAMP,
event_data JSON
);
SELECT
event_id,
event_timestamp,
JSON_VALUE(event_data, '$.user_id') as user_id,
JSON_VALUE(event_data, '$.action') as action,
CAST(JSON_VALUE(event_data, '$.amount') AS FLOAT64) as amount
FROM analytics.events
WHERE JSON_VALUE(event_data, '$.action') = 'purchase';
Partitioning Strategies
Partitioning divides tables into segments based on a column value. This improves query performance and reduces costs.
Time-Unit Partitioning
-- Daily partitioning (default)
CREATE TABLE analytics.events_daily (
event_id STRING,
event_timestamp TIMESTAMP,
user_id STRING,
event_type STRING
)
PARTITION BY DATE(event_timestamp)
OPTIONS (
description = 'Daily partitioned events table'
);
-- Monthly partitioning
CREATE TABLE analytics.events_monthly (
event_id STRING,
event_timestamp TIMESTAMP,
user_id STRING,
event_type STRING
)
PARTITION BY DATE(event_timestamp)
OPTIONS (
partition_expiration_days = 730, -- 2 years
require_partition_filter = true
);
-- Hourly partitioning
CREATE TABLE analytics.events_hourly (
event_id STRING,
event_timestamp TIMESTAMP,
user_id STRING,
event_type STRING
)
PARTITION BY TIMESTAMP_TRUNC(event_timestamp, HOUR)
OPTIONS (
partition_expiration_days = 30
);
Integer-Range Partitioning
-- Integer range partitioning
CREATE TABLE analytics.transactions (
transaction_id INT64,
customer_id INT64,
amount FLOAT64,
transaction_date DATE
)
PARTITION BY
RANGE_BUCKET(customer_id, GENERATE_ARRAY(0, 1000000, 10000))
OPTIONS (
description = 'Partitioned by customer_id ranges'
);
-- Query with partition filter
SELECT *
FROM analytics.transactions
WHERE customer_id BETWEEN 50000 AND 60000;
Partitioning Best Practices
-- Check partition information
SELECT
table_id,
partition_id,
total_rows,
total_logical_bytes,
total_physical_bytes
FROM `analytics.events_daily.INFORMATION_SCHEMA.PARTITIONS`
WHERE partition_id IS NOT NULL
ORDER BY partition_id;
-- Force partition filter
SET OPTIONS (
analytics.events_daily.require_partition_filter = true
);
Clustering
Clustering sorts data within partitions based on column values. This improves query performance for filtered queries.
Basic Clustering
-- Single column clustering
CREATE TABLE analytics.sales_clustered (
sale_id INT64,
sale_date DATE,
region STRING,
product_category STRING,
amount FLOAT64
)
PARTITION BY sale_date
CLUSTER BY region
OPTIONS (
description = 'Sales data clustered by region'
);
-- Multi-column clustering
CREATE TABLE analytics.events_clustered (
event_id STRING,
event_timestamp TIMESTAMP,
user_id STRING,
event_type STRING,
device_type STRING,
country STRING
)
PARTITION BY DATE(event_timestamp)
CLUSTER BY event_type, device_type, country
OPTIONS (
description = 'Events clustered by type, device, and country'
);
Clustering Best Practices
-- Check clustering information
SELECT
table_id,
clustering_information
FROM `analytics.INFORMATION_SCHEMA.TABLE_OPTIONS`
WHERE option_name = 'CLUSTERING INFORMATION';
-- Recommended clustering order:
-- 1. Most frequently filtered columns
-- 2. Columns with high cardinality
-- 3. Columns used in JOIN and GROUP BY clauses
-- Example: Optimal clustering for analytics
CREATE TABLE analytics.user_events (
user_id STRING,
event_date DATE,
event_type STRING,
platform STRING,
country STRING,
event_data JSON
)
PARTITION BY event_date
CLUSTER BY user_id, event_type, platform;
Bi-Engine
Bi-Engine is an in-memory analytics layer that accelerates query performance for dashboards and reports.
Enabling Bi-Engine
-- Enable Bi-Engine for a dataset
ALTER SCHEMA analytics
SET OPTIONS (
bi_engine_options = '{"mode": "AUTO", "size_gb": 10}'
);
-- Enable Bi-Engine for a table
ALTER TABLE analytics.sales
SET OPTIONS (
bi_engine_options = '{"mode": "AUTO", "size_gb": 5}'
);
-- Reserve Bi-Engine slots
ALTER SCHEMA analytics
SET OPTIONS (
bi_engine_options = '{"mode": "RESERVED", "size_gb": 20}'
);
Bi-Engine Performance
-- Query performance comparison
-- Without Bi-Engine: ~5 seconds
-- With Bi-Engine: ~0.5 seconds
-- Monitor Bi-Engine usage
SELECT
project_id,
dataset_id,
table_id,
bi_engine_statistics
FROM `analytics.INFORMATION_SCHEMA.TABLE_STORAGE`
WHERE bi_engine_statistics IS NOT NULL;
Slot Management
Slots are BigQuery's unit of compute capacity. Managing slots is crucial for performance and cost optimization.
On-Demand Pricing
-- Default: On-demand pricing (pay per TB scanned)
-- 2000 slots per project (default)
-- Cost: $5 per TB scanned
-- Monitor slot usage
SELECT
job_id,
creation_time,
total_slot_ms,
total_bytes_processed,
total_bytes_processed / 1024 / 1024 / 1024 as tb_processed
FROM `analytics.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
WHERE creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 HOUR)
ORDER BY total_slot_ms DESC;
Flat-Rate Pricing
-- Purchase slots for predictable costs
-- 100 slots: ~$2,000/month
-- 500 slots: ~$8,000/month
-- 1000 slots: ~$15,000/month
-- Create a reservation
bq mk --reservation=analytics_reservation \
--location=US \
--slots=100
-- Assign reservation to project
bq mk --assignment=analytics_reservation \
--location=US \
--project_id=my-project
Slot Monitoring
-- Monitor slot utilization
SELECT
TIMESTAMP_TRUNC(creation_time, HOUR) as hour,
COUNT(*) as job_count,
SUM(total_slot_ms) / 1000 / 3600 as slot_hours,
AVG(total_slot_ms) / 1000 as avg_slot_seconds
FROM `analytics.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
WHERE creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 24 HOUR)
GROUP BY hour
ORDER BY hour;
-- Find top slot consumers
SELECT
user_email,
job_type,
COUNT(*) as job_count,
SUM(total_slot_ms) / 1000 as total_slot_seconds
FROM `analytics.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
WHERE creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY)
GROUP BY user_email, job_type
ORDER BY total_slot_seconds DESC
LIMIT 10;
Query Optimization
Partition and Cluster Pruning
-- Good: Uses partition filter
SELECT *
FROM analytics.events_daily
WHERE event_date = '2024-01-15';
-- Bad: Full table scan
SELECT *
FROM analytics.events_daily
WHERE EXTRACT(YEAR FROM event_timestamp) = 2024;
-- Good: Uses clustering columns
SELECT *
FROM analytics.events_clustered
WHERE event_date = '2024-01-15'
AND event_type = 'purchase';
-- Check query execution details
SELECT
job_id,
query,
total_bytes_processed,
total_slot_ms,
cache_hit
FROM `analytics.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
WHERE job_id = 'JOB_ID';
Materialized Views
-- Create materialized view for frequent queries
CREATE MATERIALIZED VIEW analytics.daily_sales_mv
AS
SELECT
sale_date,
region,
product_category,
COUNT(*) as transaction_count,
SUM(amount) as total_amount,
AVG(amount) as avg_amount
FROM analytics.sales
GROUP BY sale_date, region, product_category;
-- Refresh materialized view
ALTER MATERIALIZED VIEW analytics.daily_sales_mv
SET OPTIONS (
refresh_interval_minutes = 60
);
-- Check materialized view status
SELECT
table_id,
total_rows,
total_logical_bytes,
total_physical_bytes
FROM `analytics.INFORMATION_SCHEMA.TABLE_STORAGE`
WHERE table_id = 'daily_sales_mv';
Best Practices Summary
- Always partition large tables - Partition by date or integer range
- Use clustering - Cluster by frequently filtered columns
- Monitor slot usage - Track slot utilization and optimize queries
- Use Bi-Engine - Enable for dashboard and report queries
- Implement data governance - Use column-level security and data masking
- Optimize costs - Use partition filters and clustering to reduce bytes scanned
- Monitor query performance - Use query execution details to identify bottlenecks