BigQuery Deep Dive

BigQuery Architecture

BigQuery is a serverless, highly scalable, and cost-effective multi-cloud data warehouse. It separates storage and compute, allowing independent scaling.

Core Components

SQL Interface:

Parses SQL queries and creates execution plans
Supports standard SQL (ANSI 2011 compliant)
Provides query validation and optimization

Dremel Execution Engine:

Distributed query execution system
Converts queries into execution trees
Processes petabytes of data in seconds
Uses tree-architecture for parallel processing

Capacitor Storage Format:

Columnar storage format optimized for analytics
Supports automatic compression and encoding
Enables fast data scanning and filtering
Handles nested and repeated fields

Colossus Distributed Storage:

Google's distributed file system
Provides automatic replication and fault tolerance
Handles data placement and load balancing
Supports SSD and HDD storage classes

Jupiter Network Fabric:

High-bandwidth, low-latency network
Connects compute and storage resources
Provides petabits of bandwidth
Enables fast data movement between zones

Dataset and Table Creation

Creating Datasets

-- Create a dataset with options
CREATE SCHEMA analytics
OPTIONS (
  location = 'US',
  description = 'Analytics dataset for sales data',
  default_table_expiration_ms = 7776000000,  -- 90 days
  default_partition_expiration_ms = 7776000000
);

-- Create a dataset with access controls
CREATE SCHEMA restricted_data
OPTIONS (
  location = 'US',
  description = 'Restricted access dataset'
);

-- Grant access to the dataset
GRANT `roles/bigquery.dataViewer`
ON SCHEMA restricted_data
TO 'user:analyst@company.com';

Creating Tables

-- Create a basic table
CREATE TABLE analytics.sales (
  sale_id INT64 NOT NULL,
  product_id STRING,
  customer_id STRING,
  quantity INT64,
  amount FLOAT64,
  sale_timestamp TIMESTAMP,
  sale_date DATE,
  region STRING
)
OPTIONS (
  description = 'Sales transaction data',
  expiration_timestamp = TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 365 DAY)
);

-- Create a table from a query
CREATE TABLE analytics.daily_sales_summary
AS
SELECT
  sale_date,
  region,
  COUNT(*) as transaction_count,
  SUM(amount) as total_amount,
  AVG(amount) as avg_amount
FROM analytics.sales
GROUP BY sale_date, region;

Standard SQL Features

Window Functions

-- Running total and moving averages
SELECT
  sale_date,
  region,
  amount,
  SUM(amount) OVER (
    PARTITION BY region
    ORDER BY sale_date
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  ) as running_total,
  AVG(amount) OVER (
    PARTITION BY region
    ORDER BY sale_date
    ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
  ) as moving_avg_7day,
  LAG(amount, 1) OVER (
    PARTITION BY region
    ORDER BY sale_date
  ) as prev_day_amount
FROM analytics.sales
ORDER BY region, sale_date;

Array and Struct Functions

-- Working with nested data
CREATE TABLE analytics.customer_purchases (
  customer_id STRING,
  purchase_date DATE,
  items ARRAY<STRUCT<
    product_id STRING,
    quantity INT64,
    price FLOAT64
  >>,
  total_amount FLOAT64
);

-- Query nested data
SELECT
  customer_id,
  purchase_date,
  item.product_id,
  item.quantity,
  item.price
FROM analytics.customer_purchases,
UNNEST(items) AS item
WHERE item.quantity > 1;

JSON Functions

-- Query JSON data
CREATE TABLE analytics.events (
  event_id STRING,
  event_timestamp TIMESTAMP,
  event_data JSON
);

SELECT
  event_id,
  event_timestamp,
  JSON_VALUE(event_data, '$.user_id') as user_id,
  JSON_VALUE(event_data, '$.action') as action,
  CAST(JSON_VALUE(event_data, '$.amount') AS FLOAT64) as amount
FROM analytics.events
WHERE JSON_VALUE(event_data, '$.action') = 'purchase';

Partitioning Strategies

Partitioning divides tables into segments based on a column value. This improves query performance and reduces costs.

Time-Unit Partitioning

-- Daily partitioning (default)
CREATE TABLE analytics.events_daily (
  event_id STRING,
  event_timestamp TIMESTAMP,
  user_id STRING,
  event_type STRING
)
PARTITION BY DATE(event_timestamp)
OPTIONS (
  description = 'Daily partitioned events table'
);

-- Monthly partitioning
CREATE TABLE analytics.events_monthly (
  event_id STRING,
  event_timestamp TIMESTAMP,
  user_id STRING,
  event_type STRING
)
PARTITION BY DATE(event_timestamp)
OPTIONS (
  partition_expiration_days = 730,  -- 2 years
  require_partition_filter = true
);

-- Hourly partitioning
CREATE TABLE analytics.events_hourly (
  event_id STRING,
  event_timestamp TIMESTAMP,
  user_id STRING,
  event_type STRING
)
PARTITION BY TIMESTAMP_TRUNC(event_timestamp, HOUR)
OPTIONS (
  partition_expiration_days = 30
);

Integer-Range Partitioning

-- Integer range partitioning
CREATE TABLE analytics.transactions (
  transaction_id INT64,
  customer_id INT64,
  amount FLOAT64,
  transaction_date DATE
)
PARTITION BY
  RANGE_BUCKET(customer_id, GENERATE_ARRAY(0, 1000000, 10000))
OPTIONS (
  description = 'Partitioned by customer_id ranges'
);

-- Query with partition filter
SELECT *
FROM analytics.transactions
WHERE customer_id BETWEEN 50000 AND 60000;

Partitioning Best Practices

-- Check partition information
SELECT
  table_id,
  partition_id,
  total_rows,
  total_logical_bytes,
  total_physical_bytes
FROM `analytics.events_daily.INFORMATION_SCHEMA.PARTITIONS`
WHERE partition_id IS NOT NULL
ORDER BY partition_id;

-- Force partition filter
SET OPTIONS (
  analytics.events_daily.require_partition_filter = true
);

Clustering

Clustering sorts data within partitions based on column values. This improves query performance for filtered queries.

Basic Clustering

-- Single column clustering
CREATE TABLE analytics.sales_clustered (
  sale_id INT64,
  sale_date DATE,
  region STRING,
  product_category STRING,
  amount FLOAT64
)
PARTITION BY sale_date
CLUSTER BY region
OPTIONS (
  description = 'Sales data clustered by region'
);

-- Multi-column clustering
CREATE TABLE analytics.events_clustered (
  event_id STRING,
  event_timestamp TIMESTAMP,
  user_id STRING,
  event_type STRING,
  device_type STRING,
  country STRING
)
PARTITION BY DATE(event_timestamp)
CLUSTER BY event_type, device_type, country
OPTIONS (
  description = 'Events clustered by type, device, and country'
);

Clustering Best Practices

-- Check clustering information
SELECT
  table_id,
  clustering_information
FROM `analytics.INFORMATION_SCHEMA.TABLE_OPTIONS`
WHERE option_name = 'CLUSTERING INFORMATION';

-- Recommended clustering order:
-- 1. Most frequently filtered columns
-- 2. Columns with high cardinality
-- 3. Columns used in JOIN and GROUP BY clauses

-- Example: Optimal clustering for analytics
CREATE TABLE analytics.user_events (
  user_id STRING,
  event_date DATE,
  event_type STRING,
  platform STRING,
  country STRING,
  event_data JSON
)
PARTITION BY event_date
CLUSTER BY user_id, event_type, platform;

Bi-Engine

Bi-Engine is an in-memory analytics layer that accelerates query performance for dashboards and reports.

Enabling Bi-Engine

-- Enable Bi-Engine for a dataset
ALTER SCHEMA analytics
SET OPTIONS (
  bi_engine_options = '{"mode": "AUTO", "size_gb": 10}'
);

-- Enable Bi-Engine for a table
ALTER TABLE analytics.sales
SET OPTIONS (
  bi_engine_options = '{"mode": "AUTO", "size_gb": 5}'
);

-- Reserve Bi-Engine slots
ALTER SCHEMA analytics
SET OPTIONS (
  bi_engine_options = '{"mode": "RESERVED", "size_gb": 20}'
);

Bi-Engine Performance

-- Query performance comparison
-- Without Bi-Engine: ~5 seconds
-- With Bi-Engine: ~0.5 seconds

-- Monitor Bi-Engine usage
SELECT
  project_id,
  dataset_id,
  table_id,
  bi_engine_statistics
FROM `analytics.INFORMATION_SCHEMA.TABLE_STORAGE`
WHERE bi_engine_statistics IS NOT NULL;

Slot Management

Slots are BigQuery's unit of compute capacity. Managing slots is crucial for performance and cost optimization.

On-Demand Pricing

-- Default: On-demand pricing (pay per TB scanned)
-- 2000 slots per project (default)
-- Cost: $5 per TB scanned

-- Monitor slot usage
SELECT
  job_id,
  creation_time,
  total_slot_ms,
  total_bytes_processed,
  total_bytes_processed / 1024 / 1024 / 1024 as tb_processed
FROM `analytics.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
WHERE creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 HOUR)
ORDER BY total_slot_ms DESC;

Flat-Rate Pricing

-- Purchase slots for predictable costs
-- 100 slots: ~$2,000/month
-- 500 slots: ~$8,000/month
-- 1000 slots: ~$15,000/month

-- Create a reservation
bq mk --reservation=analytics_reservation \
  --location=US \
  --slots=100

-- Assign reservation to project
bq mk --assignment=analytics_reservation \
  --location=US \
  --project_id=my-project

Slot Monitoring

-- Monitor slot utilization
SELECT
  TIMESTAMP_TRUNC(creation_time, HOUR) as hour,
  COUNT(*) as job_count,
  SUM(total_slot_ms) / 1000 / 3600 as slot_hours,
  AVG(total_slot_ms) / 1000 as avg_slot_seconds
FROM `analytics.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
WHERE creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 24 HOUR)
GROUP BY hour
ORDER BY hour;

-- Find top slot consumers
SELECT
  user_email,
  job_type,
  COUNT(*) as job_count,
  SUM(total_slot_ms) / 1000 as total_slot_seconds
FROM `analytics.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
WHERE creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY)
GROUP BY user_email, job_type
ORDER BY total_slot_seconds DESC
LIMIT 10;

Query Optimization

Partition and Cluster Pruning

-- Good: Uses partition filter
SELECT *
FROM analytics.events_daily
WHERE event_date = '2024-01-15';

-- Bad: Full table scan
SELECT *
FROM analytics.events_daily
WHERE EXTRACT(YEAR FROM event_timestamp) = 2024;

-- Good: Uses clustering columns
SELECT *
FROM analytics.events_clustered
WHERE event_date = '2024-01-15'
  AND event_type = 'purchase';

-- Check query execution details
SELECT
  job_id,
  query,
  total_bytes_processed,
  total_slot_ms,
  cache_hit
FROM `analytics.INFORMATION_SCHEMA.JOBS_BY_PROJECT`
WHERE job_id = 'JOB_ID';

Materialized Views

-- Create materialized view for frequent queries
CREATE MATERIALIZED VIEW analytics.daily_sales_mv
AS
SELECT
  sale_date,
  region,
  product_category,
  COUNT(*) as transaction_count,
  SUM(amount) as total_amount,
  AVG(amount) as avg_amount
FROM analytics.sales
GROUP BY sale_date, region, product_category;

-- Refresh materialized view
ALTER MATERIALIZED VIEW analytics.daily_sales_mv
  SET OPTIONS (
    refresh_interval_minutes = 60
  );

-- Check materialized view status
SELECT
  table_id,
  total_rows,
  total_logical_bytes,
  total_physical_bytes
FROM `analytics.INFORMATION_SCHEMA.TABLE_STORAGE`
WHERE table_id = 'daily_sales_mv';

Best Practices Summary

Always partition large tables - Partition by date or integer range
Use clustering - Cluster by frequently filtered columns
Monitor slot usage - Track slot utilization and optimize queries
Use Bi-Engine - Enable for dashboard and report queries
Implement data governance - Use column-level security and data masking
Optimize costs - Use partition filters and clustering to reduce bytes scanned
Monitor query performance - Use query execution details to identify bottlenecks

BigQuery Deep Dive - Architecture, SQL, Partitioning, and Clustering