Google Cloud Bigtable Deep Dive

Master Google Cloud Bigtable including wide-column architecture, schema design, HBase API, replication, and time-series data patterns.

18 min read · Advanced

Bigtable Architecture

Cloud Bigtable is a fully managed, scalable NoSQL database for large analytical and operational workloads. It's the same technology that powers Google Search, Maps, and Gmail.

Architecture Overview

⚡ Bigtable Architecture for Data Engineering

Interview Tip: Bigtable is ideal for time-series and IoT data with high write throughput. Design composite row keys carefully to avoid hotspots: lead with a well-distributed value such as a device ID rather than a timestamp, and store domain names in reverse notation (for example, com.example.www) so related rows stay adjacent. Column families group related data and have independent GC policies. Bigtable ≠ Firestore (choose Bigtable for massive scale, Firestore for document queries).

Schema Design

Key Design Principles

⚡ Bigtable Architecture for Data Engineering

from google.cloud import bigtable
from google.cloud.bigtable import column_family
from google.cloud.bigtable.row_set import RowSet
import datetime

# Initialize the Bigtable client. admin=True is passed because the
# table-creation call below is an admin operation.
client = bigtable.Client(project="my-project", admin=True)
instance = client.instance("my-instance")

# Local handle for the table; the table itself is created by table.create().
table_id = "sensor_data"
table = instance.table(table_id)

# Define one garbage-collection rule per column family.
# The Python client names these classes *GCRule (MaxVersionsGCRule,
# MaxAgeGCRule, GCRuleUnion, GCRuleIntersection); there is no
# MaxVersionsGCPolicy class in google.cloud.bigtable.column_family.
cf_readings = column_family.MaxVersionsGCRule(1)  # Keep 1 version
cf_metadata = column_family.MaxVersionsGCRule(10)  # Keep 10 versions

# Create the table together with its column families.
table.create(column_families={
    "readings": cf_readings,
    "metadata": cf_metadata,
})

print(f"Created table: {table.table_id}")

Row Key Examples

import hashlib
import time

def design_row_key(device_id, timestamp):
    """Build a ``<device_id>#<reversed_ms>`` row key for time-series rows.

    The timestamp is converted to epoch milliseconds and subtracted from
    9999999999999, so a later timestamp yields a smaller number. The result
    is zero-padded to 13 digits.

    Args:
        device_id: Identifier placed at the front of the key.
        timestamp: ``datetime`` of the sample.

    Returns:
        The row key as a string.
    """
    epoch_ms = int(timestamp.timestamp() * 1000)
    newest_first = 9999999999999 - epoch_ms
    return "{}#{:013d}".format(device_id, newest_first)

def design_row_key_with_salt(device_id, timestamp):
    """Build a ``<hash>#<device_id>#<reversed_ms>`` row key.

    The prefix is the first two hex characters of the MD5 digest of the
    device ID, so it is always the same for a given device. The suffix is
    9999999999999 minus the timestamp's epoch milliseconds, zero-padded to
    13 digits.

    Args:
        device_id: Device identifier (a string).
        timestamp: ``datetime`` of the sample.

    Returns:
        The row key as a string.
    """
    digest = hashlib.md5(device_id.encode()).hexdigest()
    bucket = digest[:2]

    epoch_ms = int(timestamp.timestamp() * 1000)
    newest_first = 9999999999999 - epoch_ms

    return "#".join((bucket, device_id, f"{newest_first:013d}"))

# Example usage
device_id = "sensor_123"
# A timezone-aware timestamp avoids the ambiguity of naive local times
# (for example around daylight-saving transitions).
timestamp = datetime.datetime.now(datetime.timezone.utc)
row_key = design_row_key(device_id, timestamp)
# Prints the device ID followed by a 13-digit reversed timestamp
# (9999999999999 minus the current epoch milliseconds), e.g.
# "Row key: sensor_123#8295932799999" for 2024-01-01T00:00:00Z.
print(f"Row key: {row_key}")

Writing Data

from google.cloud import bigtable
from google.cloud.bigtable import column_family
from google.cloud.bigtable.row import DirectRow
import datetime

def write_sensor_data(instance, table_id, device_id, readings):
    """Write one sensor sample to Bigtable as a single row.

    The row key is ``<device_id>#<reversed_ms>``, where ``reversed_ms`` is
    9999999999999 minus the current epoch milliseconds, zero-padded to 13
    digits. Temperature, humidity and pressure go into the "readings"
    family and the location into the "metadata" family, all with the same
    cell timestamp.

    Args:
        instance: Bigtable instance handle; only ``instance.table()`` is used.
        table_id: ID of the table to write to.
        device_id: Device identifier used as the row-key prefix.
        readings: Mapping with "temperature", "humidity" and "pressure"
            entries and an optional "location" (defaults to "unknown").

    Raises:
        KeyError: If a required reading is missing from ``readings``.
        RuntimeError: If the commit returns a status with a non-zero code.
    """
    table = instance.table(table_id)

    # Use an explicit UTC-aware timestamp. The row key is derived with
    # .timestamp(), which reads a naive value as local time, whereas the
    # Bigtable client is understood to treat a naive datetime as UTC; an
    # aware value keeps the key and the cell timestamps in agreement.
    timestamp = datetime.datetime.now(datetime.timezone.utc)
    max_ts = 9999999999999
    reversed_ts = max_ts - int(timestamp.timestamp() * 1000)
    row_key = f"{device_id}#{reversed_ts:013d}"

    row = table.direct_row(row_key)

    # Numeric readings are stored as their string form, UTF-8 encoded.
    for column in ("temperature", "humidity", "pressure"):
        row.set_cell(
            "readings",
            column.encode(),
            str(readings[column]).encode(),
            timestamp=timestamp,
        )

    row.set_cell(
        "metadata",
        b"location",
        str(readings.get("location", "unknown")).encode(),
        timestamp=timestamp,
    )

    # commit() hands back a status object rather than raising on a failed
    # mutation (presumably a google.rpc Status; older releases may return
    # None), so check it before reporting success.
    status = row.commit()
    code = getattr(status, "code", 0)
    if code:
        message = getattr(status, "message", "")
        raise RuntimeError(
            f"Failed to write row {row_key}: code={code} {message}".rstrip()
        )
    print(f"Written row: {row_key}")

# Example usage: write one sample for sensor_123 into the sensor_data table.
readings = dict(
    temperature=23.5,
    humidity=65.2,
    pressure=1013.25,
    location="warehouse_a",
)
write_sensor_data(
    instance=instance,
    table_id="sensor_data",
    device_id="sensor_123",
    readings=readings,
)

Reading Data

def read_time_range(instance, table_id, device_id, start_time, end_time):
    """Read a device's samples whose row keys fall within a time range.

    Row keys have the form ``<device_id>#<reversed_ms>`` with
    ``reversed_ms = 9999999999999 - epoch_ms``, so the later bound
    (``end_time``) gives the smaller key and becomes the scan's start key.
    Both bounds are inclusive.

    Args:
        instance: Bigtable instance handle; only ``instance.table()`` is used.
        table_id: ID of the table to read from.
        device_id: Device identifier used as the row-key prefix.
        start_time: Earliest ``datetime`` to include.
        end_time: Latest ``datetime`` to include.

    Returns:
        A list of dicts with "row_key" (str) and float "temperature",
        "humidity" and "pressure" values, in the order rows are returned.

    Raises:
        ValueError: If ``start_time`` is later than ``end_time``.
        KeyError: If a row lacks the "readings" family or one of its columns.
    """
    max_ts = 9999999999999
    start_ms = int(start_time.timestamp() * 1000)
    end_ms = int(end_time.timestamp() * 1000)
    # Compare epoch values rather than the datetimes themselves so that a
    # mix of naive and aware arguments does not raise TypeError here.
    if start_ms > end_ms:
        raise ValueError("start_time must not be later than end_time")

    table = instance.table(table_id)

    # Reversed timestamps invert the ordering: the newest sample has the
    # smallest key, so the scan runs from end_time's key to start_time's key.
    first_key = f"{device_id}#{max_ts - end_ms:013d}".encode()
    last_key = f"{device_id}#{max_ts - start_ms:013d}".encode()

    # Table.read_rows takes the key bounds directly (or a RowSet via
    # row_set=); it has no row_range parameter.
    rows = table.read_rows(
        start_key=first_key,
        end_key=last_key,
        end_inclusive=True,
    )

    results = []
    for row in rows:
        cells = row.cells["readings"]
        sample = {"row_key": row.row_key.decode()}
        for column in ("temperature", "humidity", "pressure"):
            # Index 0 is taken as the cell to report (presumably the most
            # recent version; confirm against the client's cell ordering).
            sample[column] = float(cells[column.encode()][0].value.decode())
        results.append(sample)

    return results

Garbage Collection Policies

from google.cloud.bigtable import column_family

# The Python client exposes GC policies as *GCRule classes in
# google.cloud.bigtable.column_family.

# Policy 1: Keep last N versions
cf_versions = column_family.MaxVersionsGCRule(5)

# Policy 2: Keep data for N days
cf_days = column_family.MaxAgeGCRule(datetime.timedelta(days=30))

# Policy 3: Combined policy.
# A union makes a cell eligible for deletion when ANY sub-rule matches
# (here: beyond the 3 newest versions OR older than 7 days). Use
# GCRuleIntersection(rules=[...]) to require ALL sub-rules to match.
cf_combined = column_family.GCRuleUnion(rules=[
    column_family.MaxVersionsGCRule(3),
    column_family.MaxAgeGCRule(datetime.timedelta(days=7)),
])

# Policy 4: Per-column-family GC
# NOTE: table.create() creates the table and is expected to fail if the
# table already exists; for an existing table, add or change a family with
# table.column_family(name, gc_rule).create() / .update() instead.
table.create(column_families={
    "readings": column_family.MaxVersionsGCRule(1),  # Latest only
    "metadata": column_family.MaxAgeGCRule(datetime.timedelta(days=365)),
    "logs": column_family.MaxAgeGCRule(datetime.timedelta(days=30)),
})

✨

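Best Practice: Keep a single version (a max-versions rule of 1) for sensor readings where only the latest value matters, use a max-age rule for historical data retention, and combine rules for more complex requirements. In the Python client these are the MaxVersionsGCRule, MaxAgeGCRule, and GCRuleUnion/GCRuleIntersection classes. GC policies are critical for controlling storage costs in Bigtable.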

💬

Common Interview Questions

Q1: What is the difference between Bigtable and Firestore?

Answer: Bigtable is a wide-column store optimized for high-throughput time-series and analytical workloads (IoT, metrics, logs). Firestore is a document database for structured data with complex queries. Bigtable has higher throughput (millions of ops/sec) but limited query capabilities. Firestore supports richer queries but has lower throughput limits.

Q2: How do you design row keys for time-series data?

Answer: Use composite keys: {entity_id}#{reversed_timestamp}. Reversing the timestamp ensures newest data is read first. Add a hash prefix for even distribution across nodes. Avoid monotonically increasing keys as they create hotspots. Consider using device_id or user_id as the entity prefix.

Q3: What is the purpose of column families?

Answer: Column families group columns by access pattern and enable garbage collection policies. They determine data locality and storage efficiency. Keep column families to <100 per table. Use different GC policies per family: MaxVersions for sensor data, MaxAge for historical data.

Q4: How does Bigtable replication work?

Answer: When an instance has more than one cluster, Bigtable automatically replicates data between the clusters for high availability. Clusters can be placed in different zones or regions; the storage type (SSD or HDD) is chosen for the instance and shared by all of its clusters. Replication is eventually consistent (typically <10 seconds). Use multi-cluster routing for automatic failover. Replication is critical for disaster recovery.

Q5: How do you optimize Bigtable for cost?

Answer: 1) Use SSD for hot data, HDD for cold data, 2) Configure aggressive GC policies, 3) Use replication only for availability, not performance, 4) Right-size nodes based on throughput, 5) Use batch operations for bulk writes, 6) Monitor and optimize row key design to prevent hotspots.

Bigtable: Wide-Column Store for Time-Series Data