Bigtable Architecture
Cloud Bigtable is a fully managed, scalable NoSQL database for large analytical and operational workloads. It's the same technology that powers Google Search, Maps, and Gmail.
Architecture Overview
Schema Design
Key Design Principles
from google.cloud import bigtable
from google.cloud.bigtable import column_family
from google.cloud.bigtable.row_set import RowSet
import datetime
# Connect to the project; admin=True is passed because the script goes on to
# create a table.
client = bigtable.Client(project="my-project", admin=True)
instance = client.instance("my-instance")

# Local handle for the table that is about to be created.
table_id = "sensor_data"
table = instance.table(table_id)

# One garbage-collection policy per column family.
cf_readings = column_family.MaxVersionsGCPolicy(1)  # retain a single version
cf_metadata = column_family.MaxVersionsGCPolicy(10)  # retain up to ten versions
families = {"readings": cf_readings, "metadata": cf_metadata}

# Create the table together with both column families.
table.create(column_families=families)
print(f"Created table: {table.table_id}")
Row Key Examples
import hashlib
import time
def design_row_key(device_id, timestamp):
    """Build a ``<device_id>#<reversed_ms>`` row key for time-series data.

    The epoch time of *timestamp* in milliseconds is subtracted from a fixed
    ceiling, and the difference is zero-padded to 13 digits. Later timestamps
    therefore produce lexicographically smaller keys for the same device.
    """
    ceiling_ms = 9999999999999  # upper bound the epoch milliseconds are subtracted from
    epoch_ms = int(timestamp.timestamp() * 1000)
    return "{}#{:013d}".format(device_id, ceiling_ms - epoch_ms)
def design_row_key_with_salt(device_id, timestamp):
    """Build a salted ``<salt>#<device_id>#<reversed_ms>`` row key.

    The salt is the first two hex characters of the MD5 digest of
    *device_id*, so a given device always maps to the same prefix. The
    suffix is the 13-digit, zero-padded difference between a fixed ceiling
    and the epoch milliseconds of *timestamp*.
    """
    digest = hashlib.md5(device_id.encode()).hexdigest()
    salt = digest[:2]
    remaining_ms = 9999999999999 - int(timestamp.timestamp() * 1000)
    return "#".join((salt, device_id, f"{remaining_ms:013d}"))
# Example usage: build a row key for the current time and print it.
device_id = "sensor_123"
timestamp = datetime.datetime.now()
row_key = design_row_key(device_id, timestamp)
print(f"Row key: {row_key}") # e.g. sensor_123#8295932799999 (the 13-digit suffix depends on the current time)
Writing Data
from google.cloud import bigtable
from google.cloud.bigtable import column_family
from google.cloud.bigtable.row import DirectRow
import datetime
def write_sensor_data(instance, table_id, device_id, readings):
    """Write one sensor sample to Bigtable as a single row.

    The row key is ``<device_id>#<reversed_ms>``, where ``reversed_ms`` is a
    fixed ceiling minus the current epoch time in milliseconds, zero-padded
    to 13 digits. Every cell is written with the same timestamp that was
    used to derive the row key.

    Args:
        instance: Bigtable instance handle; only ``instance.table()`` is used.
        table_id: ID of the table to write to.
        device_id: Device identifier used as the row-key prefix.
        readings: Mapping holding ``"temperature"``, ``"humidity"`` and
            ``"pressure"`` values, plus an optional ``"location"`` string
            (``"unknown"`` is written when it is absent).

    Raises:
        KeyError: If one of the three required readings is missing.
    """
    table = instance.table(table_id)

    # Use a timezone-aware UTC timestamp. With a naive datetime.now(),
    # .timestamp() interprets the value as local time while the Bigtable
    # client interprets naive datetimes as UTC, so on a non-UTC host the cell
    # timestamp would disagree with the time encoded in the row key.
    timestamp = datetime.datetime.now(datetime.timezone.utc)
    max_ts = 9999999999999
    reversed_ts = max_ts - int(timestamp.timestamp() * 1000)
    row_key = f"{device_id}#{reversed_ts:013d}"

    row = table.direct_row(row_key)

    # Numeric readings are stored as their str() representation, UTF-8 encoded.
    for column in ("temperature", "humidity", "pressure"):
        row.set_cell(
            "readings",
            column.encode(),
            str(readings[column]).encode(),
            timestamp=timestamp,
        )

    # Location is optional metadata.
    row.set_cell(
        "metadata",
        "location".encode(),
        readings.get("location", "unknown").encode(),
        timestamp=timestamp,
    )

    # NOTE(review): the value returned by commit() is not inspected here;
    # confirm whether a failed mutation is reported through it rather than
    # raised, and check it if so.
    row.commit()
    print(f"Written row: {row_key}")
# Example usage: one sample for sensor_123, including the optional location.
readings = dict(
    temperature=23.5,
    humidity=65.2,
    pressure=1013.25,
    location="warehouse_a",
)
write_sensor_data(
    instance,
    table_id="sensor_data",
    device_id="sensor_123",
    readings=readings,
)
Reading Data
def read_time_range(instance, table_id, device_id, start_time, end_time):
    """Read a device's sensor rows that fall within a time range.

    Row keys are expected in the ``<device_id>#<reversed_ms>`` form, where
    ``reversed_ms`` is a fixed ceiling minus the epoch milliseconds. Because
    the timestamp is reversed, *end_time* maps to the first key of the scan
    and *start_time* to the last.

    Args:
        instance: Bigtable instance handle; only ``instance.table()`` is used.
        table_id: ID of the table to read from.
        device_id: Device identifier used as the row-key prefix.
        start_time: Oldest ``datetime`` of the range.
        end_time: Newest ``datetime`` of the range.

    Returns:
        A list of dicts with ``row_key``, ``temperature``, ``humidity`` and
        ``pressure`` entries, in the order the rows are returned by the scan.

    Raises:
        KeyError: If a row lacks the ``readings`` family or one of the three
            reading columns.
    """
    table = instance.table(table_id)

    # Reversed timestamps swap the bounds: the newest time gives the smallest key.
    max_ts = 9999999999999
    start_reversed = max_ts - int(end_time.timestamp() * 1000)
    end_reversed = max_ts - int(start_time.timestamp() * 1000)
    start_key = f"{device_id}#{start_reversed:013d}".encode()
    end_key = f"{device_id}#{end_reversed:013d}".encode()

    # Pass the key bounds straight to read_rows(): it has no `row_range`
    # keyword, and RowRange is not exposed as `bigtable.row_range`.
    # NOTE(review): with read_rows() defaults the end key is exclusive, so a
    # row stamped exactly at start_time is not returned — confirm that this
    # is the intended boundary behaviour.
    rows = table.read_rows(start_key=start_key, end_key=end_key)

    results = []
    for row in rows:
        cells = row.cells["readings"]
        # Index 0 is taken from each column's cell list (presumably the most
        # recent version — verify against the client documentation).
        results.append({
            "row_key": row.row_key.decode(),
            "temperature": float(cells[b"temperature"][0].value.decode()),
            "humidity": float(cells[b"humidity"][0].value.decode()),
            "pressure": float(cells[b"pressure"][0].value.decode()),
        })
    return results
Garbage Collection Policies
from google.cloud.bigtable import column_family
# Policy 1: Keep last N versions
cf_versions = column_family.MaxVersionsGCPolicy(5)
# Policy 2: Keep data for N days
cf_days = column_family.MaxAgeGCPolicy(datetime.timedelta(days=30))
# Policy 3: Combined policy
# The client exposes the union rule as GCRuleUnion(rules=[...]); there is no
# `GCRules.union` helper. GCRuleIntersection is the counterpart that combines
# rules the other way.
cf_combined = column_family.GCRuleUnion(rules=[
    column_family.MaxVersionsGCPolicy(3),
    column_family.MaxAgeGCPolicy(datetime.timedelta(days=7)),
])
# Policy 4: Per-column-family GC
# NOTE(review): `table` is the handle defined earlier in this document; if that
# table was already created, a second create() presumably fails — confirm.
table.create(column_families={
    "readings": column_family.MaxVersionsGCPolicy(1),  # Latest only
    "metadata": column_family.MaxAgeGCPolicy(datetime.timedelta(days=365)),
    "logs": column_family.MaxAgeGCPolicy(datetime.timedelta(days=30)),
})
✨
Best Practice: Use MaxVersionsGCPolicy(1) for sensor readings (keep latest), MaxAgeGCPolicy for historical data retention, and combined policies for complex requirements. GC policies are critical for controlling storage costs in Bigtable.
Common Interview Questions
Q1: What is the difference between Bigtable and Firestore?
Answer: Bigtable is a wide-column store optimized for high-throughput time-series and analytical workloads (IoT, metrics, logs). Firestore is a document database for structured data with complex queries. Bigtable has higher throughput (millions of ops/sec) but limited query capabilities. Firestore supports richer queries but has lower throughput limits.
Q2: How do you design row keys for time-series data?
Answer: Use composite keys: {entity_id}#{reversed_timestamp}. Reversing the timestamp ensures newest data is read first. Add a hash prefix for even distribution across nodes. Avoid monotonically increasing keys as they create hotspots. Consider using device_id or user_id as the entity prefix.
Q3: What is the purpose of column families?
Answer: Column families group columns by access pattern and enable garbage collection policies. They determine data locality and storage efficiency. Keep column families to <100 per table. Use different GC policies per family: MaxVersions for sensor data, MaxAge for historical data.
Q4: How does Bigtable replication work?
Answer: When an instance has more than one cluster, Bigtable automatically replicates data between those clusters for high availability. The storage type (SSD/HDD) is chosen for the instance and shared by all of its clusters. Replication is eventually consistent (typically <10 seconds). Use multi-cluster routing for automatic failover. Replication is critical for disaster recovery.
Q5: How do you optimize Bigtable for cost?
Answer: 1) Use SSD for hot data, HDD for cold data, 2) Configure aggressive GC policies, 3) Use replication only for availability, not performance, 4) Right-size nodes based on throughput, 5) Use batch operations for bulk writes, 6) Monitor and optimize row key design to prevent hotspots.