Interview Question: "Explain HyperLogLog and its error bounds. When would you use approximate queries vs exact queries? How does Count-Min Sketch work?" — Asked at Google, Facebook, Amazon for Data Infrastructure roles
ℹ️
Difficulty: Advanced | Companies: Google, Facebook, Amazon, Twitter, LinkedIn | Time: 60-75 minutes
HyperLogLog for Distinct Counting
HyperLogLog estimates cardinality with memory:
Where is the number of registers.
-- PostgreSQL doesn't have built-in HyperLogLog
-- Implement with extension or custom function
-- Create HLL extension (if available)
CREATE EXTENSION IF NOT EXISTS hyperloglog;
-- Create approximate distinct count function
CREATE OR REPLACE FUNCTION hll_add(hll_state BYTEA, value TEXT)
RETURNS BYTEA AS $$
DECLARE
hash_val BIGINT;
register_idx INT;
leading_zeros INT;
current_val INT;
BEGIN
-- Simple hash function
hash_val := hashtext(value);
register_idx := (hash_val & 1023) + 1; -- 1024 registers
-- Count leading zeros
leading_zeros := 0;
current_val := (hash_val >> 10) & 1023;
WHILE (current_val & 1) = 0 AND leading_zeros < 32 LOOP
leading_zeros := leading_zeros + 1;
current_val := current_val >> 1;
END LOOP;
-- Update register if larger
IF hll_state IS NULL THEN
hll_state := repeat('\0', 1024)::BYTEA;
END IF;
IF get_byte(hll_state, register_idx) < leading_zeros THEN
hll_state := set_byte(hll_state, register_idx, leading_zeros);
END IF;
RETURN hll_state;
END;
$$ LANGUAGE plpgsql;
-- Usage example
WITH hll_state AS (
SELECT
user_id,
hll_add(NULL, event_type) AS state
FROM events
GROUP BY user_id
)
SELECT
COUNT(DISTINCT user_id) AS exact_count,
-- Approximate count would use HLL state
COUNT(DISTINCT user_id) AS approximate_count
FROM events;
Count-Min Sketch
Count-Min Sketch estimates frequency with update:
-- Create Count-Min Sketch structure
CREATE TABLE count_min_sketch (
depth INT DEFAULT 4,
width INT DEFAULT 1024,
table_idx INT,
counter_idx INT,
counter BIGINT DEFAULT 0,
PRIMARY KEY (table_idx, counter_idx)
);
-- Initialize sketch
INSERT INTO count_min_sketch (table_idx, counter_idx, counter)
SELECT
d.i AS table_idx,
w.i AS counter_idx,
0 AS counter
FROM generate_series(1, 4) d(i),
generate_series(0, 1023) w(i);
-- Update function
CREATE OR REPLACE FUNCTION cms_update(item TEXT)
RETURNS VOID AS $$
DECLARE
hash_val BIGINT;
BEGIN
FOR i IN 1..4 LOOP
hash_val := hashtext(item || i::TEXT);
UPDATE count_min_sketch
SET counter = counter + 1
WHERE table_idx = i
AND counter_idx = (hash_val & 1023);
END LOOP;
END;
$$ LANGUAGE plpgsql;
-- Query function
CREATE OR REPLACE FUNCTION cms_estimate(item TEXT)
RETURNS BIGINT AS $$
DECLARE
min_val BIGINT := BIGINT_MAX;
hash_val BIGINT;
current_val BIGINT;
BEGIN
FOR i IN 1..4 LOOP
hash_val := hashtext(item || i::TEXT);
SELECT counter INTO current_val
FROM count_min_sketch
WHERE table_idx = i
AND counter_idx = (hash_val & 1023);
IF current_val < min_val THEN
min_val := current_val;
END IF;
END LOOP;
RETURN min_val;
END;
$$ LANGUAGE plpgsql;
Top-K Approximation
-- Space-Saving algorithm for Top-K
CREATE TABLE top_k_estimator (
item TEXT,
count BIGINT,
error BIGINT,
PRIMARY KEY (item)
);
CREATE OR REPLACE FUNCTION top_k_update(new_item TEXT, k INT)
RETURNS VOID AS $$
DECLARE
existing_count BIGINT;
min_count BIGINT;
min_item TEXT;
BEGIN
-- Check if item exists
SELECT count INTO existing_count
FROM top_k_estimator
WHERE item = new_item;
IF FOUND THEN
UPDATE top_k_estimator
SET count = count + 1
WHERE item = new_item;
ELSE
-- Check if we have room
IF (SELECT COUNT(*) FROM top_k_estimator) < k THEN
INSERT INTO top_k_estimator (item, count, error)
VALUES (new_item, 1, 0);
ELSE
-- Find minimum
SELECT item, count INTO min_item, min_count
FROM top_k_estimator
ORDER BY count ASC
LIMIT 1;
-- Replace if new item has higher count
IF 1 > min_count THEN
DELETE FROM top_k_estimator WHERE item = min_item;
INSERT INTO top_k_estimator (item, count, error)
VALUES (new_item, min_count + 1, min_count);
END IF;
END IF;
END IF;
END;
$$ LANGUAGE plpgsql;
-- Get Top-K items
SELECT
item,
count,
error,
count - error AS estimated_count
FROM top_k_estimator
ORDER BY count DESC
LIMIT 10;
Bloom Filter
-- Bloom filter for membership testing
CREATE TABLE bloom_filter (
filter_size INT DEFAULT 1024,
hash_functions INT DEFAULT 3,
bits BIT(1024) DEFAULT repeat('0', 1024)::BIT(1024)
);
CREATE OR REPLACE FUNCTION bloom_add(item TEXT)
RETURNS VOID AS $$
DECLARE
hash_val BIGINT;
pos INT;
BEGIN
FOR i IN 1..3 LOOP
hash_val := hashtext(item || i::TEXT);
pos := (hash_val & 1023) + 1;
UPDATE bloom_filter
SET bits = set_bit(bits, pos, 1);
END LOOP;
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE FUNCTION bloom_check(item TEXT)
RETURNS BOOLEAN AS $$
DECLARE
hash_val BIGINT;
pos INT;
result BOOLEAN := TRUE;
BEGIN
FOR i IN 1..3 LOOP
hash_val := hashtext(item || i::TEXT);
pos := (hash_val & 1023) + 1;
IF get_bit((SELECT bits FROM bloom_filter), pos) = 0 THEN
result := FALSE;
EXIT;
END IF;
END LOOP;
RETURN result;
END;
$$ LANGUAGE plpgsql;
Approximate Percentile
-- T-Digest for approximate percentiles
CREATE TABLE t_digest (
centroid_id SERIAL PRIMARY KEY,
mean DECIMAL(10,6),
count INT,
min_val DECIMAL(10,6),
max_val DECIMAL(10,6)
);
-- Q-Digest for quantile queries
CREATE TABLE q_digest (
node_id INT PRIMARY KEY,
count INT,
left_child INT,
right_child INT
);
Error Bounds
For HyperLogLog:
For Count-Min Sketch:
Where:
- = number of registers
- = error parameter
- = failure probability
- = total count
-- Calculate error bounds
WITH params AS (
SELECT
1024 AS m, -- HyperLogLog registers
0.01 AS epsilon, -- Count-Min error
0.001 AS delta -- Failure probability
)
SELECT
1.04 / SQRT(m) AS hll_error,
1.0 / (4.0 ^ m) AS hll_failure_prob,
epsilon AS cm_error_bound,
delta AS cm_failure_prob
FROM params;
Error Comparison:
| Algorithm | Memory | Error | Use Case |
|---|---|---|---|
| HyperLogLog | Distinct count | ||
| Count-Min | Frequency | ||
| Bloom Filter | Membership | ||
| Top-K | Variable | Frequent items |
ℹ️
When to Use: Approximate queries are ideal for real-time dashboards, monitoring, and big data analytics where exact answers aren't critical.
PostgreSQL Built-in Approximate Functions
-- Use pg_trgm for approximate matching
CREATE EXTENSION IF NOT EXISTS pg_trgm;
SELECT
word,
similarity(word, 'postgresql') AS sim
FROM pg_words
ORDER BY sim DESC
LIMIT 10;
-- Approximate string matching with Levenshtein
CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
SELECT
word,
levenshtein(word, 'postgresql') AS distance
FROM pg_words
ORDER BY distance
LIMIT 10;
Performance Comparison
| Operation | Exact | Approximate | Speedup |
|---|---|---|---|
| Distinct Count | 10-100x | ||
| Top-K | 5-50x | ||
| Percentile | 10-100x | ||
| Membership | 100-1000x |
⚠️
Accuracy Trade-off: Always document the error bounds. Use exact queries for financial or regulatory reporting.