ποΈ Gaps and Islands
Google & Netflix Interview Deep Dive
π Interview Question
βΉοΈπ΄ Google/Netflix Interview Question
"Given a server logs table with login timestamps, find: 1) Consecutive login streaks per user, 2) The longest gap between logins, 3) Users who logged in every day for the past 30 days, 4) Periods of consecutive server downtime."
Companies: Google, Netflix | Difficulty: Hard | Time: 40 minutes
π Setup: Server Logs
CREATE TABLE user_logins (
login_id SERIAL PRIMARY KEY,
user_id INT,
login_date DATE,
login_timestamp TIMESTAMP
);
-- Insert sample data with gaps
INSERT INTO user_logins (user_id, login_date, login_timestamp) VALUES
-- User 1: Streak of 5 days, gap, streak of 3 days
(1, '2024-01-01', '2024-01-01 09:00:00'),
(1, '2024-01-02', '2024-01-02 08:30:00'),
(1, '2024-01-03', '2024-01-03 09:15:00'),
(1, '2024-01-04', '2024-01-04 08:45:00'),
(1, '2024-01-05', '2024-01-05 09:00:00'),
(1, '2024-01-08', '2024-01-08 09:00:00'), -- Gap: Jan 6-7
(1, '2024-01-09', '2024-01-09 08:30:00'),
(1, '2024-01-10', '2024-01-10 09:00:00'),
-- User 2: Irregular pattern
(2, '2024-01-01', '2024-01-01 10:00:00'),
(2, '2024-01-03', '2024-01-03 11:00:00'),
(2, '2024-01-05', '2024-01-05 10:30:00'),
(2, '2024-01-06', '2024-01-06 10:00:00'),
(2, '2024-01-07', '2024-01-07 10:00:00'),
(2, '2024-01-10', '2024-01-10 10:00:00'),
-- User 3: Perfect streak
(3, '2024-01-01', '2024-01-01 08:00:00'),
(3, '2024-01-02', '2024-01-02 08:00:00'),
(3, '2024-01-03', '2024-01-03 08:00:00'),
(3, '2024-01-04', '2024-01-04 08:00:00'),
(3, '2024-01-05', '2024-01-05 08:00:00'),
(3, '2024-01-06', '2024-01-06 08:00:00'),
(3, '2024-01-07', '2024-01-07 08:00:00'),
(3, '2024-01-08', '2024-01-08 08:00:00'),
(3, '2024-01-09', '2024-01-09 08:00:00'),
(3, '2024-01-10', '2024-01-10 08:00:00');
ποΈ Part 1: Finding Consecutive Sequences (Islands)
βΉοΈπ Gaps and Islands Pattern
The gaps and islands problem finds consecutive sequences in data. The key insight: subtract a row_number from the date/value to create groups of consecutive values.
Consecutive Login Streaks
-- Find consecutive login streaks per user
WITH numbered_logins AS (
SELECT
user_id,
login_date,
ROW_NUMBER() OVER (
PARTITION BY user_id
ORDER BY login_date
) AS rn
FROM (
SELECT DISTINCT user_id, login_date
FROM user_logins
) distinct_logins
),
streak_groups AS (
SELECT
user_id,
login_date,
-- Key trick: date - row_number = constant for consecutive dates
login_date - rn * INTERVAL '1 day' AS streak_group
FROM numbered_logins
)
SELECT
user_id,
MIN(login_date) AS streak_start,
MAX(login_date) AS streak_end,
COUNT(*) AS streak_length,
MAX(login_date) - MIN(login_date) + 1 AS days_spanned
FROM streak_groups
GROUP BY user_id, streak_group
ORDER BY user_id, streak_start;
Output:
| user_id | streak_start | streak_end | streak_length | days_spanned |
|---|---|---|---|---|
| 1 | 2024-01-01 | 2024-01-05 | 5 | 5 |
| 1 | 2024-01-08 | 2024-01-10 | 3 | 3 |
| 2 | 2024-01-01 | 2024-01-01 | 1 | 1 |
| 2 | 2024-01-03 | 2024-01-03 | 1 | 1 |
| 2 | 2024-01-05 | 2024-01-07 | 3 | 3 |
| 2 | 2024-01-10 | 2024-01-10 | 1 | 1 |
| 3 | 2024-01-01 | 2024-01-10 | 10 | 10 |
Longest Streak Per User
-- Find the longest consecutive login streak per user
WITH numbered_logins AS (
SELECT
user_id,
login_date,
ROW_NUMBER() OVER (
PARTITION BY user_id
ORDER BY login_date
) AS rn
FROM (
SELECT DISTINCT user_id, login_date
FROM user_logins
) distinct_logins
),
streak_groups AS (
SELECT
user_id,
login_date,
login_date - rn * INTERVAL '1 day' AS streak_group
FROM numbered_logins
),
streak_lengths AS (
SELECT
user_id,
streak_group,
MIN(login_date) AS streak_start,
MAX(login_date) AS streak_end,
COUNT(*) AS streak_length,
ROW_NUMBER() OVER (
PARTITION BY user_id
ORDER BY COUNT(*) DESC
) AS streak_rank
FROM streak_groups
GROUP BY user_id, streak_group
)
SELECT
user_id,
streak_start,
streak_end,
streak_length
FROM streak_lengths
WHERE streak_rank = 1
ORDER BY streak_length DESC;
π Part 2: Finding Gaps Between Events
-- Calculate gaps between consecutive logins
WITH login_gaps AS (
SELECT
user_id,
login_date,
LAG(login_date) OVER (
PARTITION BY user_id
ORDER BY login_date
) AS prev_login_date,
login_date - LAG(login_date) OVER (
PARTITION BY user_id
ORDER BY login_date
) AS days_since_last_login
FROM (
SELECT DISTINCT user_id, login_date
FROM user_logins
) distinct_logins
)
SELECT
user_id,
login_date,
prev_login_date,
days_since_last_login,
CASE
WHEN days_since_last_login = 1 THEN 'Consecutive'
WHEN days_since_last_login = 2 THEN '1 Day Gap'
WHEN days_since_last_login <= 7 THEN 'Short Gap'
ELSE 'Long Gap'
END AS gap_category
FROM login_gaps
WHERE prev_login_date IS NOT NULL
ORDER BY user_id, login_date;
Longest Gap Per User
-- Find the longest gap between logins
WITH login_gaps AS (
SELECT
user_id,
login_date,
login_date - LAG(login_date) OVER (
PARTITION BY user_id ORDER BY login_date
) AS gap_days
FROM (
SELECT DISTINCT user_id, login_date FROM user_logins
) dl
),
ranked_gaps AS (
SELECT
user_id,
gap_days,
LAG(login_date) OVER (
PARTITION BY user_id ORDER BY login_date
) AS gap_start,
login_date AS gap_end,
ROW_NUMBER() OVER (
PARTITION BY user_id
ORDER BY gap_days DESC NULLS LAST
) AS gap_rank
FROM login_gaps
)
SELECT
user_id,
gap_start,
gap_end,
gap_days
FROM ranked_gaps
WHERE gap_rank = 1
ORDER BY gap_days DESC;
π Part 3: Daily Consecutive Login Requirement
-- Find users who logged in every day for the past 30 days
WITH date_range AS (
SELECT generate_series(
CURRENT_DATE - INTERVAL '30 days',
CURRENT_DATE,
'1 day'::interval
)::date AS date
),
user_days AS (
SELECT DISTINCT user_id, login_date
FROM user_logins
WHERE login_date >= CURRENT_DATE - INTERVAL '30 days'
),
expected_days AS (
SELECT
u.user_id,
d.date AS expected_date
FROM (SELECT DISTINCT user_id FROM user_logins) u
CROSS JOIN date_range d
),
missing_days AS (
SELECT
ed.user_id,
ed.expected_date
FROM expected_days ed
LEFT JOIN user_days ud
ON ed.user_id = ud.user_id
AND ed.expected_date = ud.login_date
WHERE ud.user_id IS NULL
)
SELECT
user_id,
COUNT(*) AS missing_days,
CASE
WHEN COUNT(*) = 0 THEN 'Perfect Streak'
WHEN COUNT(*) <= 2 THEN 'Near Perfect'
ELSE 'Broken Streak'
END AS streak_status
FROM missing_days
GROUP BY user_id
HAVING COUNT(*) = 0 -- Only perfect streaks
ORDER BY user_id;
π Part 4: Running Totals and Cumulative Calculations
-- Running total of logins per user
WITH daily_logins AS (
SELECT
user_id,
login_date,
COUNT(*) AS login_count
FROM user_logins
GROUP BY user_id, login_date
)
SELECT
user_id,
login_date,
login_count,
SUM(login_count) OVER (
PARTITION BY user_id
ORDER BY login_date
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) AS running_total,
AVG(login_count) OVER (
PARTITION BY user_id
ORDER BY login_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) AS rolling_7day_avg
FROM daily_logins
ORDER BY user_id, login_date;
Cumulative Distribution
-- Calculate percentile rank of login frequency
WITH user_frequency AS (
SELECT
user_id,
COUNT(DISTINCT login_date) AS total_logins
FROM user_logins
GROUP BY user_id
)
SELECT
user_id,
total_logins,
PERCENT_RANK() OVER (ORDER BY total_logins) AS percentile_rank,
NTILE(10) OVER (ORDER BY total_logins) AS frequency_decile,
CASE
WHEN PERCENT_RANK() OVER (ORDER BY total_logins) >= 0.9 THEN 'Power User'
WHEN PERCENT_RANK() OVER (ORDER BY total_logins) >= 0.75 THEN 'Frequent'
WHEN PERCENT_RANK() OVER (ORDER BY total_logins) >= 0.5 THEN 'Regular'
ELSE 'Occasional'
END AS user_tier
FROM user_frequency
ORDER BY total_logins DESC;
π Part 5: Server Downtime Analysis
-- Create server status table
CREATE TABLE server_status (
check_id SERIAL PRIMARY KEY,
server_id INT,
check_time TIMESTAMP,
status VARCHAR(20) -- 'up', 'down', 'degraded'
);
-- Insert status checks
INSERT INTO server_status (server_id, check_time, status) VALUES
(1, '2024-01-01 00:00:00', 'up'),
(1, '2024-01-01 01:00:00', 'up'),
(1, '2024-01-01 02:00:00', 'down'),
(1, '2024-01-01 03:00:00', 'down'),
(1, '2024-01-01 04:00:00', 'down'),
(1, '2024-01-01 05:00:00', 'up'),
(1, '2024-01-01 06:00:00', 'up'),
(1, '2024-01-01 07:00:00', 'degraded'),
(1, '2024-01-01 08:00:00', 'down'),
(1, '2024-01-01 09:00:00', 'down'),
(1, '2024-01-01 10:00:00', 'up');
-- Find consecutive downtime periods
WITH numbered_checks AS (
SELECT
server_id,
check_time,
status,
ROW_NUMBER() OVER (
PARTITION BY server_id, status
ORDER BY check_time
) AS rn
FROM server_status
),
status_groups AS (
SELECT
server_id,
check_time,
status,
check_time - rn * INTERVAL '1 hour' AS status_group
FROM numbered_checks
)
SELECT
server_id,
status,
MIN(check_time) AS period_start,
MAX(check_time) AS period_end,
COUNT(*) AS consecutive_checks,
MAX(check_time) - MIN(check_time) AS duration
FROM status_groups
WHERE status IN ('down', 'degraded')
GROUP BY server_id, status, status_group
ORDER BY server_id, period_start;
π Part 6: Advanced Patterns
Identifying Missing Dates
-- Find missing dates in a sequence
WITH date_series AS (
SELECT generate_series(
MIN(login_date),
MAX(login_date),
'1 day'::interval
)::date AS expected_date
FROM user_logins
WHERE user_id = 1
),
actual_dates AS (
SELECT DISTINCT login_date
FROM user_logins
WHERE user_id = 1
)
SELECT
ds.expected_date,
ds.expected_date - LAG(ds.expected_date) OVER (ORDER BY ds.expected_date) AS gap_days
FROM date_series ds
LEFT JOIN actual_dates ad ON ds.expected_date = ad.login_date
WHERE ad.login_date IS NULL
ORDER BY ds.expected_date;
Consecutive Events Detection
-- Detect 3+ consecutive logins (anomaly detection)
WITH numbered_logins AS (
SELECT
user_id,
login_date,
ROW_NUMBER() OVER (
PARTITION BY user_id ORDER BY login_date
) AS rn
FROM (SELECT DISTINCT user_id, login_date FROM user_logins) dl
),
streak_groups AS (
SELECT
user_id,
login_date,
login_date - rn * INTERVAL '1 day' AS streak_group
FROM numbered_logins
),
streak_info AS (
SELECT
user_id,
streak_group,
MIN(login_date) AS streak_start,
MAX(login_date) AS streak_end,
COUNT(*) AS streak_length
FROM streak_groups
GROUP BY user_id, streak_group
)
SELECT
user_id,
streak_start,
streak_end,
streak_length,
CASE
WHEN streak_length >= 7 THEN 'Very Long Streak'
WHEN streak_length >= 3 THEN 'Long Streak'
ELSE 'Normal'
END AS streak_category
FROM streak_info
WHERE streak_length >= 3
ORDER BY streak_length DESC;
π― Quiz Section
π Best Practices for Interviews
π‘β Gaps and Islands Best Practices
1. Handle Date Granularity:
-- For timestamps, truncate to appropriate granularity
DATE_TRUNC('day', login_timestamp) AS login_date
-- For time gaps, consider appropriate intervals
login_timestamp - LAG(login_timestamp) OVER (...) AS time_gap
2. Consider Edge Cases:
-- Handle NULLs
COALESCE(prev_date, current_date) AS adjusted_prev_date
-- Handle single-row groups
CASE WHEN COUNT(*) OVER (...) = 1 THEN 'Single Event' ELSE 'Group' END
3. Use CTEs for Clarity:
-- Break into logical steps
WITH
numbered AS (...), -- Add row numbers
grouped AS (...), -- Create groups
aggregated AS (...) -- Calculate results
SELECT * FROM aggregated;
4. Verify with Sample Data:
-- Always test with known patterns
INSERT INTO test_data VALUES (1, '2024-01-01');
INSERT INTO test_data VALUES (1, '2024-01-02');
INSERT INTO test_data VALUES (1, '2024-01-03');
-- Expect: 1 streak of 3 days
5. Consider Performance:
-- Index columns used in PARTITION BY and ORDER BY
CREATE INDEX idx_logins_user_date ON user_logins(user_id, login_date);
β οΈβ οΈ Common Pitfalls
- Duplicate values: Ensure DISTINCT before applying row_number
- Date arithmetic: Use appropriate interval types
- Time zones: Be consistent with date/timestamp handling
- Empty results: Handle cases with no consecutive values