πŸŽ‰ 75% of content is free forever β€” Unlock Premium from $10/mo β†’
CW
Search courses…
πŸ’Ό Servicesℹ️ Aboutβœ‰οΈ ContactView Pricing Plansfrom $10

Gap & Island Problems

Advanced SQLProblem Solving⭐ Premium

Advertisement

Gap & Island Problems

Difficulty: Hard | Companies: Google, Amazon, Meta, Netflix, Uber

Classic Gap Problem

-- Find missing dates in a date series
WITH date_range AS (
  SELECT generate_series(
    MIN(sale_date),
    MAX(sale_date),
    '1 day'::INTERVAL
  )::DATE AS expected_date
  FROM sales
)
SELECT
  dr.expected_date AS missing_date
FROM date_range dr
LEFT JOIN sales s ON dr.expected_date = s.sale_date
WHERE s.sale_date IS NULL
ORDER BY dr.expected_date;

ℹ️

Key Insight: The gap-island pattern uses ROW_NUMBER() to identify consecutive sequences. When you subtract the row number from the value, consecutive values produce the same result, creating "islands."

Island Detection with ROW_NUMBER

-- Find consecutive days of user activity
WITH numbered AS (
  SELECT
    user_id,
    activity_date,
    activity_date - ROW_NUMBER() OVER (
      PARTITION BY user_id ORDER BY activity_date
    )::INT AS island_id
  FROM user_activity
)
SELECT
  user_id,
  MIN(activity_date) AS streak_start,
  MAX(activity_date) AS streak_end,
  COUNT(*) AS streak_length,
  MAX(activity_date) - MIN(activity_date) + 1 AS days_span
FROM numbered
GROUP BY user_id, island_id
HAVING COUNT(*) >= 3  -- At least 3 consecutive days
ORDER BY user_id, streak_start;

Gap Detection in Sequential Data

-- Find gaps in order sequences
WITH numbered_orders AS (
  SELECT
    order_id,
    order_date,
    LAG(order_date) OVER (ORDER BY order_date) AS prev_date
  FROM orders
)
SELECT
  prev_date AS gap_end,
  order_date AS gap_start,
  order_date - prev_date - 1 AS gap_days
FROM numbered_orders
WHERE order_date - prev_date > 1
ORDER BY prev_date;

Consecutive Value Islands

-- Find consecutive identical values
WITH numbered AS (
  SELECT
    product_id,
    sale_date,
    revenue,
    sale_date - ROW_NUMBER() OVER (
      PARTITION BY product_id ORDER BY sale_date
    )::INT AS island_id
  FROM daily_sales
)
SELECT
  product_id,
  MIN(sale_date) AS period_start,
  MAX(sale_date) AS period_end,
  revenue AS daily_revenue,
  COUNT(*) AS consecutive_days,
  SUM(revenue) AS total_revenue
FROM numbered
GROUP BY product_id, island_id, revenue
HAVING COUNT(*) >= 5
ORDER BY product_id, period_start;

⚠️

Common Mistake: When working with dates, remember that subtracting ROW_NUMBER from a DATE produces an INTEGER, not a DATE. Cast appropriately when using this technique.

Multi-Column Gap Detection

-- Find gaps across multiple dimensions
WITH numbered AS (
  SELECT
    department_id,
    employee_id,
    hire_date,
    hire_date - ROW_NUMBER() OVER (
      PARTITION BY department_id, employee_id
      ORDER BY hire_date
    )::INT AS island_id
  FROM employee_history
)
SELECT
  department_id,
  employee_id,
  MIN(hire_date) AS period_start,
  MAX(hire_date) AS period_end,
  COUNT(*) AS consecutive_periods
FROM numbered
GROUP BY department_id, employee_id, island_id
HAVING COUNT(*) >= 2;

Range-Based Islands

-- Group overlapping or adjacent ranges
WITH ordered_ranges AS (
  SELECT
    room_id,
    start_time,
    end_time,
    ROW_NUMBER() OVER (ORDER BY start_time) AS rn
  FROM bookings
),
merged AS (
  SELECT
    room_id,
    start_time,
    end_time,
    start_time AS group_start,
    end_time AS group_end
  FROM ordered_ranges
  WHERE rn = 1

  UNION ALL

  SELECT
    m.room_id,
    o.start_time,
    o.end_time,
    CASE
      WHEN o.start_time <= m.group_end + INTERVAL '30 minutes'
      THEN m.group_start
      ELSE o.start_time
    END,
    CASE
      WHEN o.end_time > m.group_end
      THEN o.end_time
      ELSE m.group_end
    END
  FROM merged m
  INNER JOIN ordered_ranges o
    ON o.room_id = m.room_id
    AND o.rn = (
      SELECT MIN(rn) FROM ordered_ranges
      WHERE room_id = m.room_id AND rn > (
        SELECT MAX(rn) FROM ordered_ranges
        WHERE room_id = m.room_id AND start_time <= m.group_end
      )
    )
)
SELECT
  room_id,
  MIN(group_start) AS block_start,
  MAX(group_end) AS block_end,
  MAX(group_end) - MIN(group_start) AS total_duration
FROM merged
GROUP BY room_id;

Missing Consecutive Numbers

-- Find missing consecutive IDs (like missing seats)
WITH numbered AS (
  SELECT
    seat_number,
    seat_number - ROW_NUMBER() OVER (ORDER BY seat_number) AS gap_id
  FROM occupied_seats
)
SELECT
  MIN(seat_number) + 1 AS gap_start,
  MAX(seat_number) - 1 AS gap_end,
  MAX(seat_number) - MIN(seat_number) - 1 AS gap_size
FROM numbered
GROUP BY gap_id
HAVING MAX(seat_number) - MIN(seat_number) > 1;

Temporal Gap Detection

-- Find gaps in time series data
WITH time_gaps AS (
  SELECT
    sensor_id,
    reading_time,
    LAG(reading_time) OVER (
      PARTITION BY sensor_id ORDER BY reading_time
    ) AS prev_reading,
    reading_time - LAG(reading_time) OVER (
      PARTITION BY sensor_id ORDER BY reading_time
    ) AS time_diff
  FROM sensor_readings
)
SELECT
  sensor_id,
  prev_reading AS gap_start,
  reading_time AS gap_end,
  time_diff AS gap_duration
FROM time_gaps
WHERE time_diff > INTERVAL '5 minutes'
ORDER BY sensor_id, gap_start;

Complex Gap Filling

-- Fill gaps with interpolated values
WITH RECURSIVE date_series AS (
  SELECT
    MIN(sale_date) AS date,
    MAX(sale_date) AS end_date
  FROM sales
  WHERE product_id = 1

  UNION ALL

  SELECT
    date + 1,
    end_date
  FROM date_series
  WHERE date < end_date
)
SELECT
  ds.date,
  COALESCE(s.revenue, 0) AS actual_revenue,
  -- Linear interpolation
  CASE
    WHEN s.revenue IS NULL THEN
      (SELECT AVG(revenue)
       FROM sales
       WHERE product_id = 1
         AND sale_date BETWEEN ds.date - 7 AND ds.date + 7)
    ELSE s.revenue
  END AS interpolated_revenue
FROM date_series ds
LEFT JOIN sales s ON ds.date = s.sale_date AND s.product_id = 1
ORDER BY ds.date;

Gap Analysis with Statistics

-- Analyze gap patterns
WITH gaps AS (
  SELECT
    user_id,
    activity_date,
    LAG(activity_date) OVER (
      PARTITION BY user_id ORDER BY activity_date
    ) AS prev_activity
  FROM user_activity
)
SELECT
  user_id,
  COUNT(*) AS total_gaps,
  AVG(activity_date - prev_activity) AS avg_gap_days,
  MAX(activity_date - prev_activity) AS max_gap_days,
  MIN(activity_date - prev_activity) AS min_gap_days,
  STDDEV(activity_date - prev_activity) AS stddev_gap_days
FROM gaps
WHERE prev_activity IS NOT NULL
  AND activity_date - prev_activity > 1
GROUP BY user_id
HAVING COUNT(*) > 5
ORDER BY avg_gap_days DESC;

ℹ️

Optimization Tip: For large datasets, create an index on the date column and consider using materialized views for frequently accessed gap analysis queries.

Follow-Up Questions

  1. How would you find gaps longer than a specific duration?
  2. What's the most efficient way to fill gaps in a time series?
  3. How do you handle gaps across multiple columns simultaneously?
  4. Explain how to detect overlapping ranges and merge them.
  5. How would you implement a sliding window gap detection?
  6. What's the best approach for gap analysis in partitioned tables?

Advertisement