πŸŽ‰ 75% of content is free forever β€” Unlock Premium from $10/mo β†’
CW
Search courses…
πŸ’Ό Servicesℹ️ Aboutβœ‰οΈ ContactView Pricing Plansfrom $10

Advanced Analytic Functions

Advanced SQLAnalytics⭐ Premium

Advertisement

Advanced Analytic Functions

Difficulty: Hard | Companies: Google, Amazon, Meta, Netflix, Uber

Statistical Aggregate Functions

-- Advanced statistical calculations
SELECT
  department_id,
  COUNT(*) AS n,
  AVG(salary) AS mean,
  STDDEV_POP(salary) AS std_dev_population,
  STDDEV_SAMP(salary) AS std_dev_sample,
  VAR_POP(salary) AS variance_population,
  VAR_SAMP(salary) AS variance_sample,
  CORR(salary, years_experience) AS correlation,
  REGR_SLOPE(salary, years_experience) AS regression_slope,
  REGR_INTERCEPT(salary, years_experience) AS regression_intercept,
  REGR_R2(salary, years_experience) AS r_squared
FROM employees
GROUP BY department_id;

ℹ️

Key Insight: Use _POP for population statistics and _SAMP for sample statistics. CORR() returns the Pearson correlation coefficient (-1 to 1). REGR_SLOPE and REGR_INTERCEPT compute linear regression coefficients.

Linear Regression Analysis

-- Perform linear regression per department
WITH regression_stats AS (
  SELECT
    department_id,
    REGR_SLOPE(salary, years_experience) AS slope,
    REGR_INTERCEPT(salary, years_experience) AS intercept,
    REGR_R2(salary, years_experience) AS r_squared
  FROM employees
  GROUP BY department_id
)
SELECT
  e.employee_id,
  e.department_id,
  e.salary,
  e.years_experience,
  r.slope * e.years_experience + r.intercept AS predicted_salary,
  e.salary - (r.slope * e.years_experience + r.intercept) AS residual,
  r.r_squared
FROM employees e
INNER JOIN regression_stats r ON e.department_id = r.department_id;

Percentile and Median Calculations

-- Compute percentiles with different methods
SELECT
  department_id,
  PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY salary) AS median_cont,
  PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY salary) AS median_disc,
  PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY salary) AS q1,
  PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY salary) AS q3,
  PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY salary) AS p95,
  PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY salary) AS p99
FROM employees
GROUP BY department_id;

Moving Aggregations

-- Complex moving window calculations
SELECT
  sale_date,
  revenue,
  -- 7-day moving average
  AVG(revenue) OVER w7 AS ma_7day,
  -- 30-day moving sum
  SUM(revenue) OVER w30 AS sum_30day,
  -- Moving standard deviation
  STDDEV(revenue) OVER w7 AS stddev_7day,
  -- Moving correlation
  CORR(revenue, units_sold) OVER w7 AS corr_7day,
  -- Moving linear regression
  REGR_SLOPE(revenue, units_sold) OVER w7 AS slope_7day
FROM daily_sales
WINDOW
  w7 AS (ORDER BY sale_date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW),
  w30 AS (ORDER BY sale_date ROWS BETWEEN 29 PRECEDING AND CURRENT ROW);

⚠️

Performance Note: Named window clauses (WINDOW clause) improve readability and ensure consistency. Define them once and reference throughout the query.

Ratio Calculations

-- Compute various ratios
SELECT
  department_id,
  employee_id,
  salary,
  -- Ratio to department average
  salary / AVG(salary) OVER (PARTITION BY department_id) AS ratio_to_avg,
  -- Ratio to max
  salary / MAX(salary) OVER (PARTITION BY department_id) AS ratio_to_max,
  -- Percentage of department total
  salary / SUM(salary) OVER (PARTITION BY department_id) * 100 AS pct_of_dept_total,
  -- Running ratio
  salary / SUM(salary) OVER (
    PARTITION BY department_id
    ORDER BY salary DESC
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
  ) * 100 AS cumulative_pct
FROM employees;

Statistical Outlier Detection

-- Identify outliers using Z-score
WITH stats AS (
  SELECT
    department_id,
    AVG(salary) AS mean_salary,
    STDDEV(salary) AS stddev_salary
  FROM employees
  GROUP BY department_id
)
SELECT
  e.employee_id,
  e.department_id,
  e.salary,
  (e.salary - s.mean_salary) / NULLIF(s.stddev_salary, 0) AS z_score,
  CASE
    WHEN ABS((e.salary - s.mean_salary) / NULLIF(s.stddev_salary, 0)) > 3
      THEN 'OUTLIER'
    WHEN ABS((e.salary - s.mean_salary) / NULLIF(s.stddev_salary, 0)) > 2
      THEN 'FAR'
    ELSE 'NORMAL'
  END AS salary_classification
FROM employees e
INNER JOIN stats s ON e.department_id = s.department_id;

BigQuery Analytic Functions

-- BigQuery-specific analytic functions
SELECT
  user_id,
  event_date,
  -- Approximate count distinct
  APPROX_COUNT_DISTINCT(session_id) OVER w AS approx_sessions,
  -- Top values
  TOP_COUNT(event_type, 5) OVER w AS top_events,
  -- Quantiles
  QUANTILES(revenue, 10) OVER w AS deciles,
  -- Correlation
  CORR(revenue, session_duration) OVER w AS corr_rev_duration
FROM events
WINDOW w AS (PARTITION BY user_id ORDER BY event_date);

Frequency Distribution

-- Create frequency distribution
SELECT
  WIDTH_BUCKET(salary, 30000, 200000, 10) AS salary_bucket,
  COUNT(*) AS frequency,
  ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 1) AS pct,
  SUM(COUNT(*)) OVER (ORDER BY WIDTH_BUCKET(salary, 30000, 200000, 10)
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_freq
FROM employees
GROUP BY WIDTH_BUCKET(salary, 30000, 200000, 10)
ORDER BY salary_bucket;

Window Function with FILTER

-- Conditional analytic calculations
SELECT
  department_id,
  employee_id,
  salary,
  -- Average salary (active employees only)
  AVG(salary) FILTER (WHERE status = 'active')
    OVER (PARTITION BY department_id) AS avg_active_salary,
  -- Percent rank among senior employees
  PERCENT_RANK() OVER (
    PARTITION BY department_id
    ORDER BY salary
    FILTER (WHERE years_experience > 5)
  ) AS senior_percent_rank,
  -- Count of high performers
  COUNT(*) FILTER (WHERE performance_rating >= 4)
    OVER (PARTITION BY department_id) AS high_performer_count
FROM employees;

Covariance and Correlation Matrix

-- Compute correlation matrix for multiple variables
SELECT
  'salary' AS var1,
  'years_experience' AS var2,
  CORR(salary, years_experience) AS correlation
FROM employees
UNION ALL
SELECT
  'salary',
  'performance_rating',
  CORR(salary, performance_rating)
FROM employees
UNION ALL
SELECT
  'years_experience',
  'performance_rating',
  CORR(years_experience, performance_rating)
FROM employees;

Follow-Up Questions

  1. What's the difference between PERCENTILE_CONT and PERCENTILE_DISC?
  2. How would you compute a weighted moving average using window functions?
  3. Explain the assumptions behind REGR_SLOPE and when they might be violated.
  4. How do you handle NULL values in statistical aggregate functions?
  5. What's the best approach for computing rolling percentiles?
  6. How would you implement a Z-score normalization across partitioned data?

Advertisement