Advanced A/B Testing
Standard A/B testing has limitations: fixed sample sizes, delayed results, and wasted traffic on losing variants. Advanced methods like sequential testing, bandits, and CUPED address these issues, making experimentation faster and more efficient.
A/B Testing Statistical Framework
Why Advanced A/B Testing Matters
Companies run hundreds of experiments simultaneously. Sequential testing lets you stop early when results are clear. Multi-armed bandits dynamically allocate traffic to winners. CUPED reduces variance, requiring smaller sample sizes.
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import norm, ttest_ind
import warnings
warnings.filterwarnings('ignore')
Sequential Testing
Test continuously without inflating error rates.
class SequentialTest:
    """Sequential probability ratio test (SPRT) for paired 0/1 outcomes.

    Tests H0: p_treatment == p_control against
    H1: p_treatment == p_control + mde. The control rate is not known in
    advance, so it is estimated from the control observations seen so far
    (a plug-in approximation to Wald's SPRT).

    Args:
        alpha: Target Type I error rate.
        beta: Target Type II error rate (1 - power).
        mde: Minimum detectable effect, as an absolute difference in rates.
    """

    def __init__(self, alpha=0.05, beta=0.2, mde=0.02):
        self.alpha = alpha
        self.beta = beta
        self.mde = mde
        self.log_likelihood_ratio = 0
        self.n_control = 0
        self.n_treatment = 0
        # Running success totals needed to evaluate the likelihood ratio.
        self._control_successes = 0.0
        self._treatment_successes = 0.0

    def update(self, control_value, treatment_value):
        """Record one control and one treatment outcome and refresh the LLR.

        Args:
            control_value: Outcome for the control unit (0 or 1).
            treatment_value: Outcome for the treatment unit (0 or 1).
        """
        self.n_control += 1
        self.n_treatment += 1
        self._control_successes += control_value
        self._treatment_successes += treatment_value

        # Add-one smoothing keeps the baseline strictly inside (0, 1), and the
        # clipping keeps both hypothesised rates away from log(0).
        eps = 1e-12
        p0 = (self._control_successes + 1) / (self.n_control + 2)
        p0 = min(max(p0, eps), 1 - eps)
        p1 = min(max(p0 + self.mde, eps), 1 - eps)

        successes = self._treatment_successes
        failures = self.n_treatment - successes
        # Bernoulli log-likelihood ratio of the treatment data under H1 vs H0,
        # recomputed from the totals so it always uses the latest baseline.
        self.log_likelihood_ratio = (
            successes * np.log(p1 / p0)
            + failures * np.log((1 - p1) / (1 - p0))
        )

    def get_decision(self):
        """Compare the LLR against Wald's stopping boundaries.

        Returns:
            A ``(decision, llr)`` tuple where decision is
            ``'accept_treatment'``, ``'accept_control'`` or ``'continue'``.
        """
        upper_bound = np.log((1 - self.beta) / self.alpha)
        lower_bound = np.log(self.beta / (1 - self.alpha))
        if self.log_likelihood_ratio >= upper_bound:
            return 'accept_treatment', self.log_likelihood_ratio
        if self.log_likelihood_ratio <= lower_bound:
            return 'accept_control', self.log_likelihood_ratio
        return 'continue', self.log_likelihood_ratio
# Feed a simulated stream of paired conversions into the SPRT and report
# the first observation at which it reaches a decision.
sprt = SequentialTest(alpha=0.05, beta=0.2)
np.random.seed(42)
for i in range(1000):
    control = np.random.binomial(1, 0.10)    # 10% conversion
    treatment = np.random.binomial(1, 0.12)  # 12% conversion (true effect)
    sprt.update(control, treatment)
    decision, llr = sprt.get_decision()
    if decision == 'continue':
        continue
    print(f"Stopped at observation {i+1}: {decision} (LLR={llr:.2f})")
    break
Group Sequential Methods
Group sequential tests allow early stopping while controlling overall Type I error. The O'Brien-Fleming boundaries use testing thresholds that start out very strict and relax as data accumulates:
$$z_k = z_{\alpha/2}\,\sqrt{K / k}$$
where $K$ is the total number of looks, $k$ is the current look, and $z_{\alpha/2}$ is the standard normal critical value. Early boundaries are more conservative to prevent false positives.
class GroupSequentialTest:
    """Group sequential test with O'Brien-Fleming-type stopping boundaries.

    Args:
        n_groups: Total number of planned interim analyses (looks).
        alpha: Overall two-sided significance level.
    """

    def __init__(self, n_groups=5, alpha=0.05):
        self.n_groups = n_groups
        self.alpha = alpha
        self.boundaries = self._compute_boundaries()
        self.results = []

    def _compute_boundaries(self):
        """Compute O'Brien-Fleming-like boundaries.

        Uses z_k = z_{alpha/2} * sqrt(K / k), so early looks need far stronger
        evidence and the final look uses the ordinary critical value. This is
        the usual closed-form approximation; the exact constant is slightly
        larger than z_{alpha/2}.

        Returns:
            A list with one critical z-value per look, in look order.
        """
        z_final = norm.ppf(1 - self.alpha / 2)
        return [
            float(z_final * np.sqrt(self.n_groups / look))
            for look in range(1, self.n_groups + 1)
        ]

    def test_at_boundary(self, group_num, z_stat):
        """Test if we should stop at this group.

        Args:
            group_num: 1-based index of the current look.
            z_stat: Observed z-statistic at this look.

        Returns:
            ``('stop', 'reject_h0')`` when |z_stat| reaches the boundary,
            otherwise ``('continue', None)``.

        Raises:
            IndexError: If group_num is outside 1..n_groups.
        """
        # Reject 0 and negatives explicitly; they would otherwise wrap around
        # to a later look's boundary through negative indexing.
        if not 1 <= group_num <= len(self.boundaries):
            raise IndexError(f"group_num must be in 1..{len(self.boundaries)}")
        boundary = self.boundaries[group_num - 1]
        if abs(z_stat) >= boundary:
            return 'stop', 'reject_h0'
        return 'continue', None
# Simulate five interim looks and report the decision at each one.
gst = GroupSequentialTest(n_groups=5, alpha=0.05)
for group in range(1, 6):
    # Simulate z-statistic
    z_stat = np.random.normal(2.0, 0.5)
    decision, result = gst.test_at_boundary(group, z_stat)
    print(f"Group {group}: z={z_stat:.2f}, boundary={gst.boundaries[group-1]:.2f}, decision={decision}")
Multi-Armed Bandits for A/B/n Testing
class EpsilonGreedyBandit:
    """Adaptive traffic allocator that explores with probability epsilon.

    Args:
        n_variants: Number of variants (arms) under test.
        epsilon: Probability of picking a variant uniformly at random
            instead of the current best estimate.
    """

    def __init__(self, n_variants, epsilon=0.1):
        self.n_variants = n_variants
        self.epsilon = epsilon
        # Per-variant observation counts and running mean rewards.
        self.counts = np.zeros(n_variants)
        self.values = np.zeros(n_variants)

    def select_variant(self):
        """Return the index of the variant to serve next."""
        exploring = np.random.random() < self.epsilon
        if not exploring:
            # Exploit: serve the variant with the highest estimated reward.
            return np.argmax(self.values)
        return np.random.randint(self.n_variants)

    def update(self, variant, reward):
        """Fold one observed reward into the variant's running mean."""
        self.counts[variant] += 1
        pulls = self.counts[variant]
        old_weight = (pulls - 1) / pulls
        self.values[variant] = old_weight * self.values[variant] + 1 / pulls * reward

    def get_allocation(self):
        """Return the share of traffic each variant has received so far.

        Before any observation the allocation is uniform.
        """
        served = self.counts.sum()
        if served != 0:
            return self.counts / served
        return np.ones(self.n_variants) / self.n_variants
# Thompson Sampling Bandit
class ThompsonSamplingBandit:
    """Beta-Bernoulli Thompson sampler for adaptive experimentation.

    Each variant keeps a Beta(alpha, beta) posterior over its success rate,
    starting from a uniform Beta(1, 1) prior.
    """

    def __init__(self, n_variants):
        self.n_variants = n_variants
        self.alpha = np.ones(n_variants)  # successes
        self.beta = np.ones(n_variants)  # failures

    def select_variant(self):
        """Draw one rate per variant from its posterior and serve the largest."""
        draws = np.random.beta(self.alpha, self.beta)
        return np.argmax(draws)

    def update(self, variant, reward):
        """Count the observation as a success when reward exceeds 0.5, else a failure."""
        posterior_param = self.alpha if reward > 0.5 else self.beta
        posterior_param[variant] += 1

    def get_probability_best(self, n_samples=10000):
        """Monte Carlo estimate of the probability that each variant is best.

        Args:
            n_samples: Number of joint posterior draws to use.

        Returns:
            Array of length n_variants holding each variant's win fraction.
        """
        shape = (n_samples, self.n_variants)
        draws = np.random.beta(self.alpha, self.beta, shape)
        winners = np.argmax(draws, axis=1)
        win_counts = np.bincount(winners, minlength=self.n_variants)
        return win_counts / n_samples
# Run Thompson sampling against three variants with known conversion rates,
# then report how confident the posterior is that each one is best.
bandit = ThompsonSamplingBandit(n_variants=3)
true_rates = [0.10, 0.12, 0.11]
for round_num in range(1000):
    variant = bandit.select_variant()
    reward = np.random.binomial(1, true_rates[variant])
    bandit.update(variant, reward)

print("Thompson Sampling results:")
probs = bandit.get_probability_best()
for i, rate in enumerate(true_rates):
    prob = probs[i]
    print(f" Variant {i}: true_rate={rate:.2f}, P(best)={prob:.3f}")
CUPED: Variance Reduction
Controlled-experiment Using Pre-Experiment Data reduces variance by controlling for pre-treatment behavior.
def cuped_adjustment(y_post, y_pre):
    """Apply CUPED variance reduction.

    Args:
        y_post: Outcome metric measured during the experiment.
        y_pre: The same units' pre-experiment covariate, aligned with y_post.

    Returns:
        A ``(y_adjusted, theta)`` tuple: the adjusted outcomes (same mean as
        y_post) and the coefficient theta = cov(y_post, y_pre) / var(y_pre).
    """
    y_post = np.asarray(y_post, dtype=float)
    y_pre = np.asarray(y_pre, dtype=float)

    # Fewer than two points cannot define a sample variance, so skip the adjustment.
    if y_pre.size < 2:
        return y_post.copy(), 0.0

    # Use the same ddof for both estimates: np.cov defaults to ddof=1 but
    # np.var defaults to ddof=0, which would inflate theta by n / (n - 1).
    var_pre = np.var(y_pre, ddof=1)
    if var_pre == 0:
        # A constant covariate carries no information, so leave outcomes unchanged.
        theta = 0.0
    else:
        theta = float(np.cov(y_post, y_pre)[0, 1] / var_pre)

    # Centering y_pre keeps the mean of the adjusted outcome equal to y_post's.
    y_adjusted = y_post - theta * (y_pre - np.mean(y_pre))
    return y_adjusted, theta
# Simulate an experiment whose outcome is correlated with a pre-period metric,
# then compare the raw and CUPED-adjusted analyses.
np.random.seed(42)
n = 10000
# Pre-treatment metric
y_pre = np.random.normal(100, 20, n)
# Treatment effect + noise correlated with pre
treatment = np.concatenate([np.zeros(n//2), np.ones(n//2)])
effect = 5
noise = np.random.normal(0, 10, n)
y_post = 50 + 0.5 * y_pre + effect * treatment + noise

# Each arm holds n/2 units, so the standard error of the difference in means
# must divide each arm's variance by its own size, not by the total n.
n_ctrl = int((treatment == 0).sum())
n_trt = int((treatment == 1).sum())

# Without CUPED
from scipy.stats import ttest_ind
t_stat_raw, p_raw = ttest_ind(y_post[treatment == 0], y_post[treatment == 1])
se_raw = np.sqrt(np.var(y_post[treatment == 0], ddof=1)/n_ctrl + np.var(y_post[treatment == 1], ddof=1)/n_trt)

# With CUPED
y_adjusted, theta = cuped_adjustment(y_post, y_pre)
t_stat_cuped, p_cuped = ttest_ind(y_adjusted[treatment == 0], y_adjusted[treatment == 1])
se_cuped = np.sqrt(np.var(y_adjusted[treatment == 0], ddof=1)/n_ctrl + np.var(y_adjusted[treatment == 1], ddof=1)/n_trt)

print(f"Without CUPED: effect={y_post[treatment==1].mean()-y_post[treatment==0].mean():.2f}, SE={se_raw:.2f}, p={p_raw:.4f}")
print(f"With CUPED: effect={y_adjusted[treatment==1].mean()-y_adjusted[treatment==0].mean():.2f}, SE={se_cuped:.2f}, p={p_cuped:.4f}")
print(f"Variance reduction: {1 - (se_cuped/se_raw)**2:.1%}")
print(f"Theta (regression coefficient): {theta:.3f}")
Sample Size Calculation
def sample_size_proportions(p1, mde, alpha=0.05, power=0.80):
    """Return the total sample size needed to compare two proportions.

    Args:
        p1: Baseline conversion rate.
        mde: Minimum detectable effect (absolute lift over p1).
        alpha: Two-sided significance level.
        power: Desired statistical power.

    Returns:
        Total number of units across both groups, rounded up.
    """
    p2 = p1 + mde
    crit_alpha = norm.ppf(1 - alpha/2)
    crit_power = norm.ppf(power)
    # Spread under the null (both arms at p1) and under the alternative.
    null_term = crit_alpha * np.sqrt(2 * p1 * (1-p1))
    alt_term = crit_power * np.sqrt(p1*(1-p1) + p2*(1-p2))
    per_group = ((null_term + alt_term) ** 2) / mde**2
    return int(np.ceil(per_group * 2))  # Total for both groups
def sample_size_means(delta, std, alpha=0.05, power=0.80):
    """Return the total sample size needed to compare two means.

    Args:
        delta: Smallest difference in means worth detecting.
        std: Standard deviation of the metric (assumed equal in both groups).
        alpha: Two-sided significance level.
        power: Desired statistical power.

    Returns:
        Total number of units across both groups, rounded up.
    """
    crit_sum = norm.ppf(1 - alpha/2) + norm.ppf(power)
    per_group = 2 * (crit_sum * std / delta) ** 2
    total = np.ceil(per_group * 2)
    return int(total)
# Work out the required sample sizes for a conversion-rate test and a
# continuous-metric test, then report totals and per-group counts.
n_prop = sample_size_proportions(p1=0.10, mde=0.02)
n_mean = sample_size_means(delta=5, std=20)

print(f"Sample size for proportions: {n_prop} total ({n_prop//2} per group)")
print(f"Sample size for means: {n_mean} total ({n_mean//2} per group)")
Bayesian A/B Testing
def bayesian_ab_test(control_conversions, control_total,
                     treatment_conversions, treatment_total, n_simulations=100000):
    """Compare two conversion rates with a Beta-Binomial model.

    Both arms start from a uniform Beta(1, 1) prior; the posteriors are
    compared by Monte Carlo sampling.

    Args:
        control_conversions: Conversions observed in control.
        control_total: Units exposed to control.
        treatment_conversions: Conversions observed in treatment.
        treatment_total: Units exposed to treatment.
        n_simulations: Number of posterior draws per arm.

    Returns:
        Dict with the probability that treatment beats control, the expected
        loss from choosing each arm, and the relative lift in percent.
    """
    prior_a, prior_b = 1, 1

    # Control is sampled first, then treatment, from their Beta posteriors.
    control_draws = np.random.beta(
        prior_a + control_conversions,
        prior_b + control_total - control_conversions,
        n_simulations,
    )
    treatment_draws = np.random.beta(
        prior_a + treatment_conversions,
        prior_b + treatment_total - treatment_conversions,
        n_simulations,
    )

    # Expected loss: average shortfall of an arm in the draws where it loses.
    shortfall_treatment = np.maximum(control_draws - treatment_draws, 0)
    shortfall_control = np.maximum(treatment_draws - control_draws, 0)

    mean_control = control_draws.mean()
    mean_treatment = treatment_draws.mean()

    summary = {}
    summary['prob_treatment_better'] = (treatment_draws > control_draws).mean()
    summary['expected_loss_treatment'] = shortfall_treatment.mean()
    summary['expected_loss_control'] = shortfall_control.mean()
    summary['lift'] = (mean_treatment - mean_control) / mean_control * 100
    return summary
# Run the Bayesian comparison on example counts: 100/1000 control
# conversions versus 120/1000 treatment conversions.
observed = {
    'control_conversions': 100,
    'control_total': 1000,
    'treatment_conversions': 120,
    'treatment_total': 1000,
}
result = bayesian_ab_test(**observed)

print("Bayesian A/B Test Results:")
print(f" P(treatment > control): {result['prob_treatment_better']:.3f}")
print(f" Expected loss (treatment): {result['expected_loss_treatment']:.4f}")
print(f" Lift: {result['lift']:.1f}%")
Best Practices
- Use sequential testing — stop early when results are clear
- Apply CUPED — variance reduction requires smaller samples
- Bayesian for decision-making — probability of being best is intuitive
- Multi-armed bandits for optimization — maximize value during testing
- Pre-register hypotheses — prevent p-hacking
- Check for novelty effects — long-term impact may differ from short-term
Summary
Advanced A/B testing methods — sequential testing, bandits, CUPED, and Bayesian analysis — make experimentation faster, more efficient, and more reliable. Apply these techniques to optimize products with less traffic and quicker decisions.