LLM A/B Testing: Statistical Testing and Model Comparison
A/B testing for LLM applications requires careful statistical design to account for non-deterministic outputs, subjective quality metrics, and cost-performance tradeoffs between different models.
A/B Testing Pipeline
Statistical Framework
1. Hypothesis Testing
from dataclasses import dataclass
from typing import List, Optional

import numpy as np
from scipy import stats
@dataclass
class ABTestResult:
    """Outcome of a single two-group significance test.

    Attributes:
        test_name: Identifier of the method that produced the result.
        control_mean: Mean (or success rate) of the control group.
        treatment_mean: Mean (or success rate) of the treatment group.
        lift: Relative change of treatment over control,
            ``(treatment - control) / |control|``.
        p_value: Two-sided p-value of the test.
        confidence_level: Confidence level the test was evaluated at.
        significant: Whether ``p_value`` is below ``1 - confidence_level``.
        sample_size_control: Number of control observations.
        sample_size_treatment: Number of treatment observations.
    """

    test_name: str
    control_mean: float
    treatment_mean: float
    lift: float
    p_value: float
    confidence_level: float
    significant: bool
    sample_size_control: int
    sample_size_treatment: int


class LLMAbTest:
    """Two-sided significance tests for comparing a control and a treatment.

    Every test returns an ``ABTestResult`` whose means, lift, p-value and
    ``significant`` flag are built-in ``float``/``bool`` values rather than
    NumPy scalars, so ``result.significant is True`` behaves as expected and
    the result can be serialised without NumPy-specific handling.
    """

    def __init__(self, confidence_level: float = 0.95):
        """Configure the significance threshold.

        Args:
            confidence_level: Desired confidence level; the significance
                threshold ``alpha`` is ``1 - confidence_level``.

        Raises:
            ValueError: If ``confidence_level`` is not strictly between
                0 and 1 (e.g. ``95`` passed instead of ``0.95``).
        """
        if not 0.0 < confidence_level < 1.0:
            raise ValueError(
                "confidence_level must be strictly between 0 and 1, "
                f"got {confidence_level!r}"
            )
        self.confidence_level = confidence_level
        self.alpha = 1 - confidence_level

    def _build_result(self, test_name: str, control_mean: float,
                      treatment_mean: float, p_value: float,
                      n_control: int, n_treatment: int) -> ABTestResult:
        """Assemble an ``ABTestResult`` from raw statistics as built-in types."""
        control_mean = float(control_mean)
        treatment_mean = float(treatment_mean)
        p_value = float(p_value)
        # Floor the denominator so a zero control mean cannot divide by zero.
        lift = (treatment_mean - control_mean) / max(abs(control_mean), 1e-10)
        return ABTestResult(
            test_name=test_name,
            control_mean=control_mean,
            treatment_mean=treatment_mean,
            lift=lift,
            p_value=p_value,
            confidence_level=self.confidence_level,
            # bool() guards against self.alpha having been set to a NumPy scalar.
            significant=bool(p_value < self.alpha),
            sample_size_control=n_control,
            sample_size_treatment=n_treatment,
        )

    def calculate_sample_size(self, effect_size: float, power: float = 0.8,
                              std_dev: float = 1.0) -> int:
        """Return the per-group sample size for a two-sided two-sample test.

        Uses ``n = 2 * (z_{alpha/2} + z_{power})^2 * std_dev^2 / effect_size^2``
        rounded up to the next integer.

        Args:
            effect_size: Minimum absolute difference in means to detect.
            power: Desired statistical power.
            std_dev: Standard deviation of the metric in each group.

        Raises:
            ValueError: If ``effect_size`` is zero (no finite sample size
                can detect a zero effect).
        """
        if effect_size == 0:
            raise ValueError("effect_size must be non-zero")
        z_alpha = stats.norm.ppf(1 - self.alpha / 2)
        z_beta = stats.norm.ppf(power)
        n = ((z_alpha + z_beta) ** 2 * 2 * std_dev ** 2) / (effect_size ** 2)
        return int(np.ceil(n))

    def two_sample_ttest(self, control: List[float],
                         treatment: List[float]) -> ABTestResult:
        """Compare two samples of a continuous metric with a t-test.

        Calls ``scipy.stats.ttest_ind`` with its default settings, i.e. the
        pooled-variance (equal variance) two-sided Student's t-test.

        Args:
            control: Metric values observed for the control variant.
            treatment: Metric values observed for the treatment variant.
        """
        control_arr = np.asarray(control, dtype=float)
        treatment_arr = np.asarray(treatment, dtype=float)
        _, p_value = stats.ttest_ind(control_arr, treatment_arr)
        return self._build_result(
            "two_sample_ttest",
            np.mean(control_arr),
            np.mean(treatment_arr),
            p_value,
            len(control),
            len(treatment),
        )

    def proportion_test(self, control_successes: int, control_total: int,
                        treatment_successes: int,
                        treatment_total: int) -> ABTestResult:
        """Compare two success rates with a pooled two-proportion z-test.

        Args:
            control_successes: Number of successes in the control group.
            control_total: Number of trials in the control group.
            treatment_successes: Number of successes in the treatment group.
            treatment_total: Number of trials in the treatment group.

        Raises:
            ZeroDivisionError: If either total is zero.
        """
        p_control = control_successes / control_total
        p_treatment = treatment_successes / treatment_total
        p_pooled = ((control_successes + treatment_successes)
                    / (control_total + treatment_total))
        se = np.sqrt(
            p_pooled * (1 - p_pooled) * (1 / control_total + 1 / treatment_total)
        )
        if se == 0:
            # Both arms are all-success or all-failure: the rates are equal,
            # so there is no evidence of a difference (avoids 0/0 -> NaN).
            p_value = 1.0
        else:
            z_stat = (p_treatment - p_control) / se
            # sf() keeps precision in the far tail, where 1 - cdf() rounds to 0.
            p_value = 2 * stats.norm.sf(abs(z_stat))
        return self._build_result(
            "proportion_test",
            p_control,
            p_treatment,
            p_value,
            control_total,
            treatment_total,
        )

    def bootstrap_test(self, control: List[float], treatment: List[float],
                       n_bootstrap: int = 10000,
                       seed: Optional[int] = None) -> ABTestResult:
        """Compare two samples with a Monte Carlo resampling test.

        The pooled observations are repeatedly shuffled and re-split into
        groups of the original sizes (a permutation test); the p-value is the
        share of resampled mean differences at least as extreme as the
        observed one.

        Args:
            control: Metric values observed for the control variant.
            treatment: Metric values observed for the treatment variant.
            n_bootstrap: Number of random re-splits to draw.
            seed: Optional seed for a private random generator, making the
                result reproducible. When ``None`` the global ``np.random``
                state is used, as before.
        """
        control_arr = np.asarray(control, dtype=float)
        treatment_arr = np.asarray(treatment, dtype=float)
        control_mean = np.mean(control_arr)
        treatment_mean = np.mean(treatment_arr)
        threshold = abs(treatment_mean - control_mean)

        pooled = np.concatenate([control_arr, treatment_arr])
        n_control = len(control_arr)
        rng = np.random if seed is None else np.random.default_rng(seed)

        extreme = 0
        for _ in range(n_bootstrap):
            rng.shuffle(pooled)
            diff = pooled[n_control:].mean() - pooled[:n_control].mean()
            if abs(diff) >= threshold:
                extreme += 1

        # Count the observed split as one of the resamples so a finite number
        # of draws can never report an impossible p-value of exactly zero.
        p_value = (extreme + 1) / (n_bootstrap + 1)
        return self._build_result(
            "bootstrap_test",
            control_mean,
            treatment_mean,
            p_value,
            len(control),
            len(treatment),
        )
2. Multi-Armed Bandit
import numpy as np
from dataclasses import dataclass
from typing import List
@dataclass
class Arm:
    """One experiment variant together with its running reward totals."""

    name: str
    model: str
    prompt_version: str
    pulls: int = 0        # number of rewards recorded for this arm
    rewards: float = 0.0  # sum of all rewards recorded for this arm

    @property
    def average_reward(self) -> float:
        """Return total reward divided by pulls (by 1 when there are no pulls)."""
        return self.rewards / max(self.pulls, 1)


class ThompsonSampling:
    """Thompson-sampling selector over a list of ``Arm`` objects.

    Each arm's success probability is modelled as
    ``Beta(rewards + 1, pulls - rewards + 1)``. Both parameters must be
    positive, which holds as long as every recorded reward lies in ``[0, 1]``;
    ``update`` enforces that range.
    """

    def __init__(self, arms: List[Arm], exploration_rate: float = 1.0):
        """Store the arms to choose between.

        Args:
            arms: Arms to select from; they are mutated in place by ``update``.
            exploration_rate: Stored on the instance; ``select_arm`` does not
                currently read it.
        """
        self.arms = arms
        self.exploration_rate = exploration_rate

    def select_arm(self) -> Arm:
        """Draw one sample per arm from its Beta posterior and return the best.

        Ties go to the arm that appears first in ``self.arms``.

        Raises:
            IndexError: If there are no arms to select from.
        """
        if not self.arms:
            raise IndexError("cannot select from an empty list of arms")
        draws = [
            np.random.beta(arm.rewards + 1, arm.pulls - arm.rewards + 1)
            for arm in self.arms
        ]
        # argmax returns the first maximum, matching a stable descending sort.
        return self.arms[int(np.argmax(draws))]

    def update(self, arm_name: str, reward: float):
        """Record one pull and its reward for the first arm named ``arm_name``.

        An unknown ``arm_name`` is ignored.

        Args:
            arm_name: Name of the arm that was played.
            reward: Observed reward, between 0 and 1 inclusive.

        Raises:
            ValueError: If ``reward`` is outside ``[0, 1]`` (or NaN). Such a
                value would eventually make a Beta parameter non-positive and
                crash ``select_arm`` with an unrelated-looking error.
        """
        if not 0.0 <= reward <= 1.0:
            raise ValueError(f"reward must be within [0, 1], got {reward!r}")
        for arm in self.arms:
            if arm.name == arm_name:
                arm.pulls += 1
                arm.rewards += reward
                break

    def get_stats(self) -> List[dict]:
        """Return one ``{"name", "pulls", "avg_reward"}`` dict per arm."""
        return [
            {"name": arm.name, "pulls": arm.pulls, "avg_reward": arm.average_reward}
            for arm in self.arms
        ]
Key Formulas
Sample Size for Two-Proportion Test
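$$n = \frac{2\,(z_{\alpha/2} + z_{\beta})^{2}\,\bar{p}\,(1 - \bar{p})}{\delta^{2}}$$
Here, $n$ is the required sample size per group, and
- $z_{\alpha/2}$ = Critical value for significance level $\alpha$ (two-sided)
- $z_{\beta}$ = Critical value for statistical power $1 - \beta$
- $\bar{p}$ = Pooled proportion
- $\delta$ = Minimum detectable effect size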
Lift Calculation
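$$\text{Lift} = \frac{\bar{x}_{T} - \bar{x}_{C}}{\lvert \bar{x}_{C} \rvert}$$
Here,
- $\bar{x}_{T}$ = Mean of treatment group
- $\bar{x}_{C}$ = Mean of control group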
Comparison Table
| Method | Assumption | Best For | Limitations |
|---|---|---|---|
| T-Test | Normal distribution | Continuous metrics | Parametric |
| Proportion Test | Binary outcomes | Click rates, conversions | Binary only |
| Bootstrap | None | Non-normal data | Computationally expensive |
| Thompson Sampling | Stationary rewards | Online optimization | Requires reward signal |
Best Practices
- Pre-calculate sample size before starting the experiment
- Run for sufficient duration to capture weekly patterns
- Use sequential testing for early stopping when effects are clear
- Track cost alongside quality since model performance has cost implications
- Avoid peeking at results repeatedly without correction