LLM A/B Testing: Statistical Testing and Model Comparison

A/B testing for LLM applications requires careful statistical design to account for non-deterministic outputs, subjective quality metrics, and cost-performance tradeoffs between different models.

A/B Testing Pipeline

Statistical Framework

1. Hypothesis Testing

import numpy as np
from scipy import stats
from dataclasses import dataclass
from typing import List

@dataclass
class ABTestResult:
    """Outcome of a single control-vs-treatment significance test.

    Instances are plain records; every field is supplied by the test that
    produced the result.
    """

    # Identifier of the statistical procedure, e.g. "two_sample_ttest".
    test_name: str

    # Point estimates for each arm (a mean, or a success proportion).
    control_mean: float
    treatment_mean: float

    # Relative change of treatment over control:
    # (treatment_mean - control_mean) / |control_mean|, as a fraction.
    lift: float

    # p-value reported by the test, and the confidence level it was
    # evaluated against.
    p_value: float
    confidence_level: float

    # True when p_value is below 1 - confidence_level.
    significant: bool

    # Number of observations in each arm.
    sample_size_control: int
    sample_size_treatment: int

class LLMAbTest:
    """Two-arm significance tests for comparing LLM variants.

    All tests are two-sided and declare a result significant when the
    p-value is below ``alpha = 1 - confidence_level``.

    Args:
        confidence_level: Confidence level in the open interval (0, 1).

    Raises:
        ValueError: If ``confidence_level`` is not strictly between 0 and 1.
    """

    def __init__(self, confidence_level: float = 0.95):
        if not 0.0 < confidence_level < 1.0:
            raise ValueError(
                f"confidence_level must be in (0, 1), got {confidence_level!r}"
            )
        self.confidence_level = confidence_level
        self.alpha = 1 - confidence_level

    @staticmethod
    def _as_sample(values: List[float], name: str, min_size: int) -> np.ndarray:
        """Convert *values* to a float array, requiring at least *min_size* items."""
        arr = np.asarray(values, dtype=float)
        if arr.size < min_size:
            raise ValueError(
                f"{name} needs at least {min_size} observation(s), got {arr.size}"
            )
        return arr

    def _build_result(self, test_name: str, control_mean: float,
                      treatment_mean: float, p_value: float,
                      n_control: int, n_treatment: int) -> "ABTestResult":
        """Assemble an ABTestResult, computing lift and the significance flag."""
        # Cast NumPy scalars so the record holds the plain float/bool types
        # its fields declare.
        control_mean = float(control_mean)
        treatment_mean = float(treatment_mean)
        p_value = float(p_value)
        # The floor on the denominator avoids dividing by a zero control mean.
        lift = (treatment_mean - control_mean) / max(abs(control_mean), 1e-10)
        return ABTestResult(
            test_name=test_name,
            control_mean=control_mean,
            treatment_mean=treatment_mean,
            lift=lift,
            p_value=p_value,
            confidence_level=self.confidence_level,
            significant=p_value < self.alpha,
            sample_size_control=int(n_control),
            sample_size_treatment=int(n_treatment),
        )

    def calculate_sample_size(self, effect_size: float, power: float = 0.8,
                              std_dev: float = 1.0) -> int:
        """Return the per-group sample size for a two-sided two-sample test.

        Uses ``n = 2 * std_dev**2 * (z_alpha + z_beta)**2 / effect_size**2``
        rounded up, where ``z_alpha`` is the normal quantile at
        ``1 - alpha / 2`` and ``z_beta`` the quantile at ``power``.

        Args:
            effect_size: Minimum difference in means to detect (non-zero).
            power: Desired statistical power, strictly between 0 and 1.
            std_dev: Standard deviation assumed for both groups.

        Raises:
            ValueError: If ``effect_size`` is zero or ``power`` is not in (0, 1).
        """
        if effect_size == 0:
            raise ValueError("effect_size must be non-zero")
        if not 0.0 < power < 1.0:
            raise ValueError(f"power must be in (0, 1), got {power!r}")
        z_alpha = stats.norm.ppf(1 - self.alpha / 2)
        z_beta = stats.norm.ppf(power)
        n = ((z_alpha + z_beta) ** 2 * 2 * std_dev ** 2) / (effect_size ** 2)
        return int(np.ceil(n))

    def two_sample_ttest(self, control: List[float],
                         treatment: List[float]) -> "ABTestResult":
        """Compare two samples of a continuous metric with ``scipy.stats.ttest_ind``.

        The test is run with SciPy's default settings.

        Raises:
            ValueError: If either sample has fewer than two observations
                (the sample variance is undefined below that).
        """
        control_arr = self._as_sample(control, "control", 2)
        treatment_arr = self._as_sample(treatment, "treatment", 2)
        _, p_value = stats.ttest_ind(control_arr, treatment_arr)
        return self._build_result(
            "two_sample_ttest",
            control_arr.mean(),
            treatment_arr.mean(),
            p_value,
            control_arr.size,
            treatment_arr.size,
        )

    def proportion_test(self, control_successes: int, control_total: int,
                        treatment_successes: int,
                        treatment_total: int) -> "ABTestResult":
        """Run a pooled two-proportion z-test on success counts.

        Raises:
            ValueError: If a total is not positive or a success count is
                outside ``[0, total]``.
        """
        arms = (
            ("control", control_successes, control_total),
            ("treatment", treatment_successes, treatment_total),
        )
        for label, successes, total in arms:
            if total <= 0:
                raise ValueError(f"{label}_total must be positive, got {total!r}")
            if not 0 <= successes <= total:
                raise ValueError(
                    f"{label}_successes must be between 0 and {total}, "
                    f"got {successes!r}"
                )

        p_control = control_successes / control_total
        p_treatment = treatment_successes / treatment_total
        p_pooled = ((control_successes + treatment_successes)
                    / (control_total + treatment_total))
        variance = p_pooled * (1 - p_pooled) * (1 / control_total
                                                + 1 / treatment_total)
        if variance <= 0:
            # Pooled rate is exactly 0 or 1, so both arms are identical and
            # the z statistic would be 0/0; there is no evidence of a
            # difference.
            p_value = 1.0
        else:
            z_stat = (p_treatment - p_control) / np.sqrt(variance)
            # sf() keeps precision in the far tail, where 1 - cdf() rounds
            # to exactly 0.
            p_value = 2 * stats.norm.sf(abs(z_stat))
        return self._build_result(
            "proportion_test",
            p_control,
            p_treatment,
            p_value,
            control_total,
            treatment_total,
        )

    def bootstrap_test(self, control: List[float], treatment: List[float],
                       n_bootstrap: int = 10000) -> "ABTestResult":
        """Estimate a two-sided p-value for the difference in means by resampling.

        Despite the name, the procedure is a permutation test: the pooled
        observations are shuffled ``n_bootstrap`` times and re-split into
        groups of the original sizes. Shuffling uses NumPy's global random
        state, so ``np.random.seed`` controls reproducibility.

        The p-value is ``(extreme + 1) / (n_bootstrap + 1)``, where
        ``extreme`` counts shuffles whose absolute difference is at least
        the observed one; the added one keeps the estimate from being
        exactly zero.

        Raises:
            ValueError: If either sample is empty or ``n_bootstrap < 1``.
        """
        control_arr = self._as_sample(control, "control", 1)
        treatment_arr = self._as_sample(treatment, "treatment", 1)
        if n_bootstrap < 1:
            raise ValueError(f"n_bootstrap must be at least 1, got {n_bootstrap!r}")

        n_control = control_arr.size
        control_mean = control_arr.mean()
        treatment_mean = treatment_arr.mean()
        observed_diff = treatment_mean - control_mean

        pooled = np.concatenate([control_arr, treatment_arr])
        shuffled_diffs = np.empty(n_bootstrap)
        for i in range(n_bootstrap):
            np.random.shuffle(pooled)
            shuffled_diffs[i] = pooled[n_control:].mean() - pooled[:n_control].mean()

        extreme = np.count_nonzero(np.abs(shuffled_diffs) >= abs(observed_diff))
        p_value = (extreme + 1) / (n_bootstrap + 1)
        return self._build_result(
            "bootstrap_test",
            control_mean,
            treatment_mean,
            p_value,
            control_arr.size,
            treatment_arr.size,
        )

2. Multi-Armed Bandit

import numpy as np
from dataclasses import dataclass
from typing import List

@dataclass
class Arm:
    """One candidate (model plus prompt version) and its running totals."""

    # Identifier used to look the arm up when recording a reward.
    name: str
    # Model and prompt variant this arm represents.
    model: str
    prompt_version: str
    # Number of times the arm has been played and the reward accumulated.
    pulls: int = 0
    rewards: float = 0.0

    @property
    def average_reward(self) -> float:
        """Return accumulated reward per pull.

        The divisor is floored at 1, so an arm that has never been pulled
        reports its raw ``rewards`` value (0.0 by default).
        """
        divisor = max(self.pulls, 1)
        return self.rewards / divisor

class ThompsonSampling:
    """Thompson sampling over a list of arms with Beta posteriors.

    Each arm is scored by a draw from
    ``Beta(rewards + 1, pulls - rewards + 1)``. Both parameters must stay
    positive, so every recorded reward has to lie in ``[0, 1]``.

    Args:
        arms: Arms to choose between. The list is used as-is and the arms
            are mutated in place by ``update``.
        exploration_rate: Stored on the instance; ``select_arm`` does not
            currently consult it.
    """

    def __init__(self, arms: List["Arm"], exploration_rate: float = 1.0):
        self.arms = arms
        self.exploration_rate = exploration_rate

    def select_arm(self) -> "Arm":
        """Draw one posterior sample per arm and return the arm with the highest draw.

        Draws come from NumPy's global random state, one per arm in list
        order.

        Raises:
            ValueError: If there are no arms to choose from.
        """
        if not self.arms:
            raise ValueError("cannot select an arm: no arms are configured")
        draws = [
            np.random.beta(arm.rewards + 1, arm.pulls - arm.rewards + 1)
            for arm in self.arms
        ]
        # argmax returns the first maximum, so ties resolve to the earliest arm.
        return self.arms[int(np.argmax(draws))]

    def update(self, arm_name: str, reward: float) -> None:
        """Record one pull of the arm named *arm_name* with the given reward.

        Only the first arm whose name matches is updated.

        Raises:
            ValueError: If ``reward`` is outside ``[0, 1]`` (which would
                eventually make a Beta parameter non-positive), or if no arm
                is named ``arm_name``. Nothing is modified in either case.
        """
        if not 0.0 <= reward <= 1.0:
            raise ValueError(f"reward must be in [0, 1], got {reward!r}")
        for arm in self.arms:
            if arm.name == arm_name:
                arm.pulls += 1
                arm.rewards += reward
                return
        raise ValueError(f"unknown arm: {arm_name!r}")

    def get_stats(self) -> List[dict]:
        """Return one ``{"name", "pulls", "avg_reward"}`` dict per arm, in list order."""
        return [{"name": a.name, "pulls": a.pulls, "avg_reward": a.average_reward}
                for a in self.arms]

Key Formulas

Sample Size for Two-Proportion Test

$$n = \frac{(z_{\alpha/2} + z_{\beta})^2 \cdot 2p(1-p)}{\delta^2}$$

Here,

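$n$ = Required sample size per group
$z_{\alpha/2}$ = Critical value for the two-sided significance level $\alpha$
$z_{\beta}$ = Critical value for the desired statistical power $1 - \beta$
$p$ = Pooled proportion
$\delta$ = Minimum detectable effect size (absolute difference in proportions)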

Lift Calculation

$$\text{Lift} = \frac{\bar{X}_{treatment} - \bar{X}_{control}}{|\bar{X}_{control}|} \times 100\%$$

Here,

$\bar{X}_{treatment}$ = Mean of treatment group
$\bar{X}_{control}$ = Mean of control group

Comparison Table

Method	Assumption	Best For	Limitations
T-Test	Normal distribution	Continuous metrics	Parametric
Proportion Test	Binary outcomes	Click rates, conversions	Binary only
Bootstrap	No distributional form (samples must be exchangeable)	Non-normal data	Computationally expensive
Thompson Sampling	Stationary rewards	Online optimization	Requires reward signal

Best Practices

Pre-calculate sample size before starting the experiment
Run for sufficient duration to capture weekly patterns
Use sequential testing for early stopping when effects are clear
Track cost alongside quality, since a higher-quality model is often more expensive per request
Avoid peeking at results repeatedly without correction

LLM A/B Testing: Statistical Testing and Model Comparison

LLM A/B Testing: Statistical Testing and Model Comparison

A/B Testing Pipeline

Statistical Framework

1. Hypothesis Testing

2. Multi-Armed Bandit

Key Formulas

Sample Size for Two-Proportion Test

Lift Calculation

Comparison Table

Best Practices

Premium Content

Need Expert AI Ops & LLM Ops Help?