🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services · ℹ️ About · ✉️ Contact · View Pricing Plans from $10

LLM A/B Testing: Statistical Testing and Model Comparison

LLMOps in Production › LLM A/B Testing › 🟢 Free Lesson

Advertisement

LLM A/B Testing: Statistical Testing and Model Comparison

A/B testing for LLM applications requires careful statistical design to account for non-deterministic outputs, subjective quality metrics, and cost-performance tradeoffs between different models.

A/B Testing Pipeline

Statistical Framework

1. Hypothesis Testing

import numpy as np
from scipy import stats
from dataclasses import dataclass
from typing import List

@dataclass
class ABTestResult:
    """Outcome of a single two-group significance test.

    Attributes:
        test_name: Identifier of the method that produced the result.
        control_mean: Mean (or success proportion) of the control group.
        treatment_mean: Mean (or success proportion) of the treatment group.
        lift: Relative change of treatment over control, expressed as a
            fraction (0.05 means +5 %), not as a percentage.
        p_value: Two-sided p-value of the test.
        confidence_level: Confidence level the test was evaluated at.
        significant: True when ``p_value`` is below ``1 - confidence_level``.
        sample_size_control: Number of observations in the control group.
        sample_size_treatment: Number of observations in the treatment group.
    """

    test_name: str
    control_mean: float
    treatment_mean: float
    lift: float
    p_value: float
    confidence_level: float
    significant: bool
    sample_size_control: int
    sample_size_treatment: int


class LLMAbTest:
    """Two-group significance tests for comparing LLM variants.

    Every test returns an :class:`ABTestResult` whose numeric fields are
    plain Python ``float``/``bool`` values rather than NumPy scalars, so
    identity checks such as ``result.significant is True`` and JSON
    serialization behave as expected.
    """

    def __init__(self, confidence_level: float = 0.95):
        """Store the confidence level and derive the significance threshold.

        Args:
            confidence_level: Desired confidence, e.g. 0.95 for a 5 % alpha.
        """
        self.confidence_level = confidence_level
        # alpha is the accepted false-positive rate used by every test below.
        self.alpha = 1 - confidence_level

    def _result(self, test_name: str, control_mean: float,
                treatment_mean: float, p_value: float,
                n_control: int, n_treatment: int) -> ABTestResult:
        """Assemble an ABTestResult, computing lift and significance."""
        # The 1e-10 floor avoids dividing by zero when the control mean is 0;
        # the lift is then very large rather than undefined.
        lift = (treatment_mean - control_mean) / max(abs(control_mean), 1e-10)
        return ABTestResult(
            test_name=test_name,
            control_mean=float(control_mean),
            treatment_mean=float(treatment_mean),
            lift=float(lift),
            p_value=float(p_value),
            confidence_level=self.confidence_level,
            # A NaN p-value compares False, so it is never reported significant.
            significant=bool(p_value < self.alpha),
            sample_size_control=n_control,
            sample_size_treatment=n_treatment,
        )

    def calculate_sample_size(self, effect_size: float, power: float = 0.8,
                              std_dev: float = 1.0) -> int:
        """Return the per-group sample size for a two-sided two-sample test.

        Uses ``n = (z_{alpha/2} + z_beta)^2 * 2 * std_dev^2 / effect_size^2``
        and rounds up.

        Args:
            effect_size: Minimum absolute difference in means to detect.
            power: Desired statistical power (1 - beta).
            std_dev: Assumed common standard deviation of the metric.

        Returns:
            Required number of observations in each group.

        Raises:
            ValueError: If ``effect_size`` is zero (the required sample size
                would be infinite).
        """
        if effect_size == 0:
            raise ValueError("effect_size must be non-zero")
        z_alpha = stats.norm.ppf(1 - self.alpha / 2)
        z_beta = stats.norm.ppf(power)
        n = ((z_alpha + z_beta) ** 2 * 2 * std_dev ** 2) / (effect_size ** 2)
        return int(np.ceil(n))

    def two_sample_ttest(self, control: List[float],
                         treatment: List[float],
                         equal_var: bool = True) -> ABTestResult:
        """Compare two groups of continuous scores with a two-sided t-test.

        Args:
            control: Metric values observed for the control variant.
            treatment: Metric values observed for the treatment variant.
            equal_var: Passed through to ``scipy.stats.ttest_ind``. True (the
                default) runs Student's pooled-variance test; False runs
                Welch's test, which does not assume equal variances.

        Returns:
            The test outcome with ``test_name="two_sample_ttest"``.
        """
        control_arr = np.asarray(control, dtype=float)
        treatment_arr = np.asarray(treatment, dtype=float)
        _, p_value = stats.ttest_ind(control_arr, treatment_arr,
                                     equal_var=equal_var)
        return self._result(
            "two_sample_ttest",
            np.mean(control_arr),
            np.mean(treatment_arr),
            p_value,
            len(control),
            len(treatment),
        )

    def proportion_test(self, control_successes: int, control_total: int,
                        treatment_successes: int,
                        treatment_total: int) -> ABTestResult:
        """Compare two success rates with a pooled two-proportion z-test.

        Args:
            control_successes: Number of successes in the control group.
            control_total: Number of trials in the control group.
            treatment_successes: Number of successes in the treatment group.
            treatment_total: Number of trials in the treatment group.

        Returns:
            The test outcome with ``test_name="proportion_test"``; the
            ``*_mean`` fields hold the observed success proportions.

        Raises:
            ValueError: If a total is not positive or a success count lies
                outside ``[0, total]``.
        """
        if control_total <= 0 or treatment_total <= 0:
            raise ValueError("control_total and treatment_total must be positive")
        if not (0 <= control_successes <= control_total
                and 0 <= treatment_successes <= treatment_total):
            raise ValueError("success counts must lie between 0 and their totals")

        p_control = control_successes / control_total
        p_treatment = treatment_successes / treatment_total
        p_pooled = ((control_successes + treatment_successes)
                    / (control_total + treatment_total))
        se = np.sqrt(p_pooled * (1 - p_pooled)
                     * (1 / control_total + 1 / treatment_total))
        if se > 0:
            z_stat = (p_treatment - p_control) / se
            # sf() is the upper tail computed directly; 1 - cdf() cancels to
            # exactly 0.0 for large |z| and would hide the true magnitude.
            p_value = 2 * stats.norm.sf(abs(z_stat))
        else:
            # Both groups are all-failure or all-success: the rates are
            # identical and there is no evidence of a difference.
            p_value = 1.0
        return self._result(
            "proportion_test",
            p_control,
            p_treatment,
            p_value,
            control_total,
            treatment_total,
        )

    def bootstrap_test(self, control: List[float], treatment: List[float],
                       n_bootstrap: int = 10000) -> ABTestResult:
        """Estimate a two-sided p-value by reshuffling the pooled samples.

        The pooled observations are shuffled ``n_bootstrap`` times with the
        global ``np.random`` state and re-split into groups of the original
        sizes (i.e. resampling without replacement, a permutation scheme).
        The p-value is the share of shuffles whose absolute mean difference
        is at least as large as the observed one.

        Args:
            control: Metric values observed for the control variant.
            treatment: Metric values observed for the treatment variant.
            n_bootstrap: Number of shuffles to draw.

        Returns:
            The test outcome with ``test_name="bootstrap_test"``.

        Raises:
            ValueError: If ``n_bootstrap`` is smaller than 1.
        """
        if n_bootstrap < 1:
            raise ValueError("n_bootstrap must be at least 1")
        control_arr = np.asarray(control, dtype=float)
        treatment_arr = np.asarray(treatment, dtype=float)
        control_mean = np.mean(control_arr)
        treatment_mean = np.mean(treatment_arr)
        observed_diff = treatment_mean - control_mean

        n_control = len(control_arr)
        # concatenate() copies, so shuffling never touches the caller's data.
        pooled = np.concatenate([control_arr, treatment_arr])
        resampled_diffs = np.empty(n_bootstrap)
        for i in range(n_bootstrap):
            np.random.shuffle(pooled)
            resampled_diffs[i] = (np.mean(pooled[n_control:])
                                  - np.mean(pooled[:n_control]))
        p_value = np.mean(np.abs(resampled_diffs) >= np.abs(observed_diff))
        return self._result(
            "bootstrap_test",
            control_mean,
            treatment_mean,
            p_value,
            len(control),
            len(treatment),
        )

2. Multi-Armed Bandit

import numpy as np
from dataclasses import dataclass
from typing import List

@dataclass
class Arm:
    """One candidate variant in the bandit, with its running totals.

    ``pulls`` counts how often the arm was played and ``rewards`` is the sum
    of all rewards credited to it.
    """

    name: str
    model: str
    prompt_version: str
    pulls: int = 0
    rewards: float = 0.0

    @property
    def average_reward(self) -> float:
        """Reward per pull; arms with fewer than one pull divide by 1."""
        divisor = 1 if self.pulls < 1 else self.pulls
        return self.rewards / divisor


class ThompsonSampling:
    """Thompson-sampling selector over a list of :class:`Arm` objects.

    Each arm is scored by one draw from
    ``Beta(rewards + 1, pulls - rewards + 1)``.

    NOTE: ``exploration_rate`` is stored on the instance but is not read by
    any method of this class.
    """

    def __init__(self, arms: List[Arm], exploration_rate: float = 1.0):
        self.exploration_rate = exploration_rate
        self.arms = arms

    def select_arm(self) -> Arm:
        """Draw one Beta sample per arm and return the arm with the top draw."""
        scored = [
            (np.random.beta(arm.rewards + 1, arm.pulls - arm.rewards + 1), arm)
            for arm in self.arms
        ]
        # Stable descending sort: on equal draws the earlier arm stays first.
        ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
        return ranked[0][1]

    def update(self, arm_name: str, reward: float):
        """Credit ``reward`` to the first arm named ``arm_name``.

        Names that match no arm are ignored.
        """
        target = next((arm for arm in self.arms if arm.name == arm_name), None)
        if target is None:
            return
        target.pulls += 1
        target.rewards += reward

    def get_stats(self) -> List[dict]:
        """Return one ``name``/``pulls``/``avg_reward`` dict per arm, in order."""
        summary = []
        for arm in self.arms:
            summary.append({
                "name": arm.name,
                "pulls": arm.pulls,
                "avg_reward": arm.average_reward,
            })
        return summary

Key Formulas

Sample Size for Two-Proportion Test

$$n = \frac{(z_{\alpha/2} + z_{\beta})^2 \cdot 2p(1-p)}{\delta^2}$$

Here,

  • $z_{\alpha/2}$ = Critical value for significance level
  • $z_{\beta}$ = Critical value for statistical power
  • $p$ = Pooled proportion
  • $\delta$ = Minimum detectable effect size

Lift Calculation

$$\text{Lift} = \frac{\bar{X}_{treatment} - \bar{X}_{control}}{|\bar{X}_{control}|} \times 100\%$$

Here,

  • $\bar{X}_{treatment}$ = Mean of treatment group
  • $\bar{X}_{control}$ = Mean of control group

Comparison Table

| Method | Assumption | Best For | Limitations |
| --- | --- | --- | --- |
| T-Test | Normal distribution | Continuous metrics | Parametric |
| Proportion Test | Binary outcomes | Click rates, conversions | Binary only |
| Bootstrap | None | Non-normal data | Computationally expensive |
| Thompson Sampling | Stationary rewards | Online optimization | Requires reward signal |

Best Practices

  1. Pre-calculate sample size before starting the experiment
  2. Run for sufficient duration to capture weekly patterns
  3. Use sequential testing for early stopping when effects are clear
  4. Track cost alongside quality since model performance has cost implications
  5. Avoid peeking at results repeatedly without correction
⭐

Premium Content

LLM A/B Testing: Statistical Testing and Model Comparison

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯 End-to-end Projects
💼 Interview Prep
📜 Certificates
🤝 Community Access

Already a member? Log in

Need Expert AI Ops & LLM Ops Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement