LLM Testing: Unit Tests, Integration Tests, and Regression Testing

Testing LLM applications requires fundamentally different approaches from traditional software testing due to non-deterministic outputs, subjective quality metrics, and evolving model behavior.

LLM Testing Pipeline

Testing Taxonomy

1. Prompt Unit Tests

These tests validate prompt templates and formatting without executing the full LLM call chain.

import pytest
from typing import Dict, List, Any

class PromptUnitTest:
    """Unit tests for LLM prompt construction and validation.

    The template is rendered once with the configured variables and every
    registered test case is checked against that rendered text. No model is
    called at any point.

    Placeholders may be written as ``{{name}}`` or ``{name}``. Placeholders
    that have no matching variable are left in the text unchanged.
    """

    def __init__(self, prompt_template: str):
        """Store the raw template; variables and test cases start empty."""
        self.template = prompt_template
        self.variables: Dict[str, Any] = {}
        self.test_cases: List[Dict[str, Any]] = []

    def set_variables(self, **kwargs):
        """Replace the substitution variables and return ``self`` for chaining."""
        self.variables = kwargs
        return self

    def add_test_case(self, name: str, expected_contains: str = None,
                      expected_not_contains: str = None,
                      max_length: int = None):
        """Register one check against the rendered prompt.

        Args:
            name: Label copied into the result entry.
            expected_contains: Substring that must appear (skipped if empty/None).
            expected_not_contains: Substring that must not appear
                (skipped if empty/None).
            max_length: Maximum allowed character count (skipped if None).

        Returns:
            ``self``, so calls can be chained.
        """
        self.test_cases.append({
            "name": name,
            "expected_contains": expected_contains,
            "expected_not_contains": expected_not_contains,
            "max_length": max_length
        })
        return self

    def render(self) -> str:
        """Return the template with every known placeholder substituted."""
        rendered = self.template
        for key, value in self.variables.items():
            text = str(value)
            # Double-brace form first: otherwise the single-brace pass would
            # turn "{{key}}" into "{value}" and leave stray braces behind.
            rendered = rendered.replace("{{" + key + "}}", text)
            rendered = rendered.replace("{" + key + "}", text)
        return rendered

    def run(self) -> List[Dict[str, Any]]:
        """Render the prompt and evaluate all registered test cases.

        Returns:
            One dict per test case with keys ``name``, ``passed`` and
            ``errors`` (a list of human-readable failure messages).
        """
        rendered = self.render()
        results = []
        for test in self.test_cases:
            errors = []
            expected = test["expected_contains"]
            if expected and expected not in rendered:
                errors.append(f"Missing: {expected}")
            unwanted = test["expected_not_contains"]
            if unwanted and unwanted in rendered:
                errors.append(f"Unexpected: {unwanted}")
            limit = test["max_length"]
            # Compare against None so an explicit limit of 0 is still enforced.
            if limit is not None and len(rendered) > limit:
                errors.append(f"Too long: {len(rendered)} > {limit}")
            results.append({"name": test["name"], "passed": not errors, "errors": errors})
        return results


def test_prompt_rendering():
    """Render a two-variable template and check its content and length."""
    # PromptUnitTest.render() substitutes ``{{name}}`` placeholders, so the
    # template has to use the double-brace form for the values to appear.
    template = "You are a {{role}}. Answer {{question}} concisely."
    test = PromptUnitTest(template)
    test.set_variables(role="customer support agent", question="What is your return policy?")
    test.add_test_case("contains role", expected_contains="customer support agent")
    test.add_test_case("contains question", expected_contains="return policy")
    test.add_test_case("reasonable length", max_length=200)
    results = test.run()
    # Surface the failing cases in the assertion message instead of a bare False.
    failures = [r for r in results if not r["passed"]]
    assert not failures, failures

2. Output Quality Tests

import re
from typing import Optional
from dataclasses import dataclass

@dataclass
class QualityCheck:
    """Outcome of a single quality check run against an LLM response."""

    # Identifier of the check that produced this result (e.g. "relevance").
    name: str
    # Whether the check succeeded.
    passed: bool
    # Numeric score reported by the check.
    score: float
    # Human-readable explanation of the outcome.
    details: str

class LLMOutputTester:
    """Heuristic, model-free quality checks for a single LLM response.

    Every ``check_*`` method returns a ``QualityCheck``. ``run_all`` runs the
    checks selected by its config and stores the results in ``self.checks``.
    """

    def __init__(self):
        # Results of the most recent run_all() call.
        self.checks: list = []

    @staticmethod
    def _words(text: str) -> set:
        """Return the set of lower-cased word tokens in *text*.

        Tokens are runs of word characters, so trailing punctuation does not
        stop "policy?" from matching "policy".
        """
        return set(re.findall(r"\w+", text.lower()))

    def check_relevance(self, query: str, response: str, threshold: float = 0.5) -> QualityCheck:
        """Score the fraction of distinct query words that occur in the response.

        Args:
            query: The user query.
            response: The model output.
            threshold: Minimum overlap ratio (0..1) required to pass.
        """
        query_words = self._words(query)
        response_words = self._words(response)
        # max(..., 1) avoids dividing by zero for a query with no words.
        overlap = len(query_words & response_words) / max(len(query_words), 1)
        return QualityCheck(
            name="relevance",
            passed=overlap >= threshold,
            score=overlap,
            details=f"Keyword overlap: {overlap:.2%}"
        )

    def check_format(self, response: str, pattern: str) -> QualityCheck:
        """Pass when the regular expression *pattern* matches anywhere in *response*."""
        match = bool(re.search(pattern, response))
        return QualityCheck(
            name="format",
            passed=match,
            score=1.0 if match else 0.0,
            details=f"Pattern match: {match}"
        )

    def check_length(self, response: str, min_len: int = 10, max_len: int = 2000) -> QualityCheck:
        """Pass when the character count lies in ``[min_len, max_len]``."""
        length = len(response)
        passed = min_len <= length <= max_len
        return QualityCheck(
            name="length",
            passed=passed,
            score=1.0 if passed else 0.0,
            details=f"Length: {length} (range: {min_len}-{max_len})"
        )

    def check_no_hallucination(self, response: str, forbidden: list[str]) -> QualityCheck:
        """Pass when none of the *forbidden* terms occur in *response*.

        Matching is a case-insensitive substring test. The score is the
        fraction of forbidden terms that were not found.
        """
        lowered = response.lower()  # lower-case once, not once per term
        found = [term for term in forbidden if term.lower() in lowered]
        return QualityCheck(
            name="no_hallucination",
            passed=not found,
            score=1.0 - (len(found) / max(len(forbidden), 1)),
            details=f"Forbidden terms found: {found}" if found else "No hallucinations"
        )

    def run_all(self, query: str, response: str, config: Optional[Dict] = None) -> list:
        """Run the configured checks and return their results in order.

        Relevance and length always run. Recognised config keys:
        ``relevance_threshold``, ``min_length``, ``max_length``,
        ``forbidden`` (enables the forbidden-term check) and ``pattern``
        (enables the format check).

        Returns:
            List of ``QualityCheck`` results; also stored in ``self.checks``.
        """
        config = config or {}
        checks = [
            self.check_relevance(query, response, config.get("relevance_threshold", 0.5)),
            self.check_length(response, config.get("min_length", 10), config.get("max_length", 2000)),
        ]
        if "forbidden" in config:
            checks.append(self.check_no_hallucination(response, config["forbidden"]))
        # Appended last so result positions for configs without "pattern"
        # are the same as before the format check was wired in.
        if "pattern" in config:
            checks.append(self.check_format(response, config["pattern"]))
        self.checks = checks
        return checks

3. Regression Testing

import json
import hashlib
from pathlib import Path
from typing import Dict, List, Any
from datetime import datetime

class LLMRegressionTest:
    """Snapshot-style regression checks for LLM responses.

    The first run of a test stores the response as a JSON baseline under
    ``baseline_path``; later runs compare the current response with that
    baseline using word-set overlap.
    """

    def __init__(self, baseline_path: str = "./baselines"):
        """Remember the baseline directory and create it if it is missing."""
        self.baseline_path = Path(baseline_path)
        self.baseline_path.mkdir(parents=True, exist_ok=True)

    def _baseline_file(self, test_name: str) -> Path:
        """Return the JSON file that holds the baseline for *test_name*."""
        return self.baseline_path / f"{test_name}.json"

    def load_baseline(self, test_name: str) -> Optional[Dict]:
        """Return the stored baseline for *test_name*, or ``None`` if none exists."""
        try:
            raw = self._baseline_file(test_name).read_text(encoding="utf-8")
        except FileNotFoundError:
            return None
        return json.loads(raw)

    def save_baseline(self, test_name: str, result: Dict):
        """Write *result* as the baseline for *test_name*.

        A ``timestamp`` (timezone-aware UTC, ISO 8601) and a ``hash`` (first
        16 hex digits of the SHA-256 of the JSON-encoded response) are added
        to a copy of *result*; the caller's dict is not modified.

        Args:
            test_name: Baseline identifier, used as the file stem.
            result: Mapping that must contain a ``"response"`` key.

        Returns:
            The record that was written to disk.
        """
        # Local import: only ``datetime`` itself is imported at file level.
        from datetime import timezone

        record = dict(result)
        record["timestamp"] = datetime.now(timezone.utc).isoformat()
        record["hash"] = hashlib.sha256(json.dumps(result["response"]).encode()).hexdigest()[:16]
        self._baseline_file(test_name).write_text(json.dumps(record, indent=2), encoding="utf-8")
        return record

    def compare_responses(self, baseline: str, current: str, threshold: float = 0.8) -> Dict:
        """Compare two responses by Jaccard similarity of their word sets.

        Words are lower-cased, whitespace-separated tokens. Two responses
        that both contain no words are treated as identical (similarity 1.0).

        Returns:
            Dict with ``similarity``, ``passed`` (similarity >= threshold),
            ``baseline_length``, ``current_length`` and ``length_change``
            (lengths are character counts).
        """
        baseline_words = set(baseline.lower().split())
        current_words = set(current.lower().split())
        union = len(baseline_words | current_words)
        if union:
            similarity = len(baseline_words & current_words) / union
        else:
            # Nothing to compare on either side: not a regression.
            similarity = 1.0
        return {
            "similarity": similarity,
            "passed": similarity >= threshold,
            "baseline_length": len(baseline),
            "current_length": len(current),
            "length_change": len(current) - len(baseline)
        }

    def run_regression(self, test_name: str, prompt: str, current_response: str,
                       threshold: float = 0.8) -> Dict:
        """Compare *current_response* with the stored baseline for *test_name*.

        If no baseline exists yet, the current response is saved as the
        baseline and no comparison takes place.

        Args:
            test_name: Baseline identifier.
            prompt: Prompt that produced the response (stored with a new baseline).
            current_response: Response to check.
            threshold: Minimum similarity required to pass.

        Returns:
            ``{"status": "baseline_created", "test_name": ...}`` on first run;
            otherwise the comparison dict plus ``test_name`` and a ``status``
            of ``"passed"`` or ``"regression_detected"``.
        """
        baseline = self.load_baseline(test_name)
        if baseline is None:
            self.save_baseline(test_name, {"prompt": prompt, "response": current_response})
            return {"status": "baseline_created", "test_name": test_name}
        comparison = self.compare_responses(baseline["response"], current_response, threshold)
        return {
            "test_name": test_name,
            "status": "passed" if comparison["passed"] else "regression_detected",
            **comparison
        }

Key Formulas

Semantic Similarity

$$\text{sim}(A, B) = \frac{\vec{A} \cdot \vec{B}}{\lVert\vec{A}\rVert \, \lVert\vec{B}\rVert}$$

Here,

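$\vec{A}$ = Embedding vector of the baseline response
$\vec{B}$ = Embedding vector of the current response
Note: the `compare_responses` example above approximates this with word-set (Jaccard) overlap rather than embedding vectors.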

Test Coverage

$$\text{Coverage} = \frac{|\text{Tested Scenarios}|}{|\text{All Scenarios}|} \times 100\%$$

Here,

$\text{Tested Scenarios}$ = Scenarios with passing tests
$\text{All Scenarios}$ = Total defined test scenarios

Testing Matrix

Test Type	What It Validates	Speed	Cost	When to Run
Prompt Unit	Template rendering	Fast	Low	Every commit
Output Quality	Response format, length	Medium	Medium	Every PR
Regression	Response consistency	Slow	High	Pre-deploy
A/B Comparison	Model performance	Slow	High	Weekly
Stress Test	Rate limits, latency	Slow	High	Monthly
Adversarial	Safety, robustness	Slow	High	Quarterly

Best Practices

Test prompts independently before connecting to models
Use mock LLM responses for deterministic unit tests
Track regression baselines with versioned snapshots
Set quality thresholds based on domain requirements
Run adversarial tests to validate safety guardrails

LLM Testing: Unit Tests, Integration Tests, and Regression Testing

LLM Testing: Unit Tests, Integration Tests, and Regression Testing

LLM Testing Pipeline

Testing Taxonomy

1. Prompt Unit Tests

2. Output Quality Tests

3. Regression Testing

Key Formulas

Semantic Similarity

Test Coverage

Testing Matrix

Best Practices

Premium Content

Need Expert AI Ops & LLM Ops Help?