🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Servicesℹ️ About✉️ ContactView Pricing Plansfrom $10

LLM Testing: Unit Tests, Integration Tests, and Regression Testing

LLMOps in ProductionLLM Testing Strategies🟢 Free Lesson

Advertisement

LLM Testing: Unit Tests, Integration Tests, and Regression Testing

Testing LLM applications requires fundamentally different approaches from traditional software testing due to non-deterministic outputs, subjective quality metrics, and evolving model behavior.

LLM Testing Pipeline

Testing Taxonomy

1. Prompt Unit Tests

These tests validate prompt templates and formatting without executing the full LLM call chain.

import pytest
from typing import Dict, List, Any

class PromptUnitTest:
    """Fluent helper for checking a rendered prompt without calling an LLM.

    A template is rendered by plain string substitution and then checked
    against the registered test cases. Placeholders may be written either
    as ``{{name}}`` or as ``{name}``; both forms are replaced by the value
    passed to :meth:`set_variables`. Placeholders with no matching variable
    are left in the rendered text untouched.
    """

    def __init__(self, prompt_template: str):
        self.template = prompt_template
        self.variables: Dict[str, Any] = {}
        self.test_cases: List[Dict[str, Any]] = []

    def set_variables(self, **kwargs):
        """Replace the current variable set with ``kwargs``; returns ``self``."""
        self.variables = kwargs
        return self

    def add_test_case(self, name: str, expected_contains: str = None,
                      expected_not_contains: str = None,
                      max_length: int = None):
        """Register one named check against the rendered prompt.

        Args:
            name: Label reported in the results.
            expected_contains: Substring that must appear (skipped if empty/None).
            expected_not_contains: Substring that must not appear
                (skipped if empty/None).
            max_length: Maximum allowed length in characters (skipped if None).

        Returns:
            ``self``, so calls can be chained.
        """
        self.test_cases.append({
            "name": name,
            "expected_contains": expected_contains,
            "expected_not_contains": expected_not_contains,
            "max_length": max_length,
        })
        return self

    def render(self) -> str:
        """Return the template with every known variable substituted."""
        rendered = self.template
        for key, value in self.variables.items():
            text = str(value)
            # Replace the double-brace form first; otherwise "{{key}}" would
            # be turned into "{value}" by the single-brace replacement.
            rendered = rendered.replace("{{" + key + "}}", text)
            rendered = rendered.replace("{" + key + "}", text)
        return rendered

    def run(self) -> List[Dict[str, Any]]:
        """Render once and evaluate every registered test case.

        Returns:
            One dict per test case with keys ``name``, ``passed`` and
            ``errors`` (a list of human-readable failure messages).
        """
        rendered = self.render()
        results = []
        for test in self.test_cases:
            errors = []
            required = test["expected_contains"]
            if required and required not in rendered:
                errors.append(f"Missing: {required}")
            banned = test["expected_not_contains"]
            if banned and banned in rendered:
                errors.append(f"Unexpected: {banned}")
            limit = test["max_length"]
            # Compare against None so that an explicit limit of 0 is enforced.
            if limit is not None and len(rendered) > limit:
                errors.append(f"Too long: {len(rendered)} > {limit}")
            results.append({"name": test["name"], "passed": not errors, "errors": errors})
        return results


def test_prompt_rendering():
    """Check that the support-agent prompt renders with both variables filled in.

    The template uses ``{{name}}`` placeholders, which is the form
    ``PromptUnitTest.render`` substitutes.
    """
    template = "You are a {{role}}. Answer {{question}} concisely."
    prompt_test = PromptUnitTest(template)
    prompt_test.set_variables(role="customer support agent", question="What is your return policy?")
    prompt_test.add_test_case("contains role", expected_contains="customer support agent")
    prompt_test.add_test_case("contains question", expected_contains="return policy")
    prompt_test.add_test_case("reasonable length", max_length=200)
    results = prompt_test.run()
    # Report the failing checks instead of a bare "assert False".
    failures = [r for r in results if not r["passed"]]
    assert not failures, f"Prompt checks failed: {failures}"

2. Output Quality Tests

import re
from typing import Optional
from dataclasses import dataclass

@dataclass
class QualityCheck:
    """Outcome of a single quality check on an LLM response."""

    name: str      # identifier of the check, e.g. "relevance" or "length"
    passed: bool   # whether the check met its pass condition
    score: float   # numeric score assigned by the check
    details: str   # human-readable explanation of the outcome


class LLMOutputTester:
    """Cheap, model-free heuristics for sanity-checking an LLM response."""

    def __init__(self):
        # Results of the most recent run_all() call.
        self.checks: list = []

    @staticmethod
    def _tokens(text: str) -> set:
        """Return the set of lower-cased word tokens in ``text``.

        Tokenising on ``\\w+`` rather than whitespace keeps trailing
        punctuation from hiding a match ("policy?" vs "policy.").
        """
        return set(re.findall(r"\w+", text.lower()))

    def check_relevance(self, query: str, response: str, threshold: float = 0.5) -> QualityCheck:
        """Score the fraction of distinct query words that appear in the response.

        An empty query scores 0. Passes when the fraction is at least
        ``threshold``.
        """
        query_words = self._tokens(query)
        response_words = self._tokens(response)
        overlap = len(query_words & response_words) / max(len(query_words), 1)
        return QualityCheck(
            name="relevance",
            passed=overlap >= threshold,
            score=overlap,
            details=f"Keyword overlap: {overlap:.2%}"
        )

    def check_format(self, response: str, pattern: str) -> QualityCheck:
        """Pass when the regular expression ``pattern`` matches anywhere in the response."""
        match = bool(re.search(pattern, response))
        return QualityCheck(
            name="format",
            passed=match,
            score=1.0 if match else 0.0,
            details=f"Pattern match: {match}"
        )

    def check_length(self, response: str, min_len: int = 10, max_len: int = 2000) -> QualityCheck:
        """Pass when the response length in characters is within ``[min_len, max_len]``."""
        length = len(response)
        passed = min_len <= length <= max_len
        return QualityCheck(
            name="length",
            passed=passed,
            score=1.0 if passed else 0.0,
            details=f"Length: {length} (range: {min_len}-{max_len})"
        )

    def check_no_hallucination(self, response: str, forbidden: list[str]) -> QualityCheck:
        """Pass when none of the ``forbidden`` terms occur in the response.

        Matching is a case-insensitive substring test. The score is the
        fraction of forbidden terms that were *not* found.
        """
        lowered = response.lower()  # hoisted: lower-case the response once
        found = [term for term in forbidden if term.lower() in lowered]
        return QualityCheck(
            name="no_hallucination",
            passed=not found,
            score=1.0 - (len(found) / max(len(forbidden), 1)),
            details=f"Forbidden terms found: {found}" if found else "No hallucinations"
        )

    def run_all(self, query: str, response: str, config: Optional[dict] = None) -> list:
        """Run the configured checks and remember them in ``self.checks``.

        Relevance and length always run. Recognised ``config`` keys:
        ``relevance_threshold``, ``min_length``, ``max_length``,
        ``forbidden`` (list of terms, enables the forbidden-term check) and
        ``format_pattern`` (regex, enables the format check).

        Returns:
            The list of ``QualityCheck`` results, in the order relevance,
            length, then the optional forbidden-term and format checks.
        """
        config = config or {}
        checks = [
            self.check_relevance(query, response, config.get("relevance_threshold", 0.5)),
            self.check_length(response, config.get("min_length", 10), config.get("max_length", 2000)),
        ]
        if "forbidden" in config:
            checks.append(self.check_no_hallucination(response, config["forbidden"]))
        # Appended last so the positions of the existing checks stay stable.
        if "format_pattern" in config:
            checks.append(self.check_format(response, config["format_pattern"]))
        self.checks = checks
        return checks

3. Regression Testing

import hashlib
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

class LLMRegressionTest:
    """Snapshot-style regression checks for LLM responses.

    Each baseline is stored as ``<test_name>.json`` inside ``baseline_path``;
    the directory is created on construction if it does not exist.
    """

    def __init__(self, baseline_path: str = "./baselines"):
        self.baseline_path = Path(baseline_path)
        self.baseline_path.mkdir(parents=True, exist_ok=True)

    def _baseline_file(self, test_name: str) -> Path:
        """Return the JSON file path used for ``test_name``."""
        return self.baseline_path / f"{test_name}.json"

    def load_baseline(self, test_name: str) -> Optional[Dict]:
        """Return the stored baseline for ``test_name``, or ``None`` if absent."""
        try:
            raw = self._baseline_file(test_name).read_text(encoding="utf-8")
        except FileNotFoundError:
            return None
        return json.loads(raw)

    def save_baseline(self, test_name: str, result: Dict):
        """Write ``result`` as the baseline for ``test_name``.

        A ``timestamp`` (timezone-aware UTC, ISO 8601) and a ``hash`` (first
        16 hex characters of the SHA-256 of the JSON-encoded response) are
        added to the stored record. The caller's dict is not modified.

        Raises:
            KeyError: if ``result`` has no ``"response"`` entry.
        """
        record = dict(result)  # shallow copy so the caller's dict stays untouched
        record["timestamp"] = datetime.now(timezone.utc).isoformat()
        record["hash"] = hashlib.sha256(json.dumps(record["response"]).encode()).hexdigest()[:16]
        self._baseline_file(test_name).write_text(json.dumps(record, indent=2), encoding="utf-8")

    def compare_responses(self, baseline: str, current: str, threshold: float = 0.8) -> Dict:
        """Compare two responses by Jaccard similarity of their word sets.

        Words are lower-cased, whitespace-separated tokens. Two responses
        that both contain no words are treated as identical (similarity 1.0).

        Returns:
            Dict with ``similarity``, ``passed`` (similarity >= threshold),
            ``baseline_length``, ``current_length`` and ``length_change``
            (lengths in characters).
        """
        baseline_words = set(baseline.lower().split())
        current_words = set(current.lower().split())
        union = baseline_words | current_words
        if union:
            similarity = len(baseline_words & current_words) / len(union)
        else:
            # Nothing to compare on either side: not a regression.
            similarity = 1.0
        return {
            "similarity": similarity,
            "passed": similarity >= threshold,
            "baseline_length": len(baseline),
            "current_length": len(current),
            "length_change": len(current) - len(baseline)
        }

    def run_regression(self, test_name: str, prompt: str, current_response: str,
                       threshold: float = 0.8) -> Dict:
        """Compare ``current_response`` with the stored baseline for ``test_name``.

        If no baseline exists yet, the current response is saved as the
        baseline and ``{"status": "baseline_created", ...}`` is returned.
        Otherwise the result carries ``status`` (``"passed"`` or
        ``"regression_detected"``) plus the fields of
        :meth:`compare_responses`, evaluated at ``threshold``.
        """
        baseline = self.load_baseline(test_name)
        if baseline is None:
            self.save_baseline(test_name, {"prompt": prompt, "response": current_response})
            return {"status": "baseline_created", "test_name": test_name}
        comparison = self.compare_responses(baseline["response"], current_response, threshold)
        return {
            "test_name": test_name,
            "status": "passed" if comparison["passed"] else "regression_detected",
            **comparison
        }

Key Formulas

Semantic Similarity

$$\text{sim}(A, B) = \frac{\vec{A} \cdot \vec{B}}{\|\vec{A}\| \cdot \|\vec{B}\|}$$

Here,

  • $\vec{A}$ = Embedding vector of baseline response
  • $\vec{B}$ = Embedding vector of current response

Test Coverage

$$\text{Coverage} = \frac{|\text{Tested Scenarios}|}{|\text{All Scenarios}|} \times 100\%$$

Here,

  • $\text{Tested Scenarios}$ = Scenarios with passing tests
  • $\text{All Scenarios}$ = Total defined test scenarios

Testing Matrix

| Test Type | What It Validates | Speed | Cost | When to Run |
|---|---|---|---|---|
| Prompt Unit | Template rendering | Fast | Low | Every commit |
| Output Quality | Response format, length | Medium | Medium | Every PR |
| Regression | Response consistency | Slow | High | Pre-deploy |
| A/B Comparison | Model performance | Slow | High | Weekly |
| Stress Test | Rate limits, latency | Slow | High | Monthly |
| Adversarial | Safety, robustness | Slow | High | Quarterly |

Best Practices

  1. Test prompts independently before connecting to models
  2. Use mock LLM responses for deterministic unit tests
  3. Track regression baselines with versioned snapshots
  4. Set quality thresholds based on domain requirements
  5. Run adversarial tests to validate safety guardrails

Premium Content

LLM Testing: Unit Tests, Integration Tests, and Regression Testing

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯End-to-end Projects
💼Interview Prep
📜Certificates
🤝Community Access

Already a member? Log in

Need Expert AI Ops & LLM Ops Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement