LLM Testing: Unit Tests, Integration Tests, and Regression Testing
Testing LLM applications requires fundamentally different approaches from traditional software testing due to non-deterministic outputs, subjective quality metrics, and evolving model behavior.
LLM Testing Pipeline
Testing Taxonomy
1. Prompt Unit Tests
These tests validate prompt templates and formatting without executing the full LLM call chain.
import pytest
from typing import Dict, List, Any
class PromptUnitTest:
    """Unit tests for LLM prompt construction and validation.

    A template is rendered once with the configured variables and every
    registered test case is evaluated against that rendered text. No model
    is called.

    Placeholders may be written either as ``{{name}}`` or ``{name}``.
    Placeholders whose name has no configured variable are left untouched.
    """

    def __init__(self, prompt_template: str):
        """Store the raw template; variables and test cases start empty."""
        self.template = prompt_template
        self.variables: Dict[str, Any] = {}
        self.test_cases: List[Dict[str, Any]] = []

    def set_variables(self, **kwargs):
        """Replace the variable set used by :meth:`render`; returns ``self``."""
        self.variables = kwargs
        return self

    def add_test_case(self, name: str, expected_contains: str = None,
                      expected_not_contains: str = None,
                      max_length: int = None):
        """Register one check against the rendered prompt; returns ``self``.

        Args:
            name: Label reported in the result for this case.
            expected_contains: Substring that must appear in the prompt.
            expected_not_contains: Substring that must not appear.
            max_length: Maximum allowed number of characters.

        Criteria left as ``None`` are skipped.
        """
        self.test_cases.append({
            "name": name,
            "expected_contains": expected_contains,
            "expected_not_contains": expected_not_contains,
            "max_length": max_length,
        })
        return self

    def render(self) -> str:
        """Return the template with every known placeholder substituted.

        Substitution happens in a single pass over the template, so text
        inserted for one variable is never re-scanned. A value that happens
        to contain ``{{other}}`` stays literal instead of being expanded.
        """
        import re  # local import: the surrounding snippet does not import re

        if not self.variables:
            return self.template

        names = "|".join(re.escape(key) for key in self.variables)
        # Try the double-brace form first so "{{x}}" is consumed whole.
        placeholder = re.compile(r"\{\{(%s)\}\}|\{(%s)\}" % (names, names))

        def substitute(match):
            # Exactly one group matched; lastindex tells us which.
            return str(self.variables[match.group(match.lastindex)])

        return placeholder.sub(substitute, self.template)

    def run(self) -> List[Dict[str, Any]]:
        """Evaluate all registered test cases against the rendered prompt.

        Returns:
            One dict per test case with keys ``name``, ``passed`` and
            ``errors`` (a list of human-readable failure messages).
        """
        rendered = self.render()
        results = []
        for test in self.test_cases:
            errors = []
            if test["expected_contains"] and test["expected_contains"] not in rendered:
                errors.append(f"Missing: {test['expected_contains']}")
            if test["expected_not_contains"] and test["expected_not_contains"] in rendered:
                errors.append(f"Unexpected: {test['expected_not_contains']}")
            # Compare against None so that max_length=0 is still enforced.
            if test["max_length"] is not None and len(rendered) > test["max_length"]:
                errors.append(f"Too long: {len(rendered)} > {test['max_length']}")
            results.append({"name": test["name"], "passed": not errors, "errors": errors})
        return results
def test_prompt_rendering():
    """Render a support-agent prompt and verify content and length checks."""
    # PromptUnitTest.render substitutes double-brace placeholders ({{name}}).
    template = "You are a {{role}}. Answer {{question}} concisely."
    test = PromptUnitTest(template)
    test.set_variables(role="customer support agent", question="What is your return policy?")
    test.add_test_case("contains role", expected_contains="customer support agent")
    test.add_test_case("contains question", expected_contains="return policy")
    test.add_test_case("reasonable length", max_length=200)
    results = test.run()
    # Report which checks failed instead of a bare "assert False".
    failures = [r for r in results if not r["passed"]]
    assert not failures, f"Prompt checks failed: {failures}"
2. Output Quality Tests
import re
from typing import Optional
from dataclasses import dataclass
@dataclass
class QualityCheck:
    """Outcome of a single quality check on an LLM response.

    Attributes:
        name: Identifier of the check that produced this result.
        passed: Whether the check succeeded.
        score: Numeric score reported by the check.
        details: Human-readable explanation of the outcome.
    """

    name: str
    passed: bool
    score: float
    details: str
class LLMOutputTester:
    """Heuristic, model-free quality checks for a single LLM response.

    Each ``check_*`` method returns a ``QualityCheck``. ``run_all`` runs a
    configurable subset of them and remembers the results in ``self.checks``.
    """

    def __init__(self):
        """Start with no recorded check results."""
        self.checks: list = []

    @staticmethod
    def _words(text: str) -> set:
        """Return the set of lower-cased word tokens in *text*.

        Tokenising on word characters (rather than whitespace) keeps
        punctuation from hiding matches, e.g. "policy?" vs. "policy".
        """
        return set(re.findall(r"\w+", text.lower()))

    def check_relevance(self, query: str, response: str, threshold: float = 0.5) -> QualityCheck:
        """Score relevance as the fraction of query words found in the response.

        Passes when that fraction is at least *threshold*. An empty query
        yields a score of 0.
        """
        query_words = self._words(query)
        response_words = self._words(response)
        # max(..., 1) avoids dividing by zero for an empty query.
        overlap = len(query_words & response_words) / max(len(query_words), 1)
        return QualityCheck(
            name="relevance",
            passed=overlap >= threshold,
            score=overlap,
            details=f"Keyword overlap: {overlap:.2%}"
        )

    def check_format(self, response: str, pattern: str) -> QualityCheck:
        """Pass when the regular expression *pattern* matches anywhere in *response*."""
        match = bool(re.search(pattern, response))
        return QualityCheck(
            name="format",
            passed=match,
            score=1.0 if match else 0.0,
            details=f"Pattern match: {match}"
        )

    def check_length(self, response: str, min_len: int = 10, max_len: int = 2000) -> QualityCheck:
        """Pass when the character count of *response* is within [min_len, max_len]."""
        length = len(response)
        passed = min_len <= length <= max_len
        return QualityCheck(
            name="length",
            passed=passed,
            score=1.0 if passed else 0.0,
            details=f"Length: {length} (range: {min_len}-{max_len})"
        )

    def check_no_hallucination(self, response: str, forbidden: list[str]) -> QualityCheck:
        """Pass when none of the *forbidden* terms occur in *response*.

        Matching is a case-insensitive substring test. The score is the
        fraction of forbidden terms that were NOT found.
        """
        lowered = response.lower()  # hoisted: identical for every term
        found = [term for term in forbidden if term.lower() in lowered]
        return QualityCheck(
            name="no_hallucination",
            passed=not found,
            score=1.0 - (len(found) / max(len(forbidden), 1)),
            details=f"Forbidden terms found: {found}" if found else "No hallucinations"
        )

    def run_all(self, query: str, response: str, config: Optional[dict] = None) -> list:
        """Run the configured checks and return their results.

        Relevance and length always run. Recognised *config* keys:

        - ``relevance_threshold`` (default 0.5)
        - ``min_length`` / ``max_length`` (defaults 10 / 2000)
        - ``forbidden``: list of terms; enables the no-hallucination check
        - ``pattern``: regular expression; enables the format check

        The results are also stored in ``self.checks``.
        """
        config = config or {}
        checks = [
            self.check_relevance(query, response, config.get("relevance_threshold", 0.5)),
            self.check_length(response, config.get("min_length", 10), config.get("max_length", 2000)),
        ]
        if "forbidden" in config:
            checks.append(self.check_no_hallucination(response, config["forbidden"]))
        # Appended last so the positions of the checks above do not shift.
        if "pattern" in config:
            checks.append(self.check_format(response, config["pattern"]))
        self.checks = checks
        return checks
3. Regression Testing
import hashlib
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
class LLMRegressionTest:
    """Snapshot-style regression testing for LLM responses.

    The first response seen for a test name is stored as a JSON baseline
    under ``baseline_path``. Later responses are compared with that baseline
    using word-level Jaccard similarity.
    """

    def __init__(self, baseline_path: str = "./baselines"):
        """Remember the baseline directory, creating it if necessary."""
        self.baseline_path = Path(baseline_path)
        self.baseline_path.mkdir(parents=True, exist_ok=True)

    def _baseline_file(self, test_name: str) -> Path:
        """Return the path of the JSON baseline file for *test_name*."""
        return self.baseline_path / f"{test_name}.json"

    def load_baseline(self, test_name: str) -> Optional[Dict]:
        """Return the stored baseline for *test_name*, or ``None`` if absent."""
        try:
            raw = self._baseline_file(test_name).read_text(encoding="utf-8")
        except FileNotFoundError:
            return None
        return json.loads(raw)

    def save_baseline(self, test_name: str, result: Dict) -> Dict:
        """Write *result* as the baseline for *test_name*.

        The stored record is a copy of *result* extended with a UTC
        ``timestamp`` and a 16-hex-character ``hash`` of the response. The
        caller's dict is left unmodified.

        Args:
            test_name: Name used to build the baseline file name.
            result: Mapping that must contain a ``"response"`` key.

        Returns:
            The record that was written to disk.
        """
        record = dict(result)  # shallow copy: do not mutate the argument
        record["timestamp"] = datetime.now(timezone.utc).isoformat()
        record["hash"] = hashlib.sha256(json.dumps(record["response"]).encode()).hexdigest()[:16]
        self._baseline_file(test_name).write_text(json.dumps(record, indent=2), encoding="utf-8")
        return record

    def compare_responses(self, baseline: str, current: str, threshold: float = 0.8) -> Dict:
        """Compare two responses by Jaccard similarity of their word sets.

        Words are lower-cased, whitespace-separated tokens. Two responses
        that both contain no words are treated as identical (similarity 1.0).

        Returns:
            Dict with ``similarity``, ``passed`` (similarity >= threshold),
            ``baseline_length``, ``current_length`` and ``length_change``
            (lengths are in characters).
        """
        baseline_words = set(baseline.lower().split())
        current_words = set(current.lower().split())
        union = baseline_words | current_words
        if union:
            similarity = len(baseline_words & current_words) / len(union)
        else:
            # Nothing to compare on either side: no regression.
            similarity = 1.0
        return {
            "similarity": similarity,
            "passed": similarity >= threshold,
            "baseline_length": len(baseline),
            "current_length": len(current),
            "length_change": len(current) - len(baseline)
        }

    def run_regression(self, test_name: str, prompt: str, current_response: str,
                       threshold: float = 0.8) -> Dict:
        """Compare *current_response* with the stored baseline for *test_name*.

        If no baseline exists yet, the current response is saved as the
        baseline and ``{"status": "baseline_created", ...}`` is returned.
        Otherwise the result carries ``status`` ("passed" or
        "regression_detected") plus every field from
        :meth:`compare_responses`.

        Args:
            test_name: Baseline identifier.
            prompt: Prompt that produced the response; stored with a new baseline.
            current_response: Response to check.
            threshold: Minimum similarity required to pass.
        """
        baseline = self.load_baseline(test_name)
        if baseline is None:
            self.save_baseline(test_name, {"prompt": prompt, "response": current_response})
            return {"status": "baseline_created", "test_name": test_name}
        comparison = self.compare_responses(baseline["response"], current_response, threshold)
        return {
            "test_name": test_name,
            "status": "passed" if comparison["passed"] else "regression_detected",
            **comparison
        }
Key Formulas
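Semantic Similarity
$$\text{sim}(\mathbf{e}_b, \mathbf{e}_c) = \frac{\mathbf{e}_b \cdot \mathbf{e}_c}{\lVert \mathbf{e}_b \rVert \, \lVert \mathbf{e}_c \rVert}$$
Here,
- $\mathbf{e}_b$ = Embedding vector of baseline response
- $\mathbf{e}_c$ = Embedding vector of current response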
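Test Coverage
$$\text{Coverage} = \frac{N_{\text{pass}}}{N_{\text{total}}} \times 100\%$$
Here,
- $N_{\text{pass}}$ = Scenarios with passing tests
- $N_{\text{total}}$ = Total defined test scenarios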
Testing Matrix
| Test Type | What It Validates | Speed | Cost | When to Run |
|---|---|---|---|---|
| Prompt Unit | Template rendering | Fast | Low | Every commit |
| Output Quality | Response relevance, format, length, forbidden terms | Medium | Medium | Every PR |
| Regression | Response consistency | Slow | High | Pre-deploy |
| A/B Comparison | Model performance | Slow | High | Weekly |
| Stress Test | Rate limits, latency | Slow | High | Monthly |
| Adversarial | Safety, robustness | Slow | High | Quarterly |
Best Practices
- Test prompts independently before connecting to models
- Use mock LLM responses for deterministic unit tests
- Track regression baselines with versioned snapshots
- Set quality thresholds based on domain requirements
- Run adversarial tests to validate safety guardrails