LLM CI/CD: Continuous Integration and Deployment
CI/CD for LLM applications requires unique considerations: non-deterministic outputs, cost-per-test, multi-model compatibility, and gradual rollout strategies that protect production users.
LLM CI/CD Pipeline
Pipeline Stages
1. Prompt Version Control
import hashlib
import json
import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional
@dataclass
class PromptVersion:
    """One versioned snapshot of a prompt template and the model it targets.

    Attributes:
        version_id: Identifier of this version.
        prompt_name: Name of the prompt this version belongs to.
        template: The prompt template text.
        variables: String-to-string mapping describing the template's
            variables. NOTE(review): whether the values are types,
            descriptions or defaults is not visible here - confirm.
        model: Name of the model the prompt is written for.
        created_at: Naive UTC creation timestamp in ISO-8601 form; filled in
            automatically when not supplied.
        author: Who created the version; defaults to ``"system"``.
        metadata: Free-form extra data; defaults to a fresh empty dict.
    """

    version_id: str
    prompt_name: str
    template: str
    variables: Dict[str, str]
    model: str
    # Take an aware UTC "now" and drop tzinfo: this yields the same naive ISO
    # string datetime.utcnow() produced, without calling that deprecated API
    # (deprecated since Python 3.12).
    created_at: str = field(
        default_factory=lambda: datetime.now(timezone.utc)
        .replace(tzinfo=None)
        .isoformat()
    )
    author: str = "system"
    metadata: Dict = field(default_factory=dict)

    @property
    def content_hash(self) -> str:
        """Return a 12-character hex fingerprint of the prompt's content.

        The fingerprint is the first 12 hex digits of the SHA-256 of the
        template, the variables serialised as key-sorted JSON, and the model
        name, concatenated in that order. Identity fields (``version_id``,
        ``author``, ``created_at``, ``metadata``) do not influence it.
        """
        content = f"{self.template}{json.dumps(self.variables, sort_keys=True)}{self.model}"
        return hashlib.sha256(content.encode()).hexdigest()[:12]
class PromptRegistry:
    """File-backed store of prompt versions.

    Every version is saved as
    ``<registry_path>/<prompt_name>/<version_id>.json``.
    """

    def __init__(self, registry_path: str = "./prompts"):
        """Remember the registry root and create it if it does not exist."""
        self.registry_path = Path(registry_path)
        self.registry_path.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _version_sort_key(version_id: str) -> tuple:
        """Return a key that orders version ids with numbers compared as numbers.

        A plain string sort puts ``"v10"`` before ``"v2"`` and ``"1.10.0"``
        before ``"1.9.0"``. Splitting on digit runs and converting those runs
        to ``int`` fixes that. This is a natural sort, not a full
        semantic-version comparison (pre-release tags get no special rank).
        """
        # re.split with a capturing group always alternates text, digits,
        # text, ... so odd positions are the digit runs and the element type
        # at any given position is the same for every key.
        chunks = re.split(r"([0-9]+)", version_id)
        return tuple(int(chunk) if i % 2 else chunk for i, chunk in enumerate(chunks))

    @staticmethod
    def _load(path: Path) -> "PromptVersion":
        """Rebuild a ``PromptVersion`` from one stored JSON file."""
        data = json.loads(path.read_text(encoding="utf-8"))
        # content_hash is a derived property, not a constructor argument.
        data.pop("content_hash", None)
        return PromptVersion(**data)

    def register(self, version: "PromptVersion") -> str:
        """Write ``version`` to disk and return its ``version_id``.

        An existing file for the same prompt name and version id is
        overwritten.
        """
        versions_dir = self.registry_path / version.prompt_name
        # parents=True so registration still works if the registry root was
        # removed after this object was constructed.
        versions_dir.mkdir(parents=True, exist_ok=True)
        path = versions_dir / f"{version.version_id}.json"
        payload = {
            "version_id": version.version_id,
            "prompt_name": version.prompt_name,
            "template": version.template,
            "variables": version.variables,
            "model": version.model,
            "created_at": version.created_at,
            "author": version.author,
            "content_hash": version.content_hash,
            "metadata": version.metadata,
        }
        path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
        return version.version_id

    def get_latest(self, prompt_name: str) -> Optional["PromptVersion"]:
        """Return the highest-numbered stored version of ``prompt_name``.

        "Highest" is decided by a natural sort of the version ids (the file
        stems), so ``v10`` ranks above ``v2``. Returns ``None`` when the
        prompt has no stored versions.
        """
        versions_dir = self.registry_path / prompt_name
        if not versions_dir.is_dir():
            return None
        files = list(versions_dir.glob("*.json"))
        if not files:
            return None
        # The raw stem is a tie-breaker so ids such as "v1" and "v01", which
        # share a natural key, still resolve deterministically.
        newest = max(files, key=lambda p: (self._version_sort_key(p.stem), p.stem))
        return self._load(newest)

    def get_version(self, prompt_name: str, version_id: str) -> Optional["PromptVersion"]:
        """Return the stored version with this exact id, or ``None`` if absent."""
        path = self.registry_path / prompt_name / f"{version_id}.json"
        if not path.is_file():
            return None
        return self._load(path)
2. Automated Quality Gates
from dataclasses import dataclass
from typing import List, Callable
@dataclass
class QualityGateResult:
    """Outcome of a single quality check.

    Attributes:
        gate_name: Name of the check that produced this result.
        passed: Whether the check is considered passed.
        score: Numeric score the check returned.
        threshold: Score the check was compared against.
        details: Human-readable description of the outcome.
    """

    gate_name: str
    passed: bool
    score: float
    threshold: float
    details: str
class LLMQualityGate:
    """Runs a set of scoring checks against a prompt/response pair.

    Register checks with :meth:`add_check`, run them with :meth:`evaluate`,
    then query :meth:`all_passed` or :meth:`summary`.
    """

    def __init__(self):
        # Each entry is a dict: {"name": str, "fn": callable, "threshold": float}.
        self.checks: List[dict] = []
        # Results of the most recent evaluate() call.
        self.results: List["QualityGateResult"] = []

    def add_check(self, name: str, check_fn: Callable, threshold: float) -> None:
        """Register a check.

        Args:
            name: Label used in results and in the summary.
            check_fn: Called as ``check_fn(prompt, response)``; must return a
                numeric score.
            threshold: Minimum score (inclusive) for the check to pass.
        """
        self.checks.append({"name": name, "fn": check_fn, "threshold": threshold})

    def evaluate(self, prompt: str, response: str) -> List["QualityGateResult"]:
        """Run every registered check and return the fresh list of results.

        Results from any earlier call are discarded. A check passes when its
        score is greater than or equal to its threshold.
        """
        self.results = []
        for check in self.checks:
            limit = check["threshold"]
            score = check["fn"](prompt, response)
            self.results.append(
                QualityGateResult(
                    gate_name=check["name"],
                    passed=score >= limit,
                    score=score,
                    threshold=limit,
                    details=f"Score: {score:.3f} (threshold: {limit:.3f})",
                )
            )
        return self.results

    def all_passed(self) -> bool:
        """Return True only if there is at least one result and all passed.

        ``all()`` of an empty list is True, so without the emptiness guard a
        gate that was never evaluated (or has no checks registered) would
        report success. A deployment gate should fail closed in that case.
        """
        return bool(self.results) and all(r.passed for r in self.results)

    def summary(self) -> str:
        """Return one ``PASS``/``FAIL`` line per result, joined by newlines."""
        lines = [
            f"{'PASS' if r.passed else 'FAIL'}: {r.gate_name} - {r.details}"
            for r in self.results
        ]
        return "\n".join(lines)
3. Canary Deployment Manager
import random
from dataclasses import dataclass
from typing import Dict, List, Optional
from datetime import datetime
@dataclass
class CanaryConfig:
    """State of one canary rollout.

    Attributes:
        name: Name used to look the canary up.
        model: Model the canary serves.
        prompt_version: Identifier of the prompt version the canary serves.
        traffic_percent: Share of traffic routed to the canary, on a 0-100
            scale.
        start_time: When the canary was created.
        metrics: Optional mapping of metric name to value; ``None`` until
            something assigns it.
        status: Lifecycle state; starts as ``"active"``.
    """

    name: str
    model: str
    prompt_version: str
    traffic_percent: float
    start_time: datetime
    # Annotated Optional because the default really is None, not a dict.
    metrics: Optional[Dict[str, float]] = None
    status: str = "active"
class CanaryDeployment:
    """In-memory manager for canary rollouts.

    Canaries are kept in a plain list and looked up by name.
    """

    def __init__(self):
        self.canaries: List["CanaryConfig"] = []

    def deploy_canary(self, name: str, model: str, prompt_version: str,
                      traffic_percent: float = 5.0) -> "CanaryConfig":
        """Create an active canary, record it, and return it.

        Args:
            name: Name used later to route, promote, or roll back.
            model: Model the canary serves.
            prompt_version: Prompt version the canary serves.
            traffic_percent: Share of traffic (0-100 scale) to route to it.
        """
        canary = CanaryConfig(
            name=name,
            model=model,
            prompt_version=prompt_version,
            traffic_percent=traffic_percent,
            # Naive UTC, the same value datetime.utcnow() returned, obtained
            # without that deprecated call (deprecated since Python 3.12).
            start_time=datetime.now(timezone.utc).replace(tzinfo=None),
        )
        self.canaries.append(canary)
        return canary

    def should_use_canary(self, canary_name: str) -> bool:
        """Randomly decide whether this request goes to the named canary.

        Only the first canary with this name whose status is ``"active"`` is
        considered; it is chosen with probability ``traffic_percent / 100``.
        Returns False when no such canary exists.

        NOTE(review): a canary promoted to 100% has status ``"promoted"`` and
        is therefore never selected here - confirm that routing for promoted
        versions is handled elsewhere.
        """
        active = next(
            (c for c in self.canaries
             if c.name == canary_name and c.status == "active"),
            None,
        )
        if active is None:
            return False
        return random.random() * 100 < active.traffic_percent

    def promote_canary(self, canary_name: str, to_percent: float = 100.0) -> bool:
        """Set the traffic share of every canary with this name.

        When ``to_percent`` is 100 or more the status becomes ``"promoted"``.

        Returns:
            True if at least one canary matched, False otherwise, so a
            mistyped name no longer goes unnoticed.
        """
        matched = False
        for canary in self.canaries:
            if canary.name != canary_name:
                continue
            matched = True
            canary.traffic_percent = to_percent
            if to_percent >= 100:
                canary.status = "promoted"
        return matched

    def rollback_canary(self, canary_name: str) -> bool:
        """Cut traffic to zero and mark every canary with this name rolled back.

        Returns:
            True if at least one canary matched, False otherwise, so a
            rollback aimed at an unknown name can be detected by the caller.
        """
        matched = False
        for canary in self.canaries:
            if canary.name != canary_name:
                continue
            matched = True
            canary.traffic_percent = 0
            canary.status = "rolled_back"
        return matched
Key Formulas
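Canary Success Rate
$$\text{Canary Success Rate} = \frac{N_{\text{positive}}}{N_{\text{total}}}$$
Here,
- $N_{\text{positive}}$ = Number of positive feedback responses
- $N_{\text{total}}$ = Total canary responses served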
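Deployment Confidence Score
$$\text{Deployment Confidence} = \prod_{i=1}^{n} \left(1 - p_i\right)$$
Here,
- $p_i$ = Probability of failure for quality check $i$
- $n$ = Number of quality checks

The product form assumes the quality checks fail independently of one another.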
CI/CD Configuration
| Stage | Tool | Trigger | Timeout | Pass Criteria |
|---|---|---|---|---|
| Lint | Ruff/Black | Every commit | 1 min | No errors |
| Unit Tests | Pytest | Every commit | 5 min | 100% pass |
| Integration | Pytest | Pull request | 15 min | 95% pass |
| Quality Gate | Custom | Pull request | 10 min | All gates pass |
| Canary Deploy | Argo Rollouts | Merge to main | 30 min | Metrics stable |
| Full Rollout | Argo Rollouts | After canary | 1 hour | No regressions |
Best Practices
- Version every prompt with semantic versioning
- Use mock LLM calls in CI to reduce cost and improve speed
- Run quality gates before merging to main
- Deploy canaries at low traffic percentages first
- Monitor canary metrics for at least 24 hours before promotion
- Automate rollback when quality metrics degrade