LLM CI/CD: Continuous Integration and Deployment for LLM Applications

LLM CI/CD: Continuous Integration and Deployment

CI/CD for LLM applications requires unique considerations: non-deterministic outputs, cost-per-test, multi-model compatibility, and gradual rollout strategies that protect production users.

LLM CI/CD Pipeline

Pipeline Stages

1. Prompt Version Control

import hashlib
import json
import re
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional

@dataclass
class PromptVersion:
    """One versioned prompt definition: template, variables and target model.

    ``created_at`` defaults to the ISO-8601 rendering of the naive UTC time at
    which the instance is constructed; ``author`` defaults to ``"system"`` and
    ``metadata`` to a fresh empty dict per instance.
    """

    version_id: str
    prompt_name: str
    template: str
    variables: Dict[str, str]
    model: str
    created_at: str = field(default_factory=lambda: datetime.utcnow().isoformat())
    author: str = "system"
    metadata: Dict = field(default_factory=dict)

    @property
    def content_hash(self) -> str:
        """Return the first 12 hex characters of a SHA-256 content fingerprint.

        The fingerprint covers the template, the variables (serialized with
        sorted keys so that dict ordering does not matter) and the model name.
        ``version_id``, ``prompt_name``, ``author`` and ``metadata`` do not
        contribute to it.
        """
        canonical_variables = json.dumps(self.variables, sort_keys=True)
        fingerprint_source = "{}{}{}".format(
            self.template, canonical_variables, self.model
        )
        digest = hashlib.sha256(fingerprint_source.encode())
        return digest.hexdigest()[:12]

class PromptRegistry:
    """File-backed store of prompt versions.

    Each registered version is written as one JSON document at
    ``<registry_path>/<prompt_name>/<version_id>.json``.
    """

    def __init__(self, registry_path: str = "./prompts"):
        """Remember the registry root and create it if it does not exist."""
        self.registry_path = Path(registry_path)
        self.registry_path.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _version_sort_key(version_id: str) -> list:
        """Return a natural-order sort key for *version_id*.

        Runs of ASCII digits compare numerically, everything else compares as
        text, so ``"v10"`` sorts after ``"v2"`` and ``"1.10.0"`` after
        ``"1.9.0"``. This is not full semantic-versioning precedence
        (pre-release tags are compared as plain text).
        """
        # re.split with a capturing group yields text, digits, text, ... and
        # always starts with a (possibly empty) text chunk, so odd indexes are
        # the digit runs and ints are only ever compared with ints.
        return [
            int(chunk) if index % 2 else chunk
            for index, chunk in enumerate(re.split(r"([0-9]+)", version_id))
        ]

    @staticmethod
    def _load(path: Path) -> "PromptVersion":
        """Read one stored JSON document and rebuild its PromptVersion."""
        data = json.loads(path.read_text(encoding="utf-8"))
        # content_hash is stored for inspection but is a computed property on
        # PromptVersion, not a constructor argument.
        data.pop("content_hash", None)
        return PromptVersion(**data)

    def register(self, version: "PromptVersion") -> str:
        """Persist *version* and return its ``version_id``.

        An existing file for the same prompt name and version id is
        overwritten.
        """
        versions_dir = self.registry_path / version.prompt_name
        # parents=True so registration still works if the registry root was
        # removed after construction.
        versions_dir.mkdir(parents=True, exist_ok=True)
        payload = {
            "version_id": version.version_id,
            "prompt_name": version.prompt_name,
            "template": version.template,
            "variables": version.variables,
            "model": version.model,
            "created_at": version.created_at,
            "author": version.author,
            "content_hash": version.content_hash,
            "metadata": version.metadata,
        }
        path = versions_dir / f"{version.version_id}.json"
        path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
        return version.version_id

    def get_latest(self, prompt_name: str) -> Optional["PromptVersion"]:
        """Return the highest-numbered stored version of *prompt_name*.

        Versions are ordered by the natural order of their ids (see
        ``_version_sort_key``) rather than plain string order, which would
        rank ``v10`` below ``v2``. Returns ``None`` when the prompt has no
        stored versions.
        """
        versions_dir = self.registry_path / prompt_name
        if not versions_dir.is_dir():
            return None
        latest = max(
            versions_dir.glob("*.json"),
            # The raw stem is a tie-breaker so that ids with equal natural
            # keys (e.g. "v1" and "v01") still resolve deterministically.
            key=lambda candidate: (
                self._version_sort_key(candidate.stem),
                candidate.stem,
            ),
            default=None,
        )
        if latest is None:
            return None
        return self._load(latest)

    def get_version(self, prompt_name: str, version_id: str) -> Optional["PromptVersion"]:
        """Return the stored version with exactly this id, or ``None``."""
        path = self.registry_path / prompt_name / f"{version_id}.json"
        if not path.is_file():
            return None
        return self._load(path)

2. Automated Quality Gates

from dataclasses import dataclass
from typing import List, Callable

@dataclass
class QualityGateResult:
    """Outcome of running a single quality check against one response."""

    # Name the check was registered under.
    gate_name: str
    # True when the score met the threshold (score >= threshold).
    passed: bool
    # Value returned by the check function.
    score: float
    # Minimum score required for the check to pass.
    threshold: float
    # Human-readable description of the score and threshold.
    details: str

class LLMQualityGate:
    """Runs a list of scored checks against a prompt/response pair.

    Each check is a callable invoked as ``check_fn(prompt, response)`` that
    returns a numeric score; the check passes when the score is greater than
    or equal to its threshold.
    """

    def __init__(self):
        # Each entry is {"name": str, "fn": callable, "threshold": float}.
        self.checks: List[dict] = []
        # Results of the most recent evaluate() call.
        self.results: List[QualityGateResult] = []

    def add_check(self, name: str, check_fn: Callable, threshold: float):
        """Register a check to be run by every subsequent ``evaluate`` call."""
        self.checks.append({"name": name, "fn": check_fn, "threshold": threshold})

    def evaluate(self, prompt: str, response: str) -> List["QualityGateResult"]:
        """Run every registered check and return one result per check.

        Previous results are discarded first, so if a check raises, stale
        results from an earlier call are never left behind.
        """
        self.results = []
        for check in self.checks:
            score = check["fn"](prompt, response)
            result = QualityGateResult(
                gate_name=check["name"],
                passed=score >= check["threshold"],
                score=score,
                threshold=check["threshold"],
                details=f"Score: {score:.3f} (threshold: {check['threshold']:.3f})"
            )
            self.results.append(result)
        return self.results

    def all_passed(self) -> bool:
        """Return True only if at least one check ran and every check passed.

        ``all()`` over an empty list is True, which would let a gate that was
        never evaluated (or has no checks) report success; an empty result
        list is therefore treated as a failure.
        """
        return bool(self.results) and all(r.passed for r in self.results)

    def summary(self) -> str:
        """Return one ``PASS``/``FAIL`` line per result, newline-separated."""
        lines = [f"{'PASS' if r.passed else 'FAIL'}: {r.gate_name} - {r.details}" for r in self.results]
        return "\n".join(lines)

3. Canary Deployment Manager

import random
from dataclasses import dataclass
from typing import Dict, List, Optional
from datetime import datetime

@dataclass
class CanaryConfig:
    """State of one canary release: what is deployed and how much traffic it gets."""

    # Identifier used to look the canary up.
    name: str
    # Model served by the canary.
    model: str
    # Prompt version served by the canary.
    prompt_version: str
    # Share of traffic routed to the canary, on a 0-100 scale.
    traffic_percent: float
    # When the canary was deployed.
    start_time: datetime
    # Optional metric values; stays None unless the caller supplies a dict.
    metrics: Optional[Dict[str, float]] = None
    # Lifecycle state: "active", "promoted" or "rolled_back".
    status: str = "active"

class CanaryDeployment:
    """In-memory manager for canary releases and their traffic split.

    Canary lifecycle: ``"active"`` (serves ``traffic_percent`` of requests),
    ``"promoted"`` (serves every request) or ``"rolled_back"`` (serves none).
    """

    def __init__(self):
        self.canaries: List[CanaryConfig] = []

    def deploy_canary(self, name: str, model: str, prompt_version: str,
                      traffic_percent: float = 5.0) -> "CanaryConfig":
        """Create an active canary, record it and return its config.

        ``start_time`` is set to the current naive UTC time.
        """
        canary = CanaryConfig(
            name=name,
            model=model,
            prompt_version=prompt_version,
            traffic_percent=traffic_percent,
            start_time=datetime.utcnow()
        )
        self.canaries.append(canary)
        return canary

    def should_use_canary(self, canary_name: str) -> bool:
        """Decide whether the current request should be served by the canary.

        A promoted canary always serves the request: previously promotion
        flipped the status away from ``"active"`` and this method then
        returned False, so promoting to 100% actually cut the canary's
        traffic to zero. An active canary is chosen at random with
        probability ``traffic_percent`` / 100. Unknown or rolled-back
        canaries are never used.
        """
        for canary in self.canaries:
            if canary.name != canary_name:
                continue
            if canary.status == "promoted":
                return True
            if canary.status == "active":
                return random.random() * 100 < canary.traffic_percent
        return False

    def promote_canary(self, canary_name: str, to_percent: float = 100.0):
        """Raise the canary's traffic share; at 100% or more mark it promoted.

        Rolled-back canaries are left untouched so that a rollback cannot be
        undone by a later promotion call for the same name. Unknown names are
        ignored.
        """
        for canary in self.canaries:
            if canary.name != canary_name or canary.status == "rolled_back":
                continue
            canary.traffic_percent = to_percent
            if to_percent >= 100:
                canary.status = "promoted"

    def rollback_canary(self, canary_name: str):
        """Stop routing traffic to the canary and mark it rolled back.

        Unknown names are ignored.
        """
        for canary in self.canaries:
            if canary.name == canary_name:
                canary.traffic_percent = 0
                canary.status = "rolled_back"

Key Formulas

Canary Success Rate

$$R_{success} = \frac{N_{positive}}{N_{total}} \times 100\%$$

Here,

$N_{positive}$ = Number of positive feedback responses
$N_{total}$ = Total number of canary responses served

Deployment Confidence Score

$$\text{Confidence} = \prod_{i=1}^{n} \left(1 - p_i\right)$$

Here,

$p_i$ = Probability of failure for quality check $i$
$n$ = Number of quality checks (the product assumes the checks fail independently of one another)

CI/CD Configuration

Stage	Tool	Trigger	Timeout	Pass Criteria
Lint	Ruff/Black	Every commit	1 min	No errors
Unit Tests	Pytest	Every commit	5 min	100% pass
Integration	Pytest	Pull request	15 min	95% pass
Quality Gate	Custom	Pull request	10 min	All gates pass
Canary Deploy	Argo Rollouts	Merge to main	30 min	Metrics stable
Full Rollout	Argo Rollouts	After canary	1 hour	No regressions

Best Practices

Version every prompt with semantic versioning
Use mock LLM calls in CI to reduce cost and improve speed
Run quality gates before merging to main
Deploy canaries at low traffic percentages first
Monitor canary metrics for at least 24 hours before promotion
Automate rollback when quality metrics degrade

LLM CI/CD: Continuous Integration and Deployment for LLM Applications

LLM CI/CD: Continuous Integration and Deployment

LLM CI/CD Pipeline

Pipeline Stages

1. Prompt Version Control

2. Automated Quality Gates

3. Canary Deployment Manager

Key Formulas

Canary Success Rate

Deployment Confidence Score

CI/CD Configuration

Best Practices

Premium Content

Need Expert AI Ops & LLM Ops Help?