🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services · ℹ️ About · ✉️ Contact · View Pricing Plans from $10

LLM CI/CD: Continuous Integration and Deployment for LLM Applications

LLMOps in Production › LLM CI/CD Pipelines — 🟢 Free Lesson

Advertisement

LLM CI/CD: Continuous Integration and Deployment

CI/CD for LLM applications requires unique considerations: non-deterministic outputs, cost-per-test, multi-model compatibility, and gradual rollout strategies that protect production users.

LLM CI/CD Pipeline

Pipeline Stages

1. Prompt Version Control

import hashlib
import json
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, Optional

@dataclass
class PromptVersion:
    """A single stored version of a prompt template.

    `created_at` defaults to the current UTC time rendered as a naive
    ISO-8601 string (no offset suffix); `metadata` defaults to a fresh dict.
    """

    version_id: str
    prompt_name: str
    template: str
    variables: Dict[str, str]
    model: str
    # datetime.utcnow() is deprecated since Python 3.12.  Dropping tzinfo after
    # taking an aware "now" yields exactly the same naive-UTC string format, so
    # previously written records stay comparable with new ones.
    created_at: str = field(
        default_factory=lambda: datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    )
    author: str = "system"
    metadata: Dict = field(default_factory=dict)

    @property
    def content_hash(self) -> str:
        """Return the first 12 hex digits of a SHA-256 over template, variables and model.

        Variables are serialized with sorted keys so that dict ordering does
        not affect the hash.
        """
        content = f"{self.template}{json.dumps(self.variables, sort_keys=True)}{self.model}"
        return hashlib.sha256(content.encode()).hexdigest()[:12]


class PromptRegistry:
    """File-backed prompt store.

    Each version is written to `<registry_path>/<prompt_name>/<version_id>.json`.
    """

    def __init__(self, registry_path: str = "./prompts"):
        """Create the registry root directory if it does not exist yet."""
        self.registry_path = Path(registry_path)
        self.registry_path.mkdir(parents=True, exist_ok=True)

    def register(self, version: PromptVersion) -> str:
        """Write `version` to disk and return its `version_id`.

        An existing file for the same prompt name and version id is overwritten.
        """
        versions_dir = self.registry_path / version.prompt_name
        # parents=True: keep working even if the registry root was removed
        # after this object was constructed.
        versions_dir.mkdir(parents=True, exist_ok=True)
        record = {
            "version_id": version.version_id,
            "prompt_name": version.prompt_name,
            "template": version.template,
            "variables": version.variables,
            "model": version.model,
            "created_at": version.created_at,
            "author": version.author,
            "content_hash": version.content_hash,
            "metadata": version.metadata,
        }
        path = versions_dir / f"{version.version_id}.json"
        # Explicit encoding so the file does not depend on the platform locale.
        # json.dumps emits ASCII-only text by default, so older files written
        # with the locale encoding remain readable as UTF-8.
        path.write_text(json.dumps(record, indent=2), encoding="utf-8")
        return version.version_id

    def get_latest(self, prompt_name: str) -> Optional[PromptVersion]:
        """Return the most recently created version of `prompt_name`, or None.

        "Latest" is decided by the stored `created_at` timestamp, not by file
        name: sorting names lexicographically ranks "v9" above "v10" (and
        "1.9.0" above "1.10.0"). The file name is only used as a tie-break
        when two records carry the same timestamp.
        """
        versions_dir = self.registry_path / prompt_name
        if not versions_dir.is_dir():
            return None
        newest_record = None
        newest_key = None
        for path in versions_dir.glob("*.json"):
            record = self._read_record(path)
            # ISO-8601 strings in one format order chronologically as text.
            key = (str(record.get("created_at", "")), path.name)
            if newest_key is None or key > newest_key:
                newest_record, newest_key = record, key
        if newest_record is None:
            return None
        return self._to_version(newest_record)

    def get_version(self, prompt_name: str, version_id: str) -> Optional[PromptVersion]:
        """Return the stored version with the given id, or None if no such file exists."""
        path = self.registry_path / prompt_name / f"{version_id}.json"
        if not path.is_file():
            return None
        return self._to_version(self._read_record(path))

    @staticmethod
    def _read_record(path: Path) -> dict:
        """Load one JSON record from disk."""
        return json.loads(path.read_text(encoding="utf-8"))

    @staticmethod
    def _to_version(record: dict) -> PromptVersion:
        """Build a PromptVersion from a stored record.

        `content_hash` is a derived property, not a constructor field, so it
        is dropped before construction.
        """
        return PromptVersion(**{k: v for k, v in record.items() if k != "content_hash"})

2. Automated Quality Gates

from dataclasses import dataclass
from typing import List, Callable

@dataclass
class QualityGateResult:
    """Outcome of one quality check: its score, threshold and pass/fail verdict."""

    gate_name: str
    passed: bool
    score: float
    threshold: float
    details: str


class LLMQualityGate:
    """Runs a list of scoring checks against a prompt/response pair.

    A check passes when its score is greater than or equal to its threshold.
    """

    def __init__(self):
        # Each entry is {"name": str, "fn": callable, "threshold": float}.
        self.checks: List[dict] = []
        self.results: List[QualityGateResult] = []

    def add_check(self, name: str, check_fn: Callable, threshold: float):
        """Register a check.

        Args:
            name: Label used in results and in the summary.
            check_fn: Called as `check_fn(prompt, response)`; must return a
                numeric score.
            threshold: Minimum score for the check to pass.
        """
        self.checks.append({"name": name, "fn": check_fn, "threshold": threshold})

    def evaluate(self, prompt: str, response: str) -> List[QualityGateResult]:
        """Run every registered check and return the results.

        Exceptions raised by a check propagate to the caller. In that case
        `self.results` is left empty rather than holding a partial run, so a
        later `all_passed()` cannot report success for an aborted evaluation.
        """
        # Drop the previous run up front; publish the new run only once every
        # check has completed.
        self.results = []
        fresh: List[QualityGateResult] = []
        for check in self.checks:
            threshold = check["threshold"]
            score = check["fn"](prompt, response)
            fresh.append(
                QualityGateResult(
                    gate_name=check["name"],
                    passed=score >= threshold,
                    score=score,
                    threshold=threshold,
                    details=f"Score: {score:.3f} (threshold: {threshold:.3f})",
                )
            )
        self.results = fresh
        return self.results

    def all_passed(self) -> bool:
        """Return True only if at least one result exists and every result passed.

        `all()` over an empty list is True, which would let a gate that was
        never evaluated (or whose evaluation aborted) report success. The
        gate therefore fails closed when there are no results.
        """
        return bool(self.results) and all(r.passed for r in self.results)

    def summary(self) -> str:
        """Return one 'PASS|FAIL: name - details' line per result, newline-joined."""
        lines = [
            f"{'PASS' if r.passed else 'FAIL'}: {r.gate_name} - {r.details}"
            for r in self.results
        ]
        return "\n".join(lines)

3. Canary Deployment Manager

import random
from dataclasses import dataclass
from typing import Dict, List, Optional
from datetime import datetime

@dataclass
class CanaryConfig:
    """State of one canary: what it serves, how much traffic it gets, and its status."""

    name: str
    model: str
    prompt_version: str
    traffic_percent: float
    start_time: datetime
    # None until metrics are attached; annotated Optional to match the default.
    metrics: Optional[Dict[str, float]] = None
    # Values set by this module: "active", "promoted", "rolled_back".
    status: str = "active"


class CanaryDeployment:
    """In-memory manager that routes a percentage of requests to named canaries."""

    def __init__(self):
        self.canaries: List[CanaryConfig] = []

    @staticmethod
    def _check_percent(value: float) -> None:
        """Raise ValueError unless `value` lies in the closed range 0..100."""
        # Written as a negated range test so that NaN is rejected as well.
        if not 0 <= value <= 100:
            raise ValueError(f"traffic percent must be between 0 and 100, got {value!r}")

    def deploy_canary(self, name: str, model: str, prompt_version: str,
                      traffic_percent: float = 5.0) -> CanaryConfig:
        """Create an active canary, record it, and return it.

        Raises:
            ValueError: if `traffic_percent` is outside 0..100.
        """
        # Function-scope import: the surrounding import block only brings in
        # `datetime` itself.
        from datetime import timezone

        self._check_percent(traffic_percent)
        canary = CanaryConfig(
            name=name,
            model=model,
            prompt_version=prompt_version,
            traffic_percent=traffic_percent,
            # Naive UTC, same as the deprecated datetime.utcnow() produced.
            start_time=datetime.now(timezone.utc).replace(tzinfo=None),
        )
        self.canaries.append(canary)
        return canary

    def should_use_canary(self, canary_name: str) -> bool:
        """Decide whether the current request should be served by the canary.

        Returns True with probability `traffic_percent` / 100 for an active
        canary, always True for a promoted one, and False for a rolled-back
        or unknown one.
        """
        for canary in self.canaries:
            if canary.name != canary_name:
                continue
            if canary.status == "promoted":
                # A fully promoted canary owns all traffic. Treating it like
                # an inactive canary would route nothing to the version that
                # was just promoted.
                return True
            if canary.status == "active":
                return random.random() * 100 < canary.traffic_percent
        return False

    def promote_canary(self, canary_name: str, to_percent: float = 100.0):
        """Raise the traffic share of every canary named `canary_name`.

        Reaching 100 percent marks the canary as "promoted". Unknown names
        are ignored.

        Raises:
            ValueError: if `to_percent` is outside 0..100.
        """
        self._check_percent(to_percent)
        for canary in self.canaries:
            if canary.name == canary_name:
                canary.traffic_percent = to_percent
                if to_percent >= 100:
                    canary.status = "promoted"

    def rollback_canary(self, canary_name: str):
        """Cut traffic to zero and mark every canary named `canary_name` as rolled back."""
        for canary in self.canaries:
            if canary.name == canary_name:
                canary.traffic_percent = 0
                canary.status = "rolled_back"

Key Formulas

Canary Success Rate

$$R_{success} = \frac{N_{positive}}{N_{total}} \times 100\%$$

Here,

  • $N_{positive}$ = Number of positive feedback responses
  • $N_{total}$ = Total canary responses served

Deployment Confidence Score

$$\text{Confidence} = \prod_{i=1}^{n} \left(1 - p_i\right)$$

Here,

  • $p_i$ = Probability of failure for quality check $i$
  • $n$ = Number of quality checks

CI/CD Configuration

| Stage | Tool | Trigger | Timeout | Pass Criteria |
|---|---|---|---|---|
| Lint | Ruff/Black | Every commit | 1 min | No errors |
| Unit Tests | Pytest | Every commit | 5 min | 100% pass |
| Integration | Pytest | Pull request | 15 min | 95% pass |
| Quality Gate | Custom | Pull request | 10 min | All gates pass |
| Canary Deploy | Argo Rollouts | Merge to main | 30 min | Metrics stable |
| Full Rollout | Argo Rollouts | After canary | 1 hour | No regressions |

Best Practices

  1. Version every prompt with semantic versioning
  2. Use mock LLM calls in CI to reduce cost and improve speed
  3. Run quality gates before merging to main
  4. Deploy canaries at low traffic percentages first
  5. Monitor canary metrics for at least 24 hours before promotion
  6. Automate rollback when quality metrics degrade
⭐

Premium Content

LLM CI/CD: Continuous Integration and Deployment for LLM Applications

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯End-to-end Projects
💼 Interview Prep
📜 Certificates
🤝 Community Access

Already a member? Log in

Need Expert AI Ops & LLM Ops Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement