Production ML Checklist

Difficulty: Senior Level | Companies: Google, Meta, Netflix, Uber, Stripe

Production Readiness

Use this checklist before deploying any ML model to production.

ℹ️

Google's ML launch checklist prevents 95% of production incidents through systematic validation.

Pre-Launch Checklist

# production_checklist.py
from typing import Dict, List, Optional
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
import json

class CheckStatus(Enum):
    """State of a single checklist item; the value is the lowercase label used in reports."""
    # Only PASSED items count toward the readiness score.
    PASSED = "passed"
    FAILED = "failed"
    SKIPPED = "skipped"
    # Status given to every item of a newly created checklist.
    PENDING = "pending"

@dataclass
class ChecklistItem:
    """One line of a production-readiness checklist."""
    # Identifier of the check; ProductionReadinessChecker.update_item matches on it.
    name: str
    # Grouping label; the readiness report lists items under their category.
    category: str
    # Current state of the check.
    status: CheckStatus
    # Short human-readable statement of what the check requires.
    description: str
    # Optional owner of the check; not read by any code visible in this file.
    owner: Optional[str] = None
    # Optional free-text detail; printed beneath the item in the report when non-empty.
    notes: Optional[str] = None

@dataclass
class ProductionChecklist:
    """The set of readiness checks tracked for one model version."""
    # Model name and version; both appear in the report title line.
    model_name: str
    version: str
    # Each instance gets its own list (default_factory), so checklists never share items.
    items: List[ChecklistItem] = field(default_factory=list)
    # ISO-8601 string from datetime.now() at construction time (naive: no timezone offset).
    created_at: str = field(default_factory=lambda: datetime.now().isoformat())

class ProductionReadinessChecker:
    """Create, update, score and report on production-readiness checklists.

    Every checklist built through ``create_checklist`` is also appended to
    ``self.checklists``.
    """

    # (name, category, description) of each item placed in a new checklist.
    _DEFAULT_ITEMS = (
        ("Model Performance", "Model Quality", "Accuracy meets threshold"),
        ("Model Fairness", "Model Quality", "No bias detected"),
        ("Latency SLA", "Performance", "P99 < 100ms"),
        ("Throughput", "Performance", "Handles expected RPS"),
        ("Data Validation", "Data", "Schema validation passes"),
        ("Feature Store", "Data", "Features available"),
        ("Model Registry", "MLOps", "Model registered"),
        ("Monitoring", "MLOps", "Metrics configured"),
        ("Alerting", "MLOps", "Alerts configured"),
        ("Rollback Plan", "Operations", "Rollback tested"),
        ("Load Testing", "Testing", "Load test passed"),
        ("Security Scan", "Security", "No vulnerabilities"),
    )

    # Report tag for each CheckStatus value; defined once rather than rebuilt
    # for every item while rendering.
    _STATUS_LABELS = {"passed": "PASS", "failed": "FAIL", "pending": "TODO", "skipped": "SKIP"}

    def __init__(self):
        """Start with no tracked checklists."""
        self.checklists: List[ProductionChecklist] = []

    def create_checklist(self, model_name: str, version: str) -> ProductionChecklist:
        """Build a checklist holding the default items, all PENDING, and track it.

        Args:
            model_name: Name of the model being launched.
            version: Version string of the model.

        Returns:
            The new checklist (also appended to ``self.checklists``).
        """
        checklist = ProductionChecklist(model_name=model_name, version=version)
        # Fresh ChecklistItem objects per checklist: items are mutated in place
        # by update_item and must not be shared between checklists.
        checklist.items = [
            ChecklistItem(name, category, CheckStatus.PENDING, description)
            for name, category, description in self._DEFAULT_ITEMS
        ]
        self.checklists.append(checklist)
        return checklist

    def update_item(self, checklist: ProductionChecklist, item_name: str, status: CheckStatus, notes: str = "") -> bool:
        """Set the status and notes of the first item named ``item_name``.

        Args:
            checklist: Checklist to modify in place.
            item_name: Exact ``ChecklistItem.name`` to look for.
            status: New status for the item.
            notes: New notes; the default ``""`` replaces any existing notes.

        Returns:
            True if a matching item was updated, False if no item has that
            name (the checklist is left untouched), so callers can detect a
            misspelled item name instead of it being silently ignored.
        """
        for item in checklist.items:
            if item.name == item_name:
                item.status = status
                item.notes = notes
                return True
        return False

    def get_readiness_score(self, checklist: ProductionChecklist) -> float:
        """Return the fraction of items whose status is PASSED.

        FAILED, SKIPPED and PENDING items all count against the score. An
        empty checklist scores 0.0.
        """
        if not checklist.items:
            return 0.0
        passed = sum(1 for item in checklist.items if item.status == CheckStatus.PASSED)
        return passed / len(checklist.items)

    def is_ready_for_launch(self, checklist: ProductionChecklist, threshold: float = 0.9) -> bool:
        """Return True when the readiness score is at least ``threshold``.

        Args:
            checklist: Checklist to evaluate.
            threshold: Minimum passed fraction required; defaults to 0.9.

        NOTE(review): the decision is based on the score alone, so a FAILED
        item does not by itself block launch if the threshold is still met.
        Confirm that this is the intended policy.
        """
        return self.get_readiness_score(checklist) >= threshold

    def generate_report(self, checklist: ProductionChecklist) -> str:
        """Render a plain-text report: title, score, items by category, verdict.

        Categories appear in the order their first item occurs in the
        checklist; an item's notes are printed on the following line when
        non-empty. The verdict uses the default launch threshold.
        """
        report = [
            f"Production Readiness Report: {checklist.model_name} v{checklist.version}",
            "=" * 60,
            f"Readiness Score: {self.get_readiness_score(checklist):.1%}",
            ""
        ]

        # Group items by category; dict insertion order keeps first-seen order.
        categories = {}
        for item in checklist.items:
            categories.setdefault(item.category, []).append(item)

        for category, items in categories.items():
            report.append(f"\n{category}:")
            for item in items:
                report.append(f"  [{self._STATUS_LABELS[item.status.value]}] {item.name}")
                if item.notes:
                    report.append(f"       {item.notes}")

        report.append(f"\nReady for Launch: {'YES' if self.is_ready_for_launch(checklist) else 'NO'}")
        return "\n".join(report)


# Usage: create a checklist, mark the checks completed so far, print the report.
checker = ProductionReadinessChecker()
checklist = checker.create_checklist("churn-predictor", "2.0.0")

# (item name, notes) for every check that has already passed.
_completed_checks = [
    ("Model Performance", "Accuracy: 0.92"),
    ("Model Fairness", "No bias detected"),
    ("Latency SLA", "P99: 45ms"),
    ("Data Validation", "All checks passed"),
    ("Monitoring", "Dashboards configured"),
]
for _check_name, _check_notes in _completed_checks:
    checker.update_item(checklist, _check_name, CheckStatus.PASSED, _check_notes)

print(checker.generate_report(checklist))
print(f"\nReady: {checker.is_ready_for_launch(checklist)}")

Deployment Validation

# deployment_validation.py
import requests
import time
import json
from typing import Dict, List
from dataclasses import dataclass

@dataclass
class ValidationResult:
    """Outcome of one validation request against a deployed endpoint."""
    # Path that was checked, e.g. "/health" or "/predict".
    endpoint: str
    # "passed" or "failed", as set by DeploymentValidator.
    status: str
    # Request round-trip time in milliseconds; DeploymentValidator stores 0 when the request raised.
    latency_ms: float
    # True when the response met the check's criteria.
    response_valid: bool
    # Text of the exception that made the check fail; empty when none was raised.
    error: str = ""

class DeploymentValidator:
    """Smoke-test and load-test a deployed model server over HTTP.

    Each ``validate_*`` call appends its ``ValidationResult`` to
    ``self.results``; ``generate_report`` summarises those results.
    ``run_load_test`` returns its own statistics and does not touch
    ``self.results``.
    """

    def __init__(self, base_url: str):
        """Remember the server root URL and start with an empty result list.

        Args:
            base_url: Root URL that endpoint paths are appended to verbatim,
                e.g. ``"http://localhost:8000"`` (no trailing slash).
        """
        self.base_url = base_url
        self.results: List[ValidationResult] = []

    @staticmethod
    def _elapsed_ms(start: float) -> float:
        """Return milliseconds elapsed since ``start`` (a ``time.perf_counter()`` reading)."""
        return (time.perf_counter() - start) * 1000

    @staticmethod
    def _failure(endpoint: str, exc: Exception) -> ValidationResult:
        """Build the failed result recorded when a request raised ``exc``."""
        return ValidationResult(
            endpoint=endpoint,
            status="failed",
            latency_ms=0,
            response_valid=False,
            error=str(exc)
        )

    def validate_health(self) -> ValidationResult:
        """GET ``/health`` (5 s timeout) and pass if it answers HTTP 200.

        Returns:
            The recorded result. If the request raises, the result is
            "failed" with ``latency_ms`` 0 and ``error`` set to the
            exception text.
        """
        try:
            # perf_counter is monotonic, so the interval cannot be distorted
            # by system clock adjustments the way time.time() can.
            start = time.perf_counter()
            response = requests.get(f"{self.base_url}/health", timeout=5)
            latency = self._elapsed_ms(start)

            healthy = response.status_code == 200
            result = ValidationResult(
                endpoint="/health",
                status="passed" if healthy else "failed",
                latency_ms=latency,
                response_valid=healthy
            )
        except Exception as e:
            # Deliberately broad: any failure to reach the server is recorded
            # as a failed check rather than propagated to the caller.
            result = self._failure("/health", e)

        self.results.append(result)
        return result

    def validate_prediction(self, test_input: Dict) -> ValidationResult:
        """POST ``test_input`` as JSON to ``/predict`` (10 s timeout).

        The check passes when the response is HTTP 200 and its JSON body
        contains a ``"prediction"`` key.

        Args:
            test_input: JSON-serialisable request body.

        Returns:
            The recorded result. If the request or the JSON decoding raises,
            the result is "failed" with ``latency_ms`` 0 and ``error`` set to
            the exception text.
        """
        try:
            start = time.perf_counter()
            response = requests.post(
                f"{self.base_url}/predict",
                json=test_input,
                timeout=10
            )
            latency = self._elapsed_ms(start)

            # The body is only decoded when the status is 200 (short-circuit).
            is_valid = (
                response.status_code == 200 and
                "prediction" in response.json()
            )

            result = ValidationResult(
                endpoint="/predict",
                status="passed" if is_valid else "failed",
                latency_ms=latency,
                response_valid=is_valid
            )
        except Exception as e:
            # Deliberately broad: transport errors and undecodable bodies are
            # both recorded as a failed check.
            result = self._failure("/predict", e)

        self.results.append(result)
        return result

    def run_load_test(self, endpoint: str, payload: Dict, num_requests: int = 100) -> Dict:
        """POST ``payload`` to ``endpoint`` ``num_requests`` times, sequentially.

        Args:
            endpoint: Path appended to ``base_url``, e.g. ``"/predict"``.
            payload: JSON-serialisable request body sent on every request.
            num_requests: Number of requests to send.

        Returns:
            Dict with ``total_requests``, ``errors``, ``error_rate`` and
            ``avg``/``p50``/``p95``/``p99`` ``_latency_ms``. Requests that
            raise or return a non-200 status count as errors; latency is
            recorded for every request that returned a response, whatever its
            status. All latency figures are 0 when no request returned, and
            ``error_rate`` is 0.0 when ``num_requests`` is not positive.
        """
        latencies = []
        errors = 0
        url = f"{self.base_url}{endpoint}"  # invariant across iterations

        for _ in range(num_requests):
            try:
                start = time.perf_counter()
                response = requests.post(url, json=payload, timeout=10)
                latencies.append(self._elapsed_ms(start))

                if response.status_code != 200:
                    errors += 1
            except Exception:
                # Timeouts and connection failures count as errors and
                # contribute no latency sample.
                errors += 1

        latencies.sort()
        sample_count = len(latencies)
        return {
            "total_requests": num_requests,
            "errors": errors,
            # Guard the division: with no requests there is no error rate to
            # compute, and dividing would raise ZeroDivisionError.
            "error_rate": errors / num_requests if num_requests > 0 else 0.0,
            "avg_latency_ms": sum(latencies) / sample_count if latencies else 0,
            "p50_latency_ms": latencies[sample_count // 2] if latencies else 0,
            "p95_latency_ms": latencies[int(sample_count * 0.95)] if latencies else 0,
            "p99_latency_ms": latencies[int(sample_count * 0.99)] if latencies else 0,
        }

    def generate_report(self) -> str:
        """Render one line per recorded result followed by a passed/total summary."""
        report = ["Deployment Validation Report", "=" * 40]
        for result in self.results:
            report.append(f"[{result.status.upper()}] {result.endpoint}: {result.latency_ms:.1f}ms")
        passed = sum(1 for r in self.results if r.status == "passed")
        report.append(f"\nTotal: {passed}/{len(self.results)} checks passed")
        return "\n".join(report)


# Usage: smoke-test a locally running model server, then run a short load test.
validator = DeploymentValidator("http://localhost:8000")
_sample_payload = {"features": [1.0, 2.0, 3.0]}  # same body for the check and the load test

validator.validate_health()
validator.validate_prediction(_sample_payload)

load_results = validator.run_load_test("/predict", _sample_payload, 50)
print(f"Load test: {load_results['p99_latency_ms']:.1f}ms P99")

Follow-Up Questions

How do you implement gradual rollouts for ML models?
What monitoring dashboards are essential for production ML?
How would you handle emergency model rollback?
What documentation is required for production ML systems?