CI/CD for ML
Difficulty: Senior Level | Companies: Google, Meta, Netflix, Uber, Stripe
ML CI/CD Challenges
ML systems require additional testing beyond traditional software: data validation, model quality, and performance benchmarks.
ℹ️
Google's ML CI/CD pipeline runs 10,000+ tests per model deployment, including fairness and bias checks.
GitHub Actions Pipeline
# .github/workflows/ml-ci-cd.yml
#
# End-to-end ML pipeline. Every job names its predecessor in `needs`, so the
# jobs run strictly in sequence:
#   data-validation -> unit-tests -> integration-tests -> model-training
#   -> model-validation -> build-and-push -> deploy-staging -> deploy-production
# The build and deploy jobs are additionally gated on the `main` branch.
name: ML CI/CD Pipeline

on:
  push:
    branches: [main, develop]
  pull_request:
    branches: [main]

env:
  PYTHON_VERSION: '3.9'
  MLFLOW_TRACKING_URI: ${{ secrets.MLFLOW_TRACKING_URI }}

jobs:
  # Gate 1: reject bad input data before spending time on tests or training.
  data-validation:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Install dependencies
        run: pip install -r requirements.txt
      - name: Validate data schema
        run: python scripts/validate_schema.py --input data/raw/latest.parquet --schema schemas/input.json
      - name: Check data quality
        run: python scripts/check_data_quality.py --input data/raw/latest.parquet
      - name: Upload validation results
        # v3 of the artifact actions has been retired on github.com; v4 is required.
        uses: actions/upload-artifact@v4
        with:
          name: validation-results
          path: results/validation.json

  # Gate 2: unit tests with coverage reporting.
  unit-tests:
    runs-on: ubuntu-latest
    needs: data-validation
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Install dependencies
        run: pip install -r requirements.txt
      - name: Run unit tests
        run: pytest tests/unit -v --cov=src --cov-report=xml
      - name: Upload coverage
        uses: codecov/codecov-action@v3
        with:
          file: coverage.xml

  # Gate 3: integration tests against the database and Redis named in secrets.
  integration-tests:
    runs-on: ubuntu-latest
    needs: unit-tests
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Install dependencies
        run: pip install -r requirements.txt
      - name: Run integration tests
        run: pytest tests/integration -v
        env:
          DATABASE_URL: ${{ secrets.DATABASE_URL }}
          REDIS_URL: ${{ secrets.REDIS_URL }}

  # Train and evaluate the model, then hand it to later jobs as an artifact.
  model-training:
    runs-on: ubuntu-latest
    needs: integration-tests
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Install dependencies
        run: pip install -r requirements.txt
      - name: Train model
        run: python scripts/train.py --config configs/training.json
      - name: Evaluate model
        run: python scripts/evaluate.py --model artifacts/model.pkl --test-data data/test/
      - name: Upload model artifact
        uses: actions/upload-artifact@v4
        with:
          name: model-artifact
          path: artifacts/model.pkl

  # Quality, fairness and security gates on the trained model.
  model-validation:
    runs-on: ubuntu-latest
    needs: model-training
    steps:
      - uses: actions/checkout@v3
      # Every job starts on a fresh runner, so the interpreter and the
      # project dependencies must be installed again before running scripts.
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Install dependencies
        run: pip install -r requirements.txt
      - name: Download model artifact
        # Must match the major version used by the upload step.
        uses: actions/download-artifact@v4
        with:
          name: model-artifact
          path: artifacts/
      - name: Validate model performance
        run: python scripts/validate_model.py --model artifacts/model.pkl --threshold 0.85
      - name: Check model fairness
        run: python scripts/check_fairness.py --model artifacts/model.pkl --data data/test/
      - name: Security scan
        run: python scripts/security_scan.py --model artifacts/model.pkl

  # Build the serving image and push it (main branch only).
  build-and-push:
    runs-on: ubuntu-latest
    needs: model-validation
    if: github.ref == 'refs/heads/main'
    steps:
      - uses: actions/checkout@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
      - name: Login to Docker Hub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
      - name: Build and push
        uses: docker/build-push-action@v4
        with:
          context: .
          push: true
          # NOTE(review): these tags carry no registry namespace; a Docker Hub
          # push normally needs `<user>/ml-server` - confirm the intended repo.
          tags: ml-server:${{ github.sha }},ml-server:latest

  # Roll the new image out to staging, then smoke- and load-test it.
  deploy-staging:
    runs-on: ubuntu-latest
    needs: build-and-push
    if: github.ref == 'refs/heads/main'
    steps:
      # The smoke/load test scripts live in the repository, so it has to be
      # checked out and Python prepared before they can run.
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Install dependencies
        run: pip install -r requirements.txt
      # NOTE(review): no step here configures cluster credentials for
      # kubectl - confirm the runner has a kubeconfig, or add an auth step.
      - name: Deploy to staging
        run: |
          kubectl set image deployment/ml-server \
            ml-server=ml-server:${{ github.sha }} \
            --namespace=staging
      - name: Run smoke tests
        run: python scripts/smoke_tests.py --endpoint https://staging.ml-api.example.com
      - name: Run load tests
        run: python scripts/load_tests.py --endpoint https://staging.ml-api.example.com --rps 100 --duration 60

  # Promote to production; the `production` environment can enforce approvals.
  deploy-production:
    runs-on: ubuntu-latest
    needs: deploy-staging
    if: github.ref == 'refs/heads/main'
    environment: production
    steps:
      # verify_deployment.py lives in the repository; see deploy-staging.
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Install dependencies
        run: pip install -r requirements.txt
      # NOTE(review): as in staging, kubectl needs cluster credentials that
      # this workflow does not set up - confirm how they are provided.
      - name: Deploy to production
        run: |
          kubectl set image deployment/ml-server \
            ml-server=ml-server:${{ github.sha }} \
            --namespace=production
      - name: Verify deployment
        run: python scripts/verify_deployment.py --endpoint https://ml-api.example.com
      - name: Notify team
        uses: slackapi/slack-github-action@v1
        with:
          payload: |
            {
              "text": "ML model deployed to production: ${{ github.sha }}"
            }
        env:
          # The action cannot send anything without a webhook (or bot token).
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
          SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK
Model Testing Framework
# model_testing.py
from dataclasses import dataclass
from typing import Any, Callable, Dict, List

import numpy as np
import pandas as pd
import pytest
from sklearn.metrics import accuracy_score, f1_score
@dataclass
class TestResult:
    """Outcome of a single model check.

    Instances are created by the suite's ``test_*`` methods and rendered by
    its report generator.
    """

    # The class name starts with "Test", so pytest would try to collect it
    # whenever it is imported into a test module and emit a
    # PytestCollectionWarning. Not annotated, so it is not a dataclass field.
    __test__ = False

    # Short identifier of the check, e.g. "accuracy_threshold".
    test_name: str
    # True when the check met its acceptance criterion.
    passed: bool
    # Human-readable one-line summary of the outcome.
    message: str
    # Numeric values backing the verdict, keyed by metric name.
    metrics: Dict[str, float]
class ModelTestSuite:
    """Run quality checks on a model against a held-out test set.

    Every ``test_*`` method calls ``model.predict`` on ``X_test``, appends
    one ``TestResult`` to ``self.results`` and returns ``self`` so checks can
    be chained. Values stored in a result are converted to native Python
    types so that results can be serialized (e.g. to JSON) without NumPy
    scalar errors.

    Args:
        model: Object exposing ``predict(X)``; ``feature_importances_`` is
            used when present.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: Ground-truth labels compared against the predictions.
    """

    def __init__(self, model, X_test: pd.DataFrame, y_test: pd.Series):
        self.model = model
        self.X_test = X_test
        self.y_test = y_test
        self.results: List[TestResult] = []

    def test_accuracy_threshold(self, threshold: float = 0.8) -> "ModelTestSuite":
        """Record whether accuracy on the test set is at least ``threshold``."""
        y_pred = self.model.predict(self.X_test)
        accuracy = float(accuracy_score(self.y_test, y_pred))
        passed = accuracy >= threshold
        self.results.append(TestResult(
            test_name="accuracy_threshold",
            passed=passed,
            message=f"Accuracy {accuracy:.4f} {'>=' if passed else '<'} threshold {threshold}",
            metrics={"accuracy": accuracy, "threshold": threshold},
        ))
        return self

    def test_f1_score(self, threshold: float = 0.75) -> "ModelTestSuite":
        """Record whether the weighted F1 score is at least ``threshold``."""
        y_pred = self.model.predict(self.X_test)
        f1 = float(f1_score(self.y_test, y_pred, average="weighted"))
        passed = f1 >= threshold
        self.results.append(TestResult(
            test_name="f1_score",
            passed=passed,
            message=f"F1 score {f1:.4f} {'>=' if passed else '<'} threshold {threshold}",
            metrics={"f1_score": f1, "threshold": threshold},
        ))
        return self

    def test_prediction_distribution(self, max_std: float = 0.5) -> "ModelTestSuite":
        """Record whether the std-dev of the predictions is at most ``max_std``."""
        y_pred = self.model.predict(self.X_test)
        # float()/bool() strip the NumPy scalar types np.std would leak.
        pred_std = float(np.std(y_pred))
        passed = bool(pred_std <= max_std)
        self.results.append(TestResult(
            test_name="prediction_distribution",
            passed=passed,
            message=f"Prediction std {pred_std:.4f} {'<=' if passed else '>'} max {max_std}",
            metrics={"pred_std": pred_std, "max_std": max_std},
        ))
        return self

    def test_no_nan_predictions(self) -> "ModelTestSuite":
        """Record whether the predictions are free of missing values."""
        y_pred = self.model.predict(self.X_test)
        # pd.isna also accepts object/string arrays, where np.isnan raises
        # TypeError; for float arrays the two agree. Computed once and reused.
        nan_count = int(pd.isna(np.asarray(y_pred)).sum())
        has_nan = nan_count > 0
        self.results.append(TestResult(
            test_name="no_nan_predictions",
            passed=not has_nan,
            message="No NaN predictions" if not has_nan else "Found NaN predictions",
            metrics={"nan_count": nan_count},
        ))
        return self

    def test_feature_importance(self, min_importance: float = 0.01) -> "ModelTestSuite":
        """Record whether every feature importance is at least ``min_importance``.

        Models without a ``feature_importances_`` attribute are skipped and
        no result is recorded for them.
        """
        if not hasattr(self.model, 'feature_importances_'):
            return self
        importances = np.asarray(self.model.feature_importances_)
        low_importance = int(np.sum(importances < min_importance))
        passed = low_importance == 0
        self.results.append(TestResult(
            test_name="feature_importance",
            passed=passed,
            message=f"{low_importance} features below importance threshold",
            metrics={"low_importance_count": low_importance},
        ))
        return self

    def run_all(self) -> List[TestResult]:
        """Run every check with its default threshold and return all results.

        Results are appended to ``self.results``; entries already recorded by
        earlier calls are kept.
        """
        self.test_accuracy_threshold()
        self.test_f1_score()
        self.test_prediction_distribution()
        self.test_no_nan_predictions()
        self.test_feature_importance()
        return self.results

    def generate_report(self) -> str:
        """Return a plain-text report with one line per result and a pass total."""
        report_lines = ["Model Test Report", "=" * 50]
        for result in self.results:
            status = "PASS" if result.passed else "FAIL"
            report_lines.append(f"[{status}] {result.test_name}: {result.message}")
        passed_count = sum(1 for r in self.results if r.passed)
        report_lines.append(f"\nTotal: {passed_count}/{len(self.results)} tests passed")
        return "\n".join(report_lines)
# Usage
# Demo: train a small random forest on synthetic data and run the full suite.
from sklearn.datasets import make_classification  # was missing: NameError below
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
# Seed the split and the forest as well, so the report is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

test_suite = ModelTestSuite(model, pd.DataFrame(X_test), pd.Series(y_test))
test_suite.run_all()
print(test_suite.generate_report())
Follow-Up Questions
- How would you implement canary deployments for ML models?
- What rollback strategies work best for ML systems?
- How do you handle model versioning in CI/CD pipelines?
- What security considerations apply to ML deployment pipelines?