Interview Question (Hard) — Asked at: Google, Microsoft, Amazon, Netflix, Stripe
"How do you test ML systems beyond traditional software testing? Design a comprehensive testing strategy that covers data validation, model performance, fairness, and production behavior."
ML Testing Overview
ML systems require specialized testing beyond traditional software testing. Bugs in ML systems can manifest as subtle performance degradation rather than explicit failures.
ML Testing Pyramid
┌─────────────────────────────────────────────────────────────────┐
│                       ML Testing Pyramid                        │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│              ┌─────────┐                                        │
│              │  Model  │               Production validation    │
│              │ Testing │               A/B testing, shadow mode │
│             ┌┴─────────┴┐                                       │
│           ┌───────────────┐                                     │
│           │  Integration  │            Pipeline tests           │
│           │     Tests     │            End-to-end workflows     │
│          ┌┴───────────────┴┐                                    │
│        ┌─────────────────────┐                                  │
│        │   Data Validation   │         Schema, distribution     │
│        │        Tests        │         quality checks           │
│       ┌┴─────────────────────┴┐                                 │
│     ┌───────────────────────────┐                               │
│     │        Unit Tests         │      Feature transforms       │
│     │                           │      Preprocessing            │
│    ┌┴───────────────────────────┴┐                              │
│  ┌─────────────────────────────────┐                            │
│  │       Code Quality Tests        │   Linting, type            │
│  │                                 │   checking                 │
│  └─────────────────────────────────┘                            │
└─────────────────────────────────────────────────────────────────┘
Data Validation Tests
Great Expectations Integration
import great_expectations as ge
import pandas as pd
import json
from datetime import datetime
from typing import Dict, List, Optional
class MLDataValidator:
    """Comprehensive data validation for ML pipelines.

    Every ``validate_*`` method returns a result dict containing at least the
    keys ``'test'`` and ``'passed'`` and also appends that dict to
    ``self.validation_results``, which ``generate_report`` renders as HTML.
    """

    def __init__(self, reference_data: Optional[pd.DataFrame] = None):
        """Store the optional reference (baseline) DataFrame.

        Args:
            reference_data: Baseline data that ``validate_statistics`` and
                ``validate_distributions`` compare new data against.
        """
        self.reference_data = reference_data
        self.validation_results = []

    @staticmethod
    def _relative_deviation(current, reference) -> float:
        """Return ``|current - reference| / |reference|`` without dividing by zero.

        Two NaNs or two zeros count as "no deviation" (0.0); a NaN on one side
        only, or a zero reference with a non-zero current value, counts as an
        unbounded deviation (``inf``) so that the caller flags it.
        """
        current_missing = bool(pd.isna(current))
        reference_missing = bool(pd.isna(reference))
        if current_missing or reference_missing:
            return 0.0 if (current_missing and reference_missing) else float('inf')
        if reference == 0:
            return 0.0 if current == 0 else float('inf')
        return float(abs(current - reference) / abs(reference))

    @staticmethod
    def _is_continuous(series: pd.Series) -> bool:
        """Return True for numeric, non-boolean columns (any int/float width)."""
        return (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )

    def validate_schema(self, df: pd.DataFrame,
                        expected_schema: Dict) -> Dict:
        """Validate DataFrame schema.

        Args:
            df: Data to check.
            expected_schema: May contain ``'columns'`` (required column names),
                ``'dtypes'`` (column name -> expected ``str(dtype)``) and
                ``'min_rows'`` (minimum row count). Missing keys are skipped.

        Returns:
            Dict with ``'test'``, ``'passed'`` and a list of ``'errors'``.
        """
        results = {
            'test': 'schema_validation',
            'passed': True,
            'errors': []
        }

        # Check columns exist (extra columns are tolerated).
        missing_columns = set(expected_schema.get('columns', [])) - set(df.columns)
        if missing_columns:
            results['passed'] = False
            results['errors'].append(
                f"Missing columns: {missing_columns}"
            )

        # Check data types; columns absent from df were already reported above.
        for col, expected_type in expected_schema.get('dtypes', {}).items():
            if col not in df.columns:
                continue
            actual_type = str(df[col].dtype)
            if actual_type != expected_type:
                results['passed'] = False
                results['errors'].append(
                    f"Column {col}: expected {expected_type}, got {actual_type}"
                )

        # Check row count
        if 'min_rows' in expected_schema and len(df) < expected_schema['min_rows']:
            results['passed'] = False
            results['errors'].append(
                f"Row count {len(df)} below minimum {expected_schema['min_rows']}"
            )

        self.validation_results.append(results)
        return results

    def validate_statistics(self, df: pd.DataFrame,
                            tolerance: float = 0.1) -> Dict:
        """Validate statistical properties against reference.

        For each numeric column present in both frames, the relative change of
        the mean must not exceed ``tolerance`` and the relative change of the
        standard deviation must not exceed ``2 * tolerance``.

        Args:
            df: Data to check.
            tolerance: Maximum allowed relative deviation of the mean.

        Returns:
            Dict with ``'test'``, ``'passed'`` and per-column ``'feature_results'``.

        Raises:
            ValueError: If no reference data was provided.
        """
        if self.reference_data is None:
            raise ValueError("Reference data not provided")

        results = {
            'test': 'statistical_validation',
            'passed': True,
            'feature_results': {}
        }

        for column in df.select_dtypes(include=['number']).columns:
            if column not in self.reference_data.columns:
                continue
            reference = self.reference_data[column]
            # Mean/std are undefined for a non-numeric reference column.
            if not pd.api.types.is_numeric_dtype(reference):
                continue

            feature_result = {
                'passed': True,
                'deviations': []
            }

            # Check mean
            mean_deviation = self._relative_deviation(
                df[column].mean(), reference.mean()
            )
            if mean_deviation > tolerance:
                feature_result['passed'] = False
                feature_result['deviations'].append(
                    f"Mean deviation: {mean_deviation:.4f} > {tolerance}"
                )

            # Check standard deviation (allowed twice the slack of the mean)
            std_deviation = self._relative_deviation(
                df[column].std(), reference.std()
            )
            if std_deviation > tolerance * 2:
                feature_result['passed'] = False
                feature_result['deviations'].append(
                    f"Std deviation: {std_deviation:.4f} > {tolerance * 2}"
                )

            results['feature_results'][column] = feature_result
            if not feature_result['passed']:
                results['passed'] = False

        self.validation_results.append(results)
        return results

    def validate_distributions(self, df: pd.DataFrame,
                               significance_level: float = 0.05) -> Dict:
        """Validate distributions using statistical tests.

        Numeric columns are compared with a two-sample Kolmogorov-Smirnov test,
        everything else with a chi-squared test on category counts. A feature
        fails when its p-value is below ``significance_level``.

        Args:
            df: Data to check.
            significance_level: p-value threshold below which the distributions
                are considered different.

        Returns:
            Dict with ``'test'``, ``'passed'`` and per-column ``'feature_results'``.

        Raises:
            ValueError: If no reference data was provided.
        """
        if self.reference_data is None:
            raise ValueError("Reference data not provided")

        from scipy.stats import ks_2samp, chi2_contingency

        results = {
            'test': 'distribution_validation',
            'passed': True,
            'feature_results': {}
        }

        for column in df.columns:
            if column not in self.reference_data.columns:
                continue
            current = df[column]
            reference = self.reference_data[column]

            if self._is_continuous(current) and self._is_continuous(reference):
                # KS test for numerical features
                reference_sample = reference.dropna()
                current_sample = current.dropna()
                # ks_2samp rejects empty samples; nothing to compare then.
                if reference_sample.empty or current_sample.empty:
                    continue
                statistic, p_value = ks_2samp(reference_sample, current_sample)
                test_name = 'ks_test'
            else:
                # Chi-squared test for categorical features
                ref_counts = reference.value_counts()
                curr_counts = current.value_counts()
                # Align categories so both rows share the same column order.
                all_categories = list(set(ref_counts.index) | set(curr_counts.index))
                ref_aligned = [ref_counts.get(cat, 0) for cat in all_categories]
                curr_aligned = [curr_counts.get(cat, 0) for cat in all_categories]
                if sum(ref_aligned) == 0 or sum(curr_aligned) == 0:
                    continue
                statistic, p_value, _, _ = chi2_contingency(
                    [ref_aligned, curr_aligned]
                )
                test_name = 'chi_squared'

            # One rule for both the per-feature and the overall flag.
            feature_passed = bool(p_value >= significance_level)
            results['feature_results'][column] = {
                'test': test_name,
                'statistic': float(statistic),
                'p_value': float(p_value),
                'passed': feature_passed
            }
            if not feature_passed:
                results['passed'] = False

        self.validation_results.append(results)
        return results

    def validate_ml_specific(self, df: pd.DataFrame,
                             target_column: str,
                             feature_columns: List[str]) -> Dict:
        """ML-specific validation checks.

        Runs class-balance and label-leakage checks (when the target column is
        present), a pairwise feature-correlation check (when more than one
        feature is requested) and an infinite-value check. The overall
        ``'passed'`` flag is True only if every recorded check passed.

        Args:
            df: Data to check.
            target_column: Name of the label column.
            feature_columns: Feature column names; names absent from ``df`` and
                non-numeric columns are ignored by the correlation checks.

        Returns:
            Dict with ``'test'``, ``'passed'`` and the list of ``'checks'``.
        """
        # Imported locally because the surrounding import block has no numpy.
        import numpy as np

        checks = []
        present_features = [col for col in feature_columns if col in df.columns]
        numeric_features = [
            col for col in present_features
            if pd.api.types.is_numeric_dtype(df[col])
        ]

        # Check target variable
        if target_column in df.columns:
            target = df[target_column]

            # Class balance; an empty/all-NaN target counts as ratio 0.
            class_counts = target.value_counts()
            if class_counts.empty:
                class_ratio = 0.0
            else:
                class_ratio = float(class_counts.min() / class_counts.max())
            checks.append({
                'check': 'class_balance',
                'ratio': class_ratio,
                'passed': class_ratio > 0.1,  # At least 10% minority class
                'message': f"Class ratio: {class_ratio:.4f}"
            })

            # Check for label leakage (correlation needs numeric data).
            if pd.api.types.is_numeric_dtype(target):
                for col in numeric_features:
                    correlation = abs(df[col].corr(target))
                    if correlation > 0.95:
                        checks.append({
                            'check': 'label_leakage',
                            'feature': col,
                            'correlation': float(correlation),
                            'passed': False,
                            'message': f"High correlation with target: {correlation:.4f}"
                        })

        # Check feature correlations
        if len(feature_columns) > 1:
            n_high_corr = 0
            if len(numeric_features) > 1:
                corr_matrix = df[numeric_features].corr().abs()
                # Keep only the strict upper triangle so each pair counts once.
                upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
                n_high_corr = int(
                    (corr_matrix.where(upper_mask) > 0.95).sum().sum()
                )
            checks.append({
                'check': 'feature_correlation',
                'high_correlation_pairs': n_high_corr,
                'passed': n_high_corr == 0,
                'message': f"Found {n_high_corr} highly correlated feature pairs"
            })

        # Check for infinite values
        numeric_frame = df[present_features].select_dtypes(include=[np.number])
        total_inf = 0 if numeric_frame.empty else int(np.isinf(numeric_frame).sum().sum())
        checks.append({
            'check': 'infinite_values',
            'count': total_inf,
            'passed': total_inf == 0,
            'message': f"Found {total_inf} infinite values"
        })

        results = {
            'test': 'ml_specific_validation',
            # Every failed check fails the run, not only label leakage.
            'passed': all(check['passed'] for check in checks),
            'checks': checks
        }
        self.validation_results.append(results)
        return results

    def generate_report(self) -> str:
        """Generate HTML validation report.

        Returns:
            An HTML document with one heading per recorded validation result,
            coloured by its pass/fail status.
        """
        # Built from plain strings: the CSS contains literal braces, which
        # str.format() would try to interpret as replacement fields.
        header_lines = [
            "",
            "<html>",
            "<head>",
            "<title>Data Validation Report</title>",
            "<style>",
            "body { font-family: Arial, sans-serif; margin: 20px; }",
            ".passed { color: green; }",
            ".failed { color: red; }",
            ".warning { color: orange; }",
            "table { border-collapse: collapse; width: 100%; }",
            "th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }",
            "th { background-color: #f2f2f2; }",
            "</style>",
            "</head>",
            "<body>",
            "<h1>Data Validation Report</h1>",
            f"<p>Generated: {datetime.now().isoformat()}</p>",
            "",
        ]
        parts = ["\n".join(header_lines)]

        for result in self.validation_results:
            status_class = "passed" if result['passed'] else "failed"
            status_text = 'PASSED' if result['passed'] else 'FAILED'
            test_name = result['test']
            parts.append(
                f'\n<h2 class="{status_class}">{test_name} -\n{status_text}</h2>\n'
            )

        parts.append("</body></html>")
        return "".join(parts)
ℹ️
Data validation should be the first line of defense. Implement schema validation, statistical tests, and ML-specific checks to catch data issues before they affect model training.
Unit Testing for ML Components
Feature Transform Tests
import pytest
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
class TestFeatureTransforms:
    """Unit tests for the individual feature-transformation helpers."""

    @pytest.fixture
    def sample_data(self):
        """Build a seeded 1000-row frame of synthetic transactions."""
        n_rows = 1000
        np.random.seed(42)
        # Random draws stay in the same order as the DataFrame columns so the
        # seeded values are unchanged.
        amounts = np.random.exponential(100, n_rows)
        timestamps = [
            datetime.now() - timedelta(days=days_back)
            for days_back in range(n_rows)
        ]
        categories = np.random.choice(['A', 'B', 'C', 'D'], n_rows)
        ages = np.random.randint(18, 80, n_rows)
        return pd.DataFrame({
            'user_id': range(n_rows),
            'transaction_amount': amounts,
            'timestamp': timestamps,
            'category': categories,
            'age': ages,
        })

    def test_log_transform(self, sample_data):
        """log_transform must stay finite and non-negative, even on edge cases."""
        from my_module.features import log_transform

        # Regular, strictly positive input.
        transformed = log_transform(sample_data['transaction_amount'])
        assert (transformed >= 0).all(), "Log transform should be non-negative"
        assert not np.any(np.isinf(transformed)), "Log transform should not produce inf"

        # A zero must map to zero.
        with_zero = log_transform(pd.Series([0, 1, 2, 3]))
        assert with_zero.iloc[0] == 0, "Log of zero should be zero with log1p"

        # Negative input is expected to be clipped rather than yield NaN.
        with_negative = log_transform(pd.Series([-1, 0, 1, 2]))
        assert not np.any(np.isnan(with_negative)), "Should not produce NaN"

    def test_categorical_encoding(self, sample_data):
        """encode_categorical must return numeric codes, one per category."""
        from my_module.features import encode_categorical

        categories = sample_data['category']
        encoded = encode_categorical(categories)

        # Output has to be numeric.
        assert encoded.dtype in ['int64', 'float64'], \
            "Encoded categories should be numeric"

        # No category may be merged or dropped.
        assert len(encoded.unique()) == categories.nunique(), \
            "Should preserve number of categories"

    def test_rolling_features(self, sample_data):
        """compute_rolling_features must add complete, non-negative window means."""
        from my_module.features import compute_rolling_features

        rolled = compute_rolling_features(
            sample_data,
            column='transaction_amount',
            windows=[7, 30],
        )

        # One output column per requested window.
        for expected_column in ('rolling_mean_7d', 'rolling_mean_30d'):
            assert expected_column in rolled.columns

        weekly_mean = rolled['rolling_mean_7d']
        # No gaps allowed in the 7-day window.
        assert not weekly_mean.isna().any()
        # Means of non-negative amounts cannot be negative.
        assert (weekly_mean >= 0).all()

    def test_feature_interaction(self, sample_data):
        """create_interactions must add the element-wise product column."""
        from my_module.features import create_interactions

        interaction_name = 'transaction_amount_x_age'
        with_interactions = create_interactions(
            sample_data,
            columns=['transaction_amount', 'age'],
        )

        # The interaction column has to be present ...
        assert interaction_name in with_interactions.columns

        # ... and equal to the product of its two source columns.
        product = sample_data['transaction_amount'] * sample_data['age']
        np.testing.assert_array_almost_equal(
            with_interactions[interaction_name].values,
            product.values,
        )

    def test_time_features(self, sample_data):
        """extract_time_features must add hour/dayofweek/month in valid ranges."""
        from my_module.features import extract_time_features

        with_time = extract_time_features(sample_data, 'timestamp')
        valid_ranges = (
            ('hour', 0, 23),
            ('dayofweek', 0, 6),
            ('month', 1, 12),
        )

        # All columns must exist before any range is inspected.
        for name, _, _ in valid_ranges:
            assert name in with_time.columns

        # Every value must lie inside the calendar range of its column.
        for name, lowest, highest in valid_ranges:
            values = with_time[name]
            assert (values >= lowest).all() and (values <= highest).all()

    @pytest.mark.parametrize("invalid_input", [
        None,
        pd.DataFrame(),
        pd.DataFrame({'missing_column': [1, 2, 3]}),
    ])
    def test_invalid_inputs(self, invalid_input):
        """process_features must reject None, empty and mis-shaped frames."""
        from my_module.features import process_features

        with pytest.raises(ValueError):
            process_features(invalid_input)
class TestModelInference:
    """Inference-time checks against a pre-trained model fixture."""

    @pytest.fixture
    def trained_model(self):
        """Deserialize the model stored under tests/fixtures."""
        import joblib

        model = joblib.load('tests/fixtures/model.pkl')
        return model

    @pytest.fixture
    def sample_features(self):
        """Return a three-row feature frame used by the prediction tests."""
        columns = {
            'feature_1': [1.0, 2.0, 3.0],
            'feature_2': [0.5, 1.5, 2.5],
            'feature_3': [10, 20, 30],
        }
        return pd.DataFrame(columns)

    def test_prediction_shape(self, trained_model, sample_features):
        """One prediction per input row, as a flat array."""
        output = trained_model.predict(sample_features)
        assert output.shape == (3,), \
            f"Expected shape (3,), got {output.shape}"

    def test_prediction_range(self, trained_model, sample_features):
        """Every prediction has to fall inside [0, 1]."""
        output = trained_model.predict(sample_features)
        at_least_zero = (output >= 0).all()
        assert at_least_zero and (output <= 1).all(), \
            "Predictions should be in [0, 1] range"

    def test_prediction_deterministic(self, trained_model, sample_features):
        """Two predict calls on the same input must agree."""
        first_run = trained_model.predict(sample_features)
        second_run = trained_model.predict(sample_features)
        np.testing.assert_array_almost_equal(first_run, second_run, decimal=10)

    def test_batch_prediction(self, trained_model):
        """A row scores the same alone as it does as the first row of a batch."""
        batch_features = pd.DataFrame({
            'feature_1': [1.0, 2.0, 3.0],
            'feature_2': [0.5, 1.5, 2.5],
            'feature_3': [10, 20, 30],
        })
        single_features = pd.DataFrame({
            'feature_1': [1.0],
            'feature_2': [0.5],
            'feature_3': [10],
        })

        score_alone = trained_model.predict(single_features)[0]
        score_in_batch = trained_model.predict(batch_features)[0]
        np.testing.assert_almost_equal(score_alone, score_in_batch, decimal=10)
Integration Tests
Pipeline Integration Tests
import pytest
import tempfile
import json
from pathlib import Path
import pandas as pd
import numpy as np
from unittest.mock import Mock, patch
class TestMLPipelineIntegration:
    """Integration tests for ML pipeline.

    The fixtures write a JSON config and a parquet dataset under pytest's
    ``tmp_path``; the tests then drive the pipeline steps imported from
    ``my_module.pipeline`` against those files.
    """

    @pytest.fixture
    def pipeline_config(self, tmp_path):
        """Create test pipeline configuration.

        Returns:
            Path of a ``config.json`` whose ``data_path`` points at the
            directory that ``sample_dataset`` populates.
        """
        config = {
            'data_path': str(tmp_path / 'data'),
            'model_output': str(tmp_path / 'models'),
            'features': ['feature_1', 'feature_2', 'feature_3'],
            'target': 'label',
            'model_params': {
                'n_estimators': 10,
                'max_depth': 3
            }
        }
        config_path = tmp_path / 'config.json'
        with open(config_path, 'w', encoding='utf-8') as f:
            json.dump(config, f)
        return config_path

    @pytest.fixture
    def sample_dataset(self, tmp_path):
        """Create sample dataset for testing.

        The label is sampled from a logistic function of ``feature_1`` and
        ``feature_2``. Labels drawn independently of the features would make
        the ``auc_roc > 0.5`` assertions below a coin flip.

        Returns:
            Path of the directory containing ``training_data.parquet``.
        """
        np.random.seed(42)
        feature_1 = np.random.randn(1000)
        feature_2 = np.random.randn(1000)
        feature_3 = np.random.randint(0, 100, 1000)
        # Learnable signal; the negative intercept keeps positives a minority.
        logits = 1.5 * feature_1 - 1.0 * feature_2 - 0.85
        positive_probability = 1.0 / (1.0 + np.exp(-logits))
        df = pd.DataFrame({
            'feature_1': feature_1,
            'feature_2': feature_2,
            'feature_3': feature_3,
            'label': np.random.binomial(1, positive_probability)
        })
        data_path = tmp_path / 'data'
        data_path.mkdir(exist_ok=True)
        df.to_parquet(data_path / 'training_data.parquet')
        return data_path

    def test_data_loading(self, sample_dataset):
        """Test data loading step."""
        from my_module.pipeline import load_data

        df = load_data(str(sample_dataset / 'training_data.parquet'))
        assert len(df) == 1000
        assert 'feature_1' in df.columns
        assert 'label' in df.columns

    def test_feature_engineering(self, sample_dataset):
        """Test feature engineering step."""
        from my_module.pipeline import load_data, compute_features

        df = load_data(str(sample_dataset / 'training_data.parquet'))
        features = compute_features(df)
        # Check new features created
        assert len(features.columns) > len(df.columns)
        # Check no NaN in features
        assert not features.isna().any().any()

    def test_model_training(self, sample_dataset, pipeline_config):
        """Test model training step."""
        from my_module.pipeline import load_data, compute_features, train_model

        with open(pipeline_config, encoding='utf-8') as f:
            config = json.load(f)
        df = load_data(str(sample_dataset / 'training_data.parquet'))
        features = compute_features(df)
        X = features.drop(columns=[config['target']])
        y = features[config['target']]
        model, metrics = train_model(X, y, config['model_params'])
        assert model is not None
        assert 'auc_roc' in metrics
        assert metrics['auc_roc'] > 0.5

    def test_model_serialization(self, sample_dataset, pipeline_config, tmp_path):
        """Test model can be saved and loaded."""
        from my_module.pipeline import (
            load_data, compute_features, train_model, save_model
        )

        with open(pipeline_config, encoding='utf-8') as f:
            config = json.load(f)
        df = load_data(str(sample_dataset / 'training_data.parquet'))
        features = compute_features(df)
        X = features.drop(columns=[config['target']])
        y = features[config['target']]
        model, _ = train_model(X, y, config['model_params'])

        # Save model
        model_path = tmp_path / 'model.pkl'
        save_model(model, str(model_path))

        # Load and verify
        import joblib
        loaded_model = joblib.load(model_path)

        # Test predictions match
        original_pred = model.predict(X.iloc[:10])
        loaded_pred = loaded_model.predict(X.iloc[:10])
        np.testing.assert_array_almost_equal(original_pred, loaded_pred)

    def test_end_to_end_pipeline(self, sample_dataset, pipeline_config):
        """Test complete pipeline end-to-end."""
        from my_module.pipeline import run_pipeline

        results = run_pipeline(str(pipeline_config))
        assert results['status'] == 'success'
        assert 'metrics' in results
        assert results['metrics']['auc_roc'] > 0.5

    def test_pipeline_idempotency(self, sample_dataset, pipeline_config):
        """Test pipeline produces same results on re-run."""
        from my_module.pipeline import run_pipeline

        results1 = run_pipeline(str(pipeline_config))
        results2 = run_pipeline(str(pipeline_config))
        # Results should be identical (or very close)
        assert abs(
            results1['metrics']['auc_roc'] -
            results2['metrics']['auc_roc']
        ) < 0.01
⚠️
Integration tests should verify the entire pipeline produces correct outputs. Use deterministic data and fix random seeds to ensure reproducible test results.
Model Validation Tests
Comprehensive Model Validation
import numpy as np
import pandas as pd
from sklearn.metrics import (
roc_auc_score, precision_score, recall_score,
f1_score, confusion_matrix, classification_report
)
from typing import Dict, List, Tuple
import json
class ModelValidator:
    """Comprehensive model validation for production deployment.

    Each ``validate_*`` method returns a result dict containing at least the
    keys ``'test'`` and ``'passed'`` and appends it to
    ``self.validation_results``; ``generate_validation_report`` summarises them.
    """

    def __init__(self, model, reference_data: pd.DataFrame = None):
        """Store the model under test and optional reference data.

        Args:
            model: Object exposing ``predict`` (and optionally ``predict_proba``).
            reference_data: Optional baseline data; stored but not read by the
                methods of this class.
        """
        self.model = model
        self.reference_data = reference_data
        self.validation_results = []

    @staticmethod
    def _all_finite(predictions) -> bool:
        """Return True unless numeric predictions contain NaN or +/-inf."""
        values = np.asarray(predictions)
        # Non-numeric outputs (e.g. string class labels) cannot be NaN/inf.
        if not np.issubdtype(values.dtype, np.number):
            return True
        return bool(np.isfinite(values).all())

    def validate_performance(self, X_test: pd.DataFrame,
                             y_test: pd.Series,
                             thresholds: Dict[str, float]) -> Dict:
        """Validate model performance metrics.

        Args:
            X_test: Evaluation features.
            y_test: True labels.
            thresholds: Metric name -> minimum acceptable value. Names that
                are not among the computed metrics are ignored.

        Returns:
            Dict with the computed ``'metrics'``, the ``'thresholds'`` and the
            list of ``'failed_metrics'``.
        """
        y_pred = self.model.predict(X_test)
        # Prefer positive-class probabilities for AUC when the model has them.
        if hasattr(self.model, 'predict_proba'):
            y_score = self.model.predict_proba(X_test)[:, 1]
        else:
            y_score = y_pred

        metrics = {
            'auc_roc': float(roc_auc_score(y_test, y_score)),
            'precision': float(precision_score(y_test, y_pred)),
            'recall': float(recall_score(y_test, y_pred)),
            'f1_score': float(f1_score(y_test, y_pred)),
            'confusion_matrix': confusion_matrix(y_test, y_pred).tolist()
        }

        # Check thresholds
        failed_metrics = [
            {'metric': name, 'value': metrics[name], 'threshold': threshold}
            for name, threshold in thresholds.items()
            if name in metrics and metrics[name] < threshold
        ]

        result = {
            'test': 'performance_validation',
            'passed': not failed_metrics,
            'metrics': metrics,
            'thresholds': thresholds,
            'failed_metrics': failed_metrics
        }
        self.validation_results.append(result)
        return result

    def validate_fairness(self, X_test: pd.DataFrame,
                          y_test: pd.Series,
                          protected_attributes: List[str],
                          max_disparity: float = 0.1) -> Dict:
        """Validate model fairness across protected groups.

        For every protected attribute present in ``X_test`` the AUC is computed
        per group, and the gap between the best and worst group must not
        exceed ``max_disparity``. Groups with fewer than 100 rows, or whose
        labels contain a single class (AUC undefined), are skipped.

        Args:
            X_test: Evaluation features, including the protected attributes.
            y_test: True labels.
            protected_attributes: Column names to group by.
            max_disparity: Largest allowed AUC gap between groups.

        Returns:
            Dict with per-attribute group metrics, disparity and pass flag.
        """
        y_pred = np.asarray(self.model.predict(X_test))

        fairness_results = {
            'test': 'fairness_validation',
            'passed': True,
            'protected_attribute_results': {}
        }

        for attr in protected_attributes:
            if attr not in X_test.columns:
                continue

            group_metrics = {}
            for group in X_test[attr].unique():
                mask = X_test[attr] == group
                group_size = int(mask.sum())
                if group_size < 100:  # Skip small groups
                    continue
                group_labels = y_test[mask]
                # roc_auc_score raises when only one class is present.
                if pd.Series(group_labels).nunique() < 2:
                    continue
                group_auc = roc_auc_score(group_labels, y_pred[np.asarray(mask)])
                group_metrics[group] = {
                    'auc': float(group_auc),
                    'count': group_size
                }

            # Calculate disparity (needs at least two comparable groups)
            if len(group_metrics) > 1:
                aucs = [m['auc'] for m in group_metrics.values()]
                disparity = max(aucs) - min(aucs)
                attribute_passed = bool(disparity <= max_disparity)
                fairness_results['protected_attribute_results'][attr] = {
                    'group_metrics': group_metrics,
                    'disparity': float(disparity),
                    'passed': attribute_passed
                }
                if not attribute_passed:
                    fairness_results['passed'] = False

        self.validation_results.append(fairness_results)
        return fairness_results

    def validate_robustness(self, X_test: pd.DataFrame,
                            y_test: pd.Series,
                            noise_levels: List[float] = None,
                            max_degradation: float = 0.05,
                            random_state=None) -> Dict:
        """Validate model robustness under perturbation.

        Gaussian noise with standard deviation ``noise_level`` is added to the
        numeric columns and the resulting AUC is compared with the baseline.

        Args:
            X_test: Evaluation features.
            y_test: True labels.
            noise_levels: Noise standard deviations to try; defaults to
                ``[0.01, 0.05, 0.1]``.
            max_degradation: An AUC drop at or above this value fails a level.
            random_state: Optional seed for reproducible noise. When None the
                global NumPy random state is used.

        Returns:
            Dict with the baseline AUC and per-noise-level results.
        """
        if noise_levels is None:
            noise_levels = [0.01, 0.05, 0.1]
        rng = None if random_state is None else np.random.default_rng(random_state)

        baseline_pred = self.model.predict(X_test)
        baseline_auc = roc_auc_score(y_test, baseline_pred)

        robustness_results = {
            'test': 'robustness_validation',
            'passed': True,
            'baseline_auc': float(baseline_auc),
            'noise_results': {}
        }

        # Noise can only be added to numeric columns; others stay untouched.
        numeric_columns = X_test.select_dtypes(include=[np.number]).columns
        noise_shape = (len(X_test), len(numeric_columns))

        for noise_level in noise_levels:
            # Add noise to features
            if rng is None:
                noise = np.random.normal(0, noise_level, noise_shape)
            else:
                noise = rng.normal(0, noise_level, noise_shape)
            X_noisy = X_test.copy()
            X_noisy[numeric_columns] = X_test[numeric_columns] + noise

            noisy_pred = self.model.predict(X_noisy)
            noisy_auc = roc_auc_score(y_test, noisy_pred)
            degradation = baseline_auc - noisy_auc
            level_passed = bool(degradation < max_degradation)

            robustness_results['noise_results'][noise_level] = {
                'auc': float(noisy_auc),
                'degradation': float(degradation),
                'passed': level_passed
            }
            if not level_passed:
                robustness_results['passed'] = False

        self.validation_results.append(robustness_results)
        return robustness_results

    def validate_stability(self, X_test: pd.DataFrame,
                           n_splits: int = 5) -> Dict:
        """Validate model prediction stability.

        Predicts on each of ``n_splits`` shuffled folds of ``X_test`` and
        reports the spread of the stacked per-fold predictions.

        Args:
            X_test: Evaluation features.
            n_splits: Number of folds.

        Returns:
            Dict with mean/max prediction std and coefficient of variation;
            fails when the mean std exceeds 0.1.
        """
        from sklearn.model_selection import KFold

        kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
        fold_predictions = [
            np.asarray(self.model.predict(X_test.iloc[val_idx]))
            for _, val_idx in kf.split(X_test)
        ]

        # Folds differ in length by one row when len(X_test) is not a multiple
        # of n_splits; truncate so they stack into a rectangular array.
        shortest = min(len(pred) for pred in fold_predictions)
        predictions_array = np.array([pred[:shortest] for pred in fold_predictions])

        # NOTE(review): row i of each fold is a different sample, so this std
        # compares unrelated rows rather than repeated predictions. Confirm
        # this is the intended stability measure.
        mean_pred = predictions_array.mean(axis=0)
        std_pred = predictions_array.std(axis=0)
        overall_mean = mean_pred.mean()

        stability_results = {
            'test': 'stability_validation',
            'passed': True,
            'mean_prediction_std': float(std_pred.mean()),
            'max_prediction_std': float(std_pred.max()),
            'coefficient_of_variation': float(
                std_pred.mean() / overall_mean
            ) if overall_mean > 0 else float('inf')
        }

        # Check stability threshold
        if stability_results['mean_prediction_std'] > 0.1:
            stability_results['passed'] = False

        self.validation_results.append(stability_results)
        return stability_results

    def validate_data_consistency(self, X_test: pd.DataFrame) -> Dict:
        """Validate model behavior on edge cases.

        Predicts on an all-zeros frame, an all-ones frame and a frame whose
        numeric columns are set to ten times their maximum, and checks that no
        prediction is NaN or infinite.

        Args:
            X_test: Evaluation features; only its shape, columns and per-column
                maxima are used.

        Returns:
            Dict with the predictions per edge case and the validity flag.
        """
        # Test with zeros
        X_zeros = pd.DataFrame(np.zeros(X_test.shape), columns=X_test.columns)
        # Test with ones
        X_ones = pd.DataFrame(np.ones(X_test.shape), columns=X_test.columns)
        # Test with extreme values
        X_extreme = X_test.copy()
        for col in X_extreme.columns:
            column = X_extreme[col]
            if (pd.api.types.is_numeric_dtype(column)
                    and not pd.api.types.is_bool_dtype(column)):
                X_extreme[col] = column.max() * 10

        edge_cases = [
            {'case': case_name, 'predictions': self.model.predict(frame).tolist()}
            for case_name, frame in (
                ('all_zeros', X_zeros),
                ('all_ones', X_ones),
                ('extreme_values', X_extreme),
            )
        ]

        all_valid = all(
            self._all_finite(case['predictions']) for case in edge_cases
        )
        consistency_results = {
            'test': 'data_consistency_validation',
            'passed': all_valid,
            'edge_cases': edge_cases,
            'all_predictions_valid': all_valid
        }
        self.validation_results.append(consistency_results)
        return consistency_results

    def generate_validation_report(self) -> Dict:
        """Generate comprehensive validation report.

        Returns:
            Dict with a timestamp, overall status, pass/fail counts, the raw
            results and recommendations for failed tests.
        """
        # Imported locally because the surrounding import block lacks datetime.
        from datetime import datetime

        passed_count = sum(1 for r in self.validation_results if r['passed'])
        total = len(self.validation_results)

        return {
            'timestamp': datetime.now().isoformat(),
            'overall_status': 'passed' if passed_count == total else 'failed',
            'tests_run': total,
            'tests_passed': passed_count,
            'tests_failed': total - passed_count,
            'results': self.validation_results,
            'recommendations': self._generate_recommendations()
        }

    def _generate_recommendations(self) -> List[str]:
        """Generate recommendations based on validation results.

        Returns:
            One message per failed result whose test name is known.
        """
        advice_by_test = {
            'performance_validation':
                "Model performance below threshold. Consider feature engineering or hyperparameter tuning.",
            'fairness_validation':
                "Fairness constraints violated. Review training data for bias and consider fairness-aware algorithms.",
            'robustness_validation':
                "Model lacks robustness. Consider adversarial training or input validation.",
            'stability_validation':
                "Predictions are unstable across data folds. Investigate high-variance features or regularize the model.",
            'data_consistency_validation':
                "Model produced invalid predictions on edge cases. Add input validation or clip extreme feature values.",
        }
        return [
            advice_by_test[result['test']]
            for result in self.validation_results
            if not result['passed'] and result['test'] in advice_by_test
        ]
Model Cards
Automated Model Card Generation
import json
from datetime import datetime
from typing import Dict, List, Optional
import pandas as pd
class ModelCardGenerator:
    """Generate comprehensive model cards.

    A model card is a plain dict (see ``generate_card``) that can be exported
    as Markdown or JSON with ``export_card``.
    """

    def __init__(self, model_name: str, model_version: str, model=None):
        """Store the model identity.

        Args:
            model_name: Name shown in the card.
            model_version: Version shown in the card.
            model: Optional model object; only its class name is used, as the
                card's model type.
        """
        self.model_name = model_name
        self.model_version = model_version
        self.model = model

    def generate_card(self,
                      model_description: str,
                      training_data: pd.DataFrame,
                      metrics: Dict,
                      fairness_results: Dict = None,
                      limitations: List[str] = None,
                      ethical_considerations: List[str] = None) -> Dict:
        """Generate complete model card.

        Args:
            model_description: Free-text description of the model.
            training_data: Training frame; its length and column names are
                recorded in the card.
            metrics: Evaluation metrics, stored as-is.
            fairness_results: Optional fairness analysis; defaults to ``{}``.
            limitations: Optional limitations; falsy values fall back to the
                built-in defaults.
            ethical_considerations: Optional considerations; falsy values fall
                back to the built-in defaults.

        Returns:
            The model card as a nested dict.
        """
        card = {
            'model_details': {
                'name': self.model_name,
                'version': self.model_version,
                'type': self._get_model_type(),
                'description': model_description,
                'owner': self._get_model_owner(),
                'created_at': datetime.now().isoformat(),
            },
            'intended_use': {
                'primary_use_cases': self._get_primary_use_cases(),
                'out_of_scope_uses': self._get_out_of_scope_uses(),
                'users': self._get_intended_users(),
            },
            'training_data': {
                'description': self._get_data_description(training_data),
                'size': len(training_data),
                'features': list(training_data.columns),
                'preprocessing': self._get_preprocessing_steps(),
                'data_collection': self._get_data_collection_process(),
            },
            'evaluation_data': {
                'description': self._get_eval_data_description(),
                'metrics': metrics,
            },
            'performance_metrics': metrics,
            'fairness_analysis': fairness_results or {},
            'limitations': limitations or self._get_default_limitations(),
            'ethical_considerations': ethical_considerations or self._get_ethical_considerations(),
            'recommendations': self._generate_recommendations(metrics),
        }
        return card

    def _get_model_type(self) -> str:
        """Determine model type.

        Returns:
            The class name of the model passed to ``__init__``, or
            ``"Unknown"`` when no model was supplied.
        """
        # getattr guards instances created without the ``model`` attribute.
        model = getattr(self, 'model', None)
        if model is None:
            return "Unknown"
        return type(model).__name__

    def _get_model_owner(self) -> str:
        """Get model owner (currently a fixed placeholder)."""
        return "ML Team"

    def _get_primary_use_cases(self) -> List[str]:
        """Define primary use cases."""
        return [
            "Production inference for real-time predictions",
            "Batch processing for offline analysis",
        ]

    def _get_out_of_scope_uses(self) -> List[str]:
        """Define out-of-scope uses."""
        return [
            "Life-critical decisions without human oversight",
            "Surveillance or profiling of individuals",
        ]

    def _get_intended_users(self) -> List[str]:
        """Define intended users."""
        return [
            "ML engineers for model deployment",
            "Data scientists for model analysis",
            "Product managers for feature integration",
        ]

    def _get_data_description(self, data: pd.DataFrame) -> str:
        """Generate data description from the frame's row and column counts."""
        # Built line by line so the class indentation cannot leak into the text.
        return (
            "\n"
            f"Training dataset with {len(data)} samples and {len(data.columns)} features.\n"
            "Data collected from production systems over the last 90 days.\n"
            "Features include numerical and categorical variables.\n"
        )

    def _get_preprocessing_steps(self) -> List[str]:
        """List preprocessing steps."""
        return [
            "Missing value imputation using median",
            "Categorical encoding using one-hot encoding",
            "Feature scaling using standard scaler",
        ]

    def _get_data_collection_process(self) -> str:
        """Describe data collection process."""
        return (
            "\n"
            "Data collected from production user interactions.\n"
            "All data anonymized and compliant with privacy regulations.\n"
            "No personally identifiable information (PII) included.\n"
        )

    def _get_eval_data_description(self) -> str:
        """Describe evaluation data."""
        return (
            "\n"
            "Evaluation performed on held-out test set representing\n"
            "recent production data distribution.\n"
        )

    def _get_default_limitations(self) -> List[str]:
        """Get default model limitations."""
        return [
            "Model trained on historical data, may not generalize to new patterns",
            "Performance may degrade on underrepresented groups",
            "Not suitable for extreme edge cases outside training distribution",
        ]

    def _get_ethical_considerations(self) -> List[str]:
        """Get ethical considerations."""
        return [
            "Model reviewed for fairness across protected attributes",
            "Regular bias audits scheduled quarterly",
            "Human oversight required for high-stakes decisions",
        ]

    def _generate_recommendations(self, metrics: Dict) -> List[str]:
        """Generate recommendations based on metrics.

        Args:
            metrics: Reads ``'auc_roc'`` (default 0) and
                ``'fairness_disparity'`` (default 0).

        Returns:
            Recommendation strings; the monitoring advice is always included.
        """
        recommendations = []
        if metrics.get('auc_roc', 0) < 0.9:
            recommendations.append(
                "Consider feature engineering or model tuning to improve performance"
            )
        if metrics.get('fairness_disparity', 0) > 0.1:
            recommendations.append(
                "Address fairness disparities through data balancing or algorithmic fairness techniques"
            )
        recommendations.append(
            "Monitor model performance in production and schedule regular retraining"
        )
        return recommendations

    def export_card(self, card: Dict, format: str = 'markdown') -> str:
        """Export model card in specified format.

        Args:
            card: Card produced by ``generate_card``.
            format: ``'markdown'`` or ``'json'``.

        Returns:
            The rendered card.

        Raises:
            ValueError: If ``format`` is not supported.
        """
        if format == 'markdown':
            return self._to_markdown(card)
        if format == 'json':
            # default=str stringifies values json cannot encode natively.
            return json.dumps(card, indent=2, default=str)
        raise ValueError(f"Unsupported format: {format}")

    def _to_markdown(self, card: Dict) -> str:
        """Convert model card to markdown."""
        def bullets(items) -> str:
            return "\n".join(f"- {item}" for item in items)

        details = card['model_details']
        intended_use = card['intended_use']
        training = card['training_data']

        # Column names may be non-strings (e.g. ints); show at most ten.
        feature_names = [str(name) for name in training['features']]
        feature_preview = ', '.join(feature_names[:10])
        if len(feature_names) > 10:
            feature_preview += '...'

        lines = [
            "",
            f"# Model Card: {details['name']}",
            f"**Version:** {details['version']}",
            f"**Description:** {details['description']}",
            "## Intended Use",
            "**Primary Use Cases:**",
            bullets(intended_use['primary_use_cases']),
            "**Out-of-Scope Uses:**",
            bullets(intended_use['out_of_scope_uses']),
            "## Training Data",
            f"{training['description']}",
            f"**Size:** {training['size']} samples",
            f"**Features:** {feature_preview}",
            "## Performance Metrics",
            self._format_metrics(card['performance_metrics']),
            "## Limitations",
            bullets(card['limitations']),
            "## Ethical Considerations",
            bullets(card['ethical_considerations']),
            "## Recommendations",
            bullets(card['recommendations']),
            "",
        ]
        return "\n".join(lines)

    def _format_metrics(self, metrics: Dict) -> str:
        """Format metrics for markdown: floats to 4 decimals, others as-is."""
        lines = [
            f"- **{metric}:** {value:.4f}" if isinstance(value, float)
            else f"- **{metric}:** {value}"
            for metric, value in metrics.items()
        ]
        return "\n".join(lines)
ℹ️
Model cards are essential for responsible AI deployment. They provide transparency about model capabilities, limitations, and intended use. Automate model card generation to ensure consistency across all models.
Summary
ML testing requires a comprehensive approach:
- Data Validation: Schema, statistics, and distribution checks
- Unit Tests: Feature transforms and model components
- Integration Tests: End-to-end pipeline validation
- Model Validation: Performance, fairness, and robustness
- Model Cards: Documentation for responsible AI
Implement testing at every stage of the ML lifecycle to ensure reliable production systems.