Advanced Ensemble Methods
Ensembles combine multiple models to produce predictions that are stronger than those of any individual model. The best Kaggle solutions and production systems almost always use ensemble methods – understanding them deeply is essential for any data scientist.
Ensemble Methods Comparison
The Ensemble Principle
The wisdom of crowds applies to models: combining diverse, reasonably accurate models reduces overall error – bagging mainly lowers variance, while boosting mainly lowers bias. The key is diversity – models that make different errors.
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import (
RandomForestClassifier, GradientBoostingClassifier,
VotingClassifier, StackingClassifier, BaggingClassifier,
AdaBoostClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
Generate Dataset
# Synthetic binary-classification problem: 3000 rows and 30 features, of which
# 15 are informative and 5 are redundant; flip_y=0.03 injects label noise.
raw_features, y = make_classification(
    n_samples=3000,
    n_features=30,
    n_informative=15,
    n_redundant=5,
    flip_y=0.03,
    random_state=42,
)
feature_names = [f'f{idx}' for idx in range(raw_features.shape[1])]
X = pd.DataFrame(raw_features, columns=feature_names)

# One shuffled, stratified 5-fold splitter shared by the experiments that follow,
# so every model is scored on the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

class_share = np.bincount(y) / len(y)
print(f"Dataset: {X.shape}, class balance: {class_share}")
Bagging Methods
Bagging trains multiple models on bootstrap samples and averages their predictions.
# Bagging with decision trees (BaggingClassifier's default base learner):
# each of the 100 estimators sees a random 80%-sized sample of the rows and
# 80% of the features.
bagging = BaggingClassifier(
    n_estimators=100,
    max_samples=0.8,
    max_features=0.8,
    random_state=42,
    n_jobs=-1,
)

# Random Forest – bagging with feature randomization
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
)

# Score both models on the shared CV folds, bagging first.
scores_bag = cross_val_score(bagging, X, y, scoring='roc_auc', cv=cv)
bag_mean, bag_std = scores_bag.mean(), scores_bag.std()
print(f"Bagging AUC: {bag_mean:.4f} ± {bag_std:.4f}")

scores_rf = cross_val_score(rf, X, y, scoring='roc_auc', cv=cv)
rf_mean, rf_std = scores_rf.mean(), scores_rf.std()
print(f"Random Forest AUC: {rf_mean:.4f} ± {rf_std:.4f}")
Boosting Methods
Boosting trains models sequentially, each correcting the errors of the previous.
# Gradient Boosting (sklearn): 200 depth-5 trees, each fit on an 80% row
# subsample (stochastic gradient boosting).
gb_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    random_state=42,
)
gb = GradientBoostingClassifier(**gb_params)
scores_gb = cross_val_score(gb, X, y, scoring='roc_auc', cv=cv)
gb_mean, gb_std = scores_gb.mean(), scores_gb.std()
print(f"Gradient Boosting AUC: {gb_mean:.4f} ± {gb_std:.4f}")

# AdaBoost: 200 boosting rounds with a reduced learning rate.
ada_params = dict(n_estimators=200, learning_rate=0.1, random_state=42)
ada = AdaBoostClassifier(**ada_params)
scores_ada = cross_val_score(ada, X, y, scoring='roc_auc', cv=cv)
ada_mean, ada_std = scores_ada.mean(), scores_ada.std()
print(f"AdaBoost AUC: {ada_mean:.4f} ± {ada_std:.4f}")
XGBoost
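XGBoost is optimized for speed and performance, with built-in regularization and native handling of missing values. It minimizes a regularized objective that combines loss and complexity:
$$\mathcal{L} = \sum_{i} l(y_i, \hat{y}_i) + \sum_{k} \Omega(f_k), \qquad \Omega(f) = \gamma T + \tfrac{1}{2} \lambda \lVert w \rVert^2$$
where $l$ is the loss function and $\Omega$ penalizes tree complexity with $T$ leaves and leaf weights $w$.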
# Regularised XGBoost configuration: row/column subsampling plus L1/L2
# penalties (reg_alpha / reg_lambda) and split constraints
# (min_child_weight / gamma).
xgb_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    min_child_weight=3,
    gamma=0.1,
    random_state=42,
    eval_metric='logloss',
    n_jobs=-1,
)
xgb_model = xgb.XGBClassifier(**xgb_params)
scores_xgb = cross_val_score(xgb_model, X, y, scoring='roc_auc', cv=cv)
xgb_mean, xgb_std = scores_xgb.mean(), scores_xgb.std()
print(f"XGBoost AUC: {xgb_mean:.4f} ± {xgb_std:.4f}")

# XGBoost with early stopping: the first 2400 rows train the model and the
# remaining rows are the validation set that decides when to stop.
split_at = 2400
X_train, y_train = X.iloc[:split_at], y[:split_at]
X_val, y_val = X.iloc[split_at:], y[split_at:]

xgb_early = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    early_stopping_rounds=50,
    random_state=42,
    eval_metric='logloss',
)
validation_sets = [(X_val, y_val)]
xgb_early.fit(X_train, y_train, eval_set=validation_sets, verbose=False)
print(f"Best iteration: {xgb_early.best_iteration}")
LightGBM
LightGBM uses histogram-based splitting for faster training on large datasets.
# LightGBM baseline. `subsample` (bagging_fraction) is only applied when
# `subsample_freq` is non-zero, so it is set to 1 to make the 0.8 row
# subsampling actually take effect.
lgb_model = lgb.LGBMClassifier(
    n_estimators=300, learning_rate=0.05, max_depth=8,
    num_leaves=63, subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
    reg_alpha=0.1, reg_lambda=1.0,
    min_child_samples=20, random_state=42, n_jobs=-1, verbose=-1
)
scores_lgb = cross_val_score(lgb_model, X, y, cv=cv, scoring='roc_auc')
print(f"LightGBM AUC: {scores_lgb.mean():.4f} ± {scores_lgb.std():.4f}")

# LightGBM with categorical features
# A seeded generator keeps the synthetic column reproducible between runs.
lgb_rng = np.random.default_rng(42)
X_lgb = X.copy()
# LightGBM rejects object-dtype (string) pandas columns; the labels must be
# stored with the pandas `category` dtype to be consumed natively.
X_lgb['cat_feature'] = pd.Categorical(lgb_rng.choice(['A', 'B', 'C'], len(X)))
lgb_model_cat = lgb.LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)
lgb_model_cat.fit(X_lgb, y, categorical_feature=['cat_feature'])
print("LightGBM handles categorical features natively")
CatBoost
CatBoost excels with categorical features and reduces overfitting through ordered boosting.
from catboost import CatBoostClassifier, Pool

# CatBoost with native categorical support
# A seeded generator keeps the synthetic categorical columns reproducible.
cat_rng = np.random.default_rng(42)
X_cat = X.copy()
X_cat['category'] = cat_rng.choice(['electronics', 'clothing', 'food'], len(X))
X_cat['subcategory'] = cat_rng.choice(['a', 'b', 'c', 'd'], len(X))
cat_features = ['category', 'subcategory']

# `cat_features` is given to the constructor rather than to `.fit()`:
# cross_val_score clones the estimator and calls `.fit(X, y)` with no extra
# arguments, so a fit-only setting would be lost and every fold would fail on
# the string columns.
cat_model = CatBoostClassifier(
    iterations=300, learning_rate=0.05, depth=6,
    l2_leaf_reg=3, random_seed=42, verbose=0,
    cat_features=cat_features
)
cat_model.fit(X_cat, y, verbose=0)
scores_cat = cross_val_score(cat_model, X_cat, y, cv=cv, scoring='roc_auc')
print(f"CatBoost AUC: {scores_cat.mean():.4f} ± {scores_cat.std():.4f}")
Voting Ensembles
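Voting combines models either by majority vote over predicted labels (hard voting) or by averaging predicted probabilities, optionally with per-model weights (soft voting).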
def _voting_members():
    """Return a fresh (name, estimator) list: RF, sklearn GB and XGBoost."""
    return [
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
        ('xgb', xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')),
    ]


# Hard voting (majority rule). It is scored with accuracy because a
# hard-voting ensemble outputs labels only, not probabilities.
voting_hard = VotingClassifier(estimators=_voting_members(), voting='hard')
scores_vh = cross_val_score(voting_hard, X, y, scoring='accuracy', cv=cv)
hard_mean = scores_vh.mean()
print(f"Hard Voting Accuracy: {hard_mean:.4f}")

# Soft voting (probability averaging)
member_weights = [1, 2, 3]  # Weight XGBoost more heavily
voting_soft = VotingClassifier(
    estimators=_voting_members(),
    voting='soft',
    weights=member_weights,
)
scores_vs = cross_val_score(voting_soft, X, y, scoring='roc_auc', cv=cv)
soft_mean = scores_vs.mean()
print(f"Soft Voting AUC: {soft_mean:.4f}")
Stacking Ensembles
Stacking trains a meta-learner on the predictions of base models.
# Simple stacking: four tree ensembles feed a logistic-regression meta-learner,
# which sees only their cross-validated predictions (passthrough=False).
stack_base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('xgb', xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')),
    ('lgb', lgb.LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)),
]
stacking = StackingClassifier(
    estimators=stack_base_learners,
    final_estimator=LogisticRegression(C=1.0),
    cv=5,
    passthrough=False,
    n_jobs=-1,
)
scores_stack = cross_val_score(stacking, X, y, scoring='roc_auc', cv=cv)
stack_mean, stack_std = scores_stack.mean(), scores_stack.std()
print(f"Stacking AUC: {stack_mean:.4f} ± {stack_std:.4f}")

# Stacking with passthrough (includes original features in meta-learner)
passthrough_base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')),
]
passthrough_meta = GradientBoostingClassifier(n_estimators=50, random_state=42)
stacking_passthrough = StackingClassifier(
    estimators=passthrough_base_learners,
    final_estimator=passthrough_meta,
    cv=5,
    passthrough=True,
)
scores_pt = cross_val_score(stacking_passthrough, X, y, scoring='roc_auc', cv=cv)
pt_mean = scores_pt.mean()
print(f"Stacking (passthrough) AUC: {pt_mean:.4f}")
Manual Blending
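Blending is a simpler alternative to stacking: instead of out-of-fold predictions, the meta-learner is trained on the base models' predictions for a single holdout set.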
# roc_auc_score is imported before the loop below, which already uses it.
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Split into train (60%), blend (20%), and test (20%) sets.
# First hold out the test set, then carve the blend set out of the remainder
# (0.25 of the remaining 80% equals 20% of the full data).
X_fit, X_test, y_fit, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_blend, y_train, y_blend = train_test_split(
    X_fit, y_fit, test_size=0.25, random_state=42, stratify=y_fit
)

# Train base models on training set
base_models = {
    'rf': RandomForestClassifier(n_estimators=100, random_state=42),
    'gb': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'xgb': xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')
}

blend_features = []  # base-model probabilities on the blend set (meta training data)
test_features = []   # base-model probabilities on the test set (meta evaluation data)
for name, model in base_models.items():
    model.fit(X_train, y_train)
    blend_pred = model.predict_proba(X_blend)[:, 1]
    blend_features.append(blend_pred)
    test_features.append(model.predict_proba(X_test)[:, 1])
    print(f"{name} blend AUC: {roc_auc_score(y_blend, blend_pred):.4f}")

# Stack predictions column-wise: one column per base model
X_meta = np.column_stack(blend_features)
X_meta_test = np.column_stack(test_features)

# Train meta-learner on the blend-set predictions only
meta_learner = LogisticRegression(C=1.0)
meta_learner.fit(X_meta, y_blend)

# Final predictions are scored on the untouched test set; scoring on the blend
# set would evaluate the meta-learner on its own training data.
final_pred = meta_learner.predict_proba(X_meta_test)[:, 1]
print(f"Blending AUC: {roc_auc_score(y_test, final_pred):.4f}")
Advanced Stacking with Out-of-Fold Predictions
from sklearn.model_selection import KFold
def stacking_oof(X, y, models, meta_model, n_folds=5):
    """Out-of-fold stacking for robust generalization.

    Each base model is fit on the training part of every fold and predicts
    the held-out part, so each row of the meta-feature matrix comes from a
    model that did not see that row during fitting.

    Args:
        X: Feature matrix; a pandas DataFrame or a NumPy array.
        y: Binary target, array-like with one entry per row of ``X``.
        models: Sequence of ``(name, estimator)`` pairs. Every estimator must
            provide ``fit`` and ``predict_proba``; they are refit in place.
        meta_model: Estimator with ``fit`` and ``predict_proba``, trained on
            the out-of-fold predictions. It is fitted in place.
        n_folds: Number of shuffled K-fold splits.

    Returns:
        Tuple ``(meta_learner, oof_predictions, score)``:
        ``meta_learner`` is ``meta_model`` fitted on all out-of-fold
        predictions, ``oof_predictions`` has shape ``(len(X), len(models))``
        and holds the positive-class probabilities, and ``score`` is the
        ROC AUC of the meta-learner's own out-of-fold predictions.
    """
    # Positional indexing must work even when y is a pandas Series whose
    # index is not 0..n-1.
    y = np.asarray(y)
    # DataFrames need .iloc for positional row selection; arrays index directly.
    rows = X.iloc if hasattr(X, "iloc") else X

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    folds = list(kf.split(X))

    oof_predictions = np.zeros((len(X), len(models)))
    for train_idx, val_idx in folds:
        X_train, X_val = rows[train_idx], rows[val_idx]
        y_train = y[train_idx]
        for i, (_name, model) in enumerate(models):
            model.fit(X_train, y_train)
            oof_predictions[val_idx, i] = model.predict_proba(X_val)[:, 1]

    # Get CV score: score the meta-learner out-of-fold too. Predicting on the
    # rows it was trained on would report an optimistic in-sample AUC.
    meta_pred = np.zeros(len(X))
    for train_idx, val_idx in folds:
        meta_model.fit(oof_predictions[train_idx], y[train_idx])
        meta_pred[val_idx] = meta_model.predict_proba(oof_predictions[val_idx])[:, 1]
    score = roc_auc_score(y, meta_pred)

    # Train the final meta-learner on all OOF predictions.
    meta_learner = meta_model
    meta_learner.fit(oof_predictions, y)
    return meta_learner, oof_predictions, score
# Base learners for the out-of-fold stacking run, as (name, estimator) pairs.
models = [
    ('rf', RandomForestClassifier(random_state=42, n_estimators=100)),
    ('gb', GradientBoostingClassifier(random_state=42, n_estimators=100)),
    ('xgb', xgb.XGBClassifier(random_state=42, n_estimators=100, eval_metric='logloss')),
]

# A default logistic regression serves as the meta-learner.
oof_result = stacking_oof(X, y, models, LogisticRegression())
meta_learner, oof_preds, score = oof_result
print(f"OOF Stacking AUC: {score:.4f}")
Comparing All Approaches
# Mean cross-validated AUC of each model, collected from the runs above.
results = {
    'Random Forest': scores_rf.mean(),
    'Gradient Boosting': scores_gb.mean(),
    'XGBoost': scores_xgb.mean(),
    'LightGBM': scores_lgb.mean(),
    'Soft Voting': scores_vs.mean(),
    'Stacking': scores_stack.mean(),
}

# One row per model with a single 'AUC' column, best model first.
results_df = pd.Series(results, name='AUC').to_frame()
results_df = results_df.sort_values('AUC', ascending=False)

print("Model Comparison:")
print(results_df.to_string())
Best Practices
- Diversity is key – combine models with different inductive biases
- Use OOF predictions – prevents leakage in stacking
- XGBoost/LightGBM/CatBoost – start with these; they dominate tabular data
- Don't over-ensemble – 3-5 well-tuned models often beat 20 mediocre ones
- Weight by performance – give better models more influence in voting
- Validate carefully – ensemble gains can be illusory without proper CV
Summary
Ensemble methods – bagging, boosting, stacking, and blending – are the most reliable way to maximize predictive performance. XGBoost, LightGBM, and CatBoost are the workhorses of tabular data, while stacking combines their strengths. Master these techniques and you'll consistently build top-performing models.