Advanced Ensemble Methods

Ensembles combine multiple models to produce predictions that are stronger than those of any individual model. The best Kaggle solutions and production systems almost always use ensemble methods – understanding them deeply is essential for any data scientist.

Ensemble Methods Comparison

The Ensemble Principle

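The wisdom of crowds applies to models: diverse, reasonably accurate models, when combined, make fewer errors than any one of them alone. Averaging-style ensembles (bagging, voting) mainly reduce variance, while boosting mainly reduces bias. The key is diversity – models that make different errors.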

import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    VotingClassifier, StackingClassifier, BaggingClassifier,
    AdaBoostClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

Generate Dataset

# Synthetic classification data: 3000 rows and 30 features, of which 15 are
# informative and 5 redundant; 3% of the labels are flipped as noise.
feature_names = [f'f{i}' for i in range(30)]
raw_features, y = make_classification(
    n_samples=3000,
    n_features=30,
    n_informative=15,
    n_redundant=5,
    flip_y=0.03,
    random_state=42,
)
# Wrap the array in a DataFrame so later snippets can use .iloc and add columns
X = pd.DataFrame(raw_features, columns=feature_names)

# One shuffled, stratified 5-fold splitter shared by every cross-validation run
cv = StratifiedKFold(shuffle=True, random_state=42, n_splits=5)
print(f"Dataset: {X.shape}, class balance: {np.bincount(y)/len(y)}")

Bagging Methods

Bagging trains multiple models on bootstrap samples and averages their predictions.

# Bagging with the default base estimator: 100 members, each fitted on a
# sample of 80% of the rows and 80% of the features.
bagging_params = {
    'n_estimators': 100,
    'max_samples': 0.8,
    'max_features': 0.8,
    'random_state': 42,
    'n_jobs': -1,
}
bagging = BaggingClassifier(**bagging_params)
# Per-fold ROC AUC on the shared splitter
scores_bag = cross_val_score(bagging, X, y, scoring='roc_auc', cv=cv)
print(f"Bagging AUC: {scores_bag.mean():.4f} ± {scores_bag.std():.4f}")

# Random Forest – bagged trees that additionally randomize the features
# considered at each split. Depth is capped at 10 and a node needs at least
# 5 samples before it may be split.
rf_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'min_samples_split': 5,
    'random_state': 42,
    'n_jobs': -1,
}
rf = RandomForestClassifier(**rf_params)
scores_rf = cross_val_score(rf, X, y, scoring='roc_auc', cv=cv)
print(f"Random Forest AUC: {scores_rf.mean():.4f} ± {scores_rf.std():.4f}")

Boosting Methods

Boosting trains models sequentially, each correcting the errors of the previous.

# Gradient Boosting (sklearn): 200 depth-5 trees at learning rate 0.1, each
# fitted on an 80% row subsample (stochastic gradient boosting).
gb_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 5,
    'subsample': 0.8,
    'random_state': 42,
}
gb = GradientBoostingClassifier(**gb_params)
scores_gb = cross_val_score(gb, X, y, scoring='roc_auc', cv=cv)
print(f"Gradient Boosting AUC: {scores_gb.mean():.4f} ± {scores_gb.std():.4f}")

# AdaBoost: 200 boosting rounds with the contribution of each round
# shrunk by a learning rate of 0.1.
ada_params = {'n_estimators': 200, 'learning_rate': 0.1, 'random_state': 42}
ada = AdaBoostClassifier(**ada_params)
scores_ada = cross_val_score(ada, X, y, scoring='roc_auc', cv=cv)
print(f"AdaBoost AUC: {scores_ada.mean():.4f} ± {scores_ada.std():.4f}")

XGBoost

XGBoost is optimized for speed and performance with regularization and handling missing values. It minimizes a regularized objective that combines loss and complexity:

$$\mathcal{L} = \sum_{i} l(y_i, \hat{y}_i) + \sum_{k} \Omega(f_k)$$

where $l$ is the loss function and $\Omega(f_k) = \gamma T + \frac{1}{2}\lambda \sum_{j=1}^{T} w_j^2$ penalizes the complexity of tree $f_k$, which has $T$ leaves with weights $w_j$.

# XGBoost hyperparameters, grouped by the role they play
xgb_params = {
    # ensemble size and tree shape
    'n_estimators': 300,
    'learning_rate': 0.05,
    'max_depth': 6,
    # row / column subsampling per tree
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    # regularization: L1, L2, minimum child weight, minimum split gain
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'min_child_weight': 3,
    'gamma': 0.1,
    # reproducibility and runtime
    'random_state': 42,
    'eval_metric': 'logloss',
    'n_jobs': -1,
}
xgb_model = xgb.XGBClassifier(**xgb_params)
scores_xgb = cross_val_score(xgb_model, X, y, scoring='roc_auc', cv=cv)
print(f"XGBoost AUC: {scores_xgb.mean():.4f} ± {scores_xgb.std():.4f}")

# XGBoost with early stopping
# Positional hold-out: the first 2400 rows train the model, the remaining
# rows are the validation set that drives early stopping.
X_train = X.iloc[:2400]
X_val = X.iloc[2400:]
y_train = y[:2400]
y_val = y[2400:]

# Allow up to 1000 trees, but stop once the validation log-loss has not
# improved for 50 consecutive rounds.
xgb_early = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    early_stopping_rounds=50,
    eval_metric='logloss',
    random_state=42,
)
validation_sets = [(X_val, y_val)]
xgb_early.fit(X_train, y_train, eval_set=validation_sets, verbose=False)
print(f"Best iteration: {xgb_early.best_iteration}")

LightGBM

LightGBM uses histogram-based splitting for faster training on large datasets.

# LightGBM hyperparameters, grouped by the role they play
lgb_params = {
    # ensemble size and tree shape
    'n_estimators': 300,
    'learning_rate': 0.05,
    'max_depth': 8,
    'num_leaves': 63,
    # row / column subsampling
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    # regularization
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'min_child_samples': 20,
    # reproducibility, parallelism, and silenced logging
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1,
}
lgb_model = lgb.LGBMClassifier(**lgb_params)
scores_lgb = cross_val_score(lgb_model, X, y, scoring='roc_auc', cv=cv)
print(f"LightGBM AUC: {scores_lgb.mean():.4f} ± {scores_lgb.std():.4f}")

# LightGBM with categorical features
X_lgb = X.copy()
# Seeded generator so the synthetic column is identical on every run
rng = np.random.default_rng(42)
# LightGBM rejects object/string columns in a DataFrame; the column must use
# the pandas 'category' dtype for the native categorical handling to apply.
X_lgb['cat_feature'] = pd.Categorical(rng.choice(['A', 'B', 'C'], size=len(X)))
lgb_model_cat = lgb.LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)
lgb_model_cat.fit(X_lgb, y, categorical_feature=['cat_feature'])
print("LightGBM handles categorical features natively")

CatBoost

CatBoost excels with categorical features and reduces overfitting through ordered boosting.

from catboost import CatBoostClassifier, Pool

# CatBoost with native categorical support
X_cat = X.copy()
# Seeded generator so the synthetic categorical columns are reproducible
rng = np.random.default_rng(42)
X_cat['category'] = rng.choice(['electronics', 'clothing', 'food'], size=len(X))
X_cat['subcategory'] = rng.choice(['a', 'b', 'c', 'd'], size=len(X))

cat_features = ['category', 'subcategory']

# cat_features is given to the constructor rather than to fit(): cross_val_score
# clones the estimator from its constructor parameters, so anything passed only
# to fit() is lost and the clones would try to treat the string columns as numeric.
cat_model = CatBoostClassifier(
    iterations=300, learning_rate=0.05, depth=6,
    l2_leaf_reg=3, random_seed=42, verbose=0,
    cat_features=cat_features
)
cat_model.fit(X_cat, y)
scores_cat = cross_val_score(cat_model, X_cat, y, cv=cv, scoring='roc_auc')
print(f"CatBoost AUC: {scores_cat.mean():.4f} ± {scores_cat.std():.4f}")

Voting Ensembles

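Voting combines models either by majority rule over predicted labels (hard voting) or by averaging predicted probabilities, optionally with per-model weights (soft voting).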

# Hard voting (majority rule)
# Each member predicts a class label; the ensemble returns the most common one.
hard_members = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('xgb', xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')),
]
voting_hard = VotingClassifier(estimators=hard_members, voting='hard')
# Scored with accuracy because hard voting yields labels, not probabilities
scores_vh = cross_val_score(voting_hard, X, y, scoring='accuracy', cv=cv)
print(f"Hard Voting Accuracy: {scores_vh.mean():.4f}")

# Soft voting (probability averaging)
soft_members = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('xgb', xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')),
]
# Weights follow the order of soft_members, so XGBoost counts the most
member_weights = [1, 2, 3]
voting_soft = VotingClassifier(
    estimators=soft_members,
    voting='soft',
    weights=member_weights,
)
scores_vs = cross_val_score(voting_soft, X, y, scoring='roc_auc', cv=cv)
print(f"Soft Voting AUC: {scores_vs.mean():.4f}")

Stacking Ensembles

Stacking trains a meta-learner on the predictions of base models.

# Simple stacking
# Four tree-based base learners feed a logistic-regression meta-learner.
stack_members = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('xgb', xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')),
    ('lgb', lgb.LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)),
]
meta_clf = LogisticRegression(C=1.0)
# passthrough=False: the meta-learner sees only the base-model predictions
stacking = StackingClassifier(
    estimators=stack_members,
    final_estimator=meta_clf,
    cv=5,
    n_jobs=-1,
    passthrough=False,
)
scores_stack = cross_val_score(stacking, X, y, scoring='roc_auc', cv=cv)
print(f"Stacking AUC: {scores_stack.mean():.4f} ± {scores_stack.std():.4f}")

# Stacking with passthrough (includes original features in meta-learner)
passthrough_members = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')),
]
# A small gradient-boosting model serves as the meta-learner here
passthrough_meta = GradientBoostingClassifier(n_estimators=50, random_state=42)
stacking_passthrough = StackingClassifier(
    estimators=passthrough_members,
    final_estimator=passthrough_meta,
    cv=5,
    passthrough=True,
)
scores_pt = cross_val_score(stacking_passthrough, X, y, scoring='roc_auc', cv=cv)
print(f"Stacking (passthrough) AUC: {scores_pt.mean():.4f}")

Manual Blending

Blending is a simpler alternative to stacking using a holdout set.

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Split into train, blend, and test sets (60% / 20% / 20%).
# Base models learn on train, the meta-learner learns on blend, and test is
# touched only for the final score so that score is not inflated by leakage.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)
X_blend, X_test, y_blend, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42, stratify=y_holdout
)

# Train base models on training set
base_models = {
    'rf': RandomForestClassifier(n_estimators=100, random_state=42),
    'gb': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'xgb': xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')
}

blend_features = []  # positive-class probabilities on the blend set, one array per model
test_features = []   # the same for the test set, in the same model order
for name, model in base_models.items():
    model.fit(X_train, y_train)
    blend_pred = model.predict_proba(X_blend)[:, 1]
    blend_features.append(blend_pred)
    test_features.append(model.predict_proba(X_test)[:, 1])
    print(f"{name} blend AUC: {roc_auc_score(y_blend, blend_pred):.4f}")

# Stack predictions column-wise: one column per base model
X_meta = np.column_stack(blend_features)
X_meta_test = np.column_stack(test_features)

# Train meta-learner on the blend set only
meta_learner = LogisticRegression(C=1.0)
meta_learner.fit(X_meta, y_blend)

# Final predictions, scored on data neither the base models nor the meta-learner saw
final_pred = meta_learner.predict_proba(X_meta_test)[:, 1]
print(f"Blending AUC: {roc_auc_score(y_test, final_pred):.4f}")

Advanced Stacking with Out-of-Fold Predictions

from sklearn.model_selection import KFold

def stacking_oof(X, y, models, meta_model, n_folds=5):
    """Build out-of-fold (OOF) stacking features and fit a meta-learner on them.

    Every base model is fitted on the training part of each fold and predicts
    the positive-class probability for the held-out part, so each row's
    feature comes from a model that never saw that row.

    Args:
        X: Feature table; must support ``.iloc`` row indexing (pandas DataFrame).
        y: Labels aligned with the rows of ``X`` (array-like or pandas Series).
        models: List of ``(name, estimator)`` pairs. The estimators are fitted
            in place and are left fitted on the last fold's training rows.
        meta_model: Estimator with ``fit`` / ``predict_proba``; fitted in place
            on the full OOF matrix and returned.
        n_folds: Number of shuffled K-fold splits.

    Returns:
        Tuple ``(meta_learner, oof_predictions, score)`` where
        ``oof_predictions`` has shape ``(len(X), len(models))`` and ``score``
        is the ROC AUC of cross-validated meta-learner predictions.
    """
    # Local imports keep the function usable regardless of import order in the file
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import cross_val_predict

    # Positional indexing below requires a plain array; a pandas Series with a
    # non-default index would otherwise be looked up by label.
    y = np.asarray(y)

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    oof_predictions = np.zeros((len(X), len(models)))

    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train = y[train_idx]

        for i, (_name, model) in enumerate(models):
            model.fit(X_train, y_train)
            oof_predictions[val_idx, i] = model.predict_proba(X_val)[:, 1]

    # Score the meta-learner on rows it was not fitted on. Scoring the final
    # fitted meta-learner on its own training rows would be optimistic.
    # cross_val_predict works on clones, so meta_model itself is untouched here.
    meta_cv_pred = cross_val_predict(
        meta_model, oof_predictions, y, cv=kf, method='predict_proba'
    )[:, 1]
    score = roc_auc_score(y, meta_cv_pred)

    # Final meta-learner, trained on all OOF predictions
    meta_learner = meta_model
    meta_learner.fit(oof_predictions, y)

    return meta_learner, oof_predictions, score

# Base learners for the out-of-fold stacking run, as (name, estimator) pairs
rf_base = RandomForestClassifier(n_estimators=100, random_state=42)
gb_base = GradientBoostingClassifier(n_estimators=100, random_state=42)
xgb_base = xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')
models = [('rf', rf_base), ('gb', gb_base), ('xgb', xgb_base)]

# A default logistic regression acts as the meta-learner
oof_result = stacking_oof(X, y, models, LogisticRegression())
meta_learner, oof_preds, score = oof_result
print(f"OOF Stacking AUC: {score:.4f}")

Comparing All Approaches

# Per-fold AUC arrays from the cross-validation runs earlier in the file
cv_runs = [
    ('Random Forest', scores_rf),
    ('Gradient Boosting', scores_gb),
    ('XGBoost', scores_xgb),
    ('LightGBM', scores_lgb),
    ('Soft Voting', scores_vs),
    ('Stacking', scores_stack),
]
# Collapse each run to its mean AUC
results = {label: fold_scores.mean() for label, fold_scores in cv_runs}

# One row per model, best AUC first
results_df = (
    pd.DataFrame.from_dict(results, orient='index', columns=['AUC'])
    .sort_values('AUC', ascending=False)
)
print("Model Comparison:")
print(results_df.to_string())

Best Practices

Diversity is key – combine models with different inductive biases
Use OOF predictions – prevents leakage in stacking
XGBoost/LightGBM/CatBoost – start with these; they dominate tabular data
Don't over-ensemble – 3-5 well-tuned models often beat 20 mediocre ones
Weight by performance – give better models more influence in voting
Validate carefully – ensemble gains can be illusory without proper CV

Summary

Ensemble methods – bagging, boosting, stacking, and blending – are the most reliable way to maximize predictive performance. XGBoost, LightGBM, and CatBoost are the workhorses of tabular data, while stacking combines their strengths. Master these techniques and you'll consistently build top-performing models.