Rare Events and Imbalanced Learning

Fraud detection, disease diagnosis, defect identification – the rare class is often the most important. Standard models optimize for accuracy and ignore the minority class. Imbalanced learning techniques fix this by rebalancing the problem.

The Imbalanced Data Problem

When 99% of transactions are legitimate, a model that predicts "legitimate" for everything achieves 99% accuracy while catching zero fraud. Accuracy is meaningless here.

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, roc_auc_score, average_precision_score,
    precision_recall_curve, f1_score, confusion_matrix
)
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier
import warnings
warnings.filterwarnings('ignore')

Generate Imbalanced Dataset

# Synthetic binary problem: 10,000 rows and 20 features, with class weights
# of 97% / 3% so class 1 is the rare class. flip_y=0.01 adds label noise.
X, y = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    weights=[0.97, 0.03],
    flip_y=0.01,
    random_state=42,
)

# Hold out 20% for testing; stratify so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Report how skewed the full dataset is.
print("Class distribution:")
for class_label, class_role in ((0, "majority"), (1, "minority")):
    class_mask = y == class_label
    print(f"  Class {class_label} ({class_role}): {class_mask.sum()} ({class_mask.mean():.1%})")
class_counts = np.bincount(y)
print(f"  Imbalance ratio: {class_counts[0] / class_counts[1]:.1f}:1")

Baseline Model (No Resampling)

# Baseline: plain logistic regression fit on the untouched, imbalanced
# training split -- no resampling and no class weighting.
lr = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Hard labels use the default 0.5 cutoff; column 1 holds P(class 1).
y_pred = lr.predict(X_test)
y_prob = lr.predict_proba(X_test)[:, 1]

print("Baseline Logistic Regression:")
print(classification_report(y_test, y_pred))
baseline_roc_auc = roc_auc_score(y_test, y_prob)
baseline_avg_precision = average_precision_score(y_test, y_prob)
print(f"ROC-AUC: {baseline_roc_auc:.4f}")
print(f"Average Precision: {baseline_avg_precision:.4f}")

Resampling Techniques

Oversampling: SMOTE

SMOTE creates synthetic minority samples by interpolating between existing ones. For a minority sample $x_i$, SMOTE selects one of its $k$ nearest minority-class neighbors $x_{nn}$ and generates:

$$x_{new} = x_i + \lambda \cdot (x_{nn} - x_i)$$

where $\lambda \sim \text{Uniform}(0, 1)$. Each synthetic sample therefore lies on the line segment connecting two minority instances, filling in the minority region of feature space instead of merely duplicating existing points.

# Oversample the minority class with SMOTE. With a float sampling_strategy,
# the minority class is grown until it is 0.5x the size of the majority class.
# Only the training split is resampled; the test split is left untouched.
smote = SMOTE(k_neighbors=5, sampling_strategy=0.5, random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# Per-class counts before and after resampling.
for smote_stage, smote_labels in (("Before", y_train), ("After", y_smote)):
    print(f"{smote_stage} SMOTE: {np.bincount(smote_labels)}")

# Fit on the resampled data, evaluate on the original test distribution.
lr_smote = LogisticRegression(random_state=42, max_iter=1000).fit(X_smote, y_smote)
y_pred_smote = lr_smote.predict(X_test)
y_prob_smote = lr_smote.predict_proba(X_test)[:, 1]

print("\nAfter SMOTE:")
print(classification_report(y_test, y_pred_smote))
smote_roc_auc = roc_auc_score(y_test, y_prob_smote)
print(f"ROC-AUC: {smote_roc_auc:.4f}")

ADASYN

ADASYN focuses on harder-to-learn minority examples by generating more samples near the decision boundary.

# ADASYN oversampling of the training split, targeting a 0.5
# minority-to-majority ratio.
adasyn = ADASYN(random_state=42, sampling_strategy=0.5)
X_adasyn, y_adasyn = adasyn.fit_resample(X_train, y_train)

# Every row beyond the original training size is a synthetic sample.
n_adasyn_added = y_adasyn.shape[0] - y_train.shape[0]
print(f"ADASYN generated: {n_adasyn_added} new samples")

# Fit on the resampled data, score on the untouched test split.
lr_adasyn = LogisticRegression(random_state=42, max_iter=1000).fit(X_adasyn, y_adasyn)
y_prob_adasyn = lr_adasyn.predict_proba(X_test)[:, 1]

adasyn_roc_auc = roc_auc_score(y_test, y_prob_adasyn)
print(f"ADASYN ROC-AUC: {adasyn_roc_auc:.4f}")

Borderline-SMOTE

Borderline-SMOTE only synthesizes samples from minority points that lie near the decision boundary.

# Borderline-SMOTE oversampling of the training split, targeting a 0.5
# minority-to-majority ratio.
bsmote = BorderlineSMOTE(random_state=42, sampling_strategy=0.5)
X_bsmote, y_bsmote = bsmote.fit_resample(X_train, y_train)

# Fit on the resampled data, score on the untouched test split.
lr_bsmote = LogisticRegression(random_state=42, max_iter=1000).fit(X_bsmote, y_bsmote)
y_prob_bsmote = lr_bsmote.predict_proba(X_test)[:, 1]

bsmote_roc_auc = roc_auc_score(y_test, y_prob_bsmote)
print(f"Borderline-SMOTE ROC-AUC: {bsmote_roc_auc:.4f}")

Undersampling

n_train_rows = X_train.shape[0]

# Random undersampling: drop majority rows at random until the
# minority-to-majority ratio reaches 0.5.
rus = RandomUnderSampler(random_state=42, sampling_strategy=0.5)
X_rus, y_rus = rus.fit_resample(X_train, y_train)
print(f"Random undersampling: {X_rus.shape[0]} samples (was {n_train_rows})")

# Tomek links: a cleaning step that removes majority samples which form
# a Tomek pair with a minority sample.
tomek = TomekLinks()
X_tomek, y_tomek = tomek.fit_resample(X_train, y_train)
print(f"Tomek links removed: {n_train_rows - X_tomek.shape[0]} majority samples")

# Combined approach: SMOTE oversampling followed by Tomek-link cleaning.
smt = SMOTETomek(random_state=42)
X_smt, y_smt = smt.fit_resample(X_train, y_train)
print(f"SMOTE+Tomek: {X_smt.shape[0]} samples")

Cost-Sensitive Learning

Penalize misclassification of the minority class more heavily.

# Automatic class weights: 'balanced' weights each class inversely to its
# frequency in y_train, so minority errors cost more during fitting.
lr_weighted = LogisticRegression(
    random_state=42, max_iter=1000, class_weight='balanced'
).fit(X_train, y_train)
y_prob_weighted = lr_weighted.predict_proba(X_test)[:, 1]
weighted_roc_auc = roc_auc_score(y_test, y_prob_weighted)
print(f"Weighted LR ROC-AUC: {weighted_roc_auc:.4f}")

# Hand-picked class weights: a minority (class 1) error costs 10x a
# majority (class 0) error.
weights = {0: 1, 1: 10}
lr_manual = LogisticRegression(
    random_state=42, max_iter=1000, class_weight=weights
).fit(X_train, y_train)
y_prob_manual = lr_manual.predict_proba(X_test)[:, 1]
manual_roc_auc = roc_auc_score(y_test, y_prob_manual)
print(f"Manual weights ROC-AUC: {manual_roc_auc:.4f}")

# The same 'balanced' weighting applied to a random forest.
rf_weighted = RandomForestClassifier(
    random_state=42, class_weight='balanced', n_estimators=100
).fit(X_train, y_train)
y_prob_rf = rf_weighted.predict_proba(X_test)[:, 1]
rf_roc_auc = roc_auc_score(y_test, y_prob_rf)
print(f"Weighted RF ROC-AUC: {rf_roc_auc:.4f}")

Threshold Tuning

The default 0.5 threshold is rarely optimal for imbalanced data.

def find_optimal_threshold(y_true, y_prob, metric='f1'):
    """Find the probability cutoff that maximizes a classification metric.

    Candidate thresholds run from 0.10 up to (but not including) 0.90 in
    steps of 0.01. For each candidate, samples with ``y_prob >= threshold``
    are labelled 1 and all others 0, and the metric is computed against
    ``y_true``.

    Args:
        y_true: Ground-truth binary labels (array-like of 0/1).
        y_prob: Predicted positive-class probabilities (array-like, same
            length as ``y_true``).
        metric: Metric to maximize; one of ``'f1'``, ``'precision'`` or
            ``'recall'``.

    Returns:
        A ``(threshold, score)`` tuple for the best candidate. If several
        thresholds tie, the lowest one is returned.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    supported_metrics = ('f1', 'precision', 'recall')
    # Fail fast: previously an unknown metric left `score` unassigned and
    # surfaced as an unrelated error inside the loop.
    if metric not in supported_metrics:
        raise ValueError(
            f"Unsupported metric {metric!r}; expected one of {supported_metrics}"
        )

    # Imported locally so the function does not depend on where in the file
    # precision_score / recall_score happen to be imported.
    from sklearn.metrics import f1_score, precision_score, recall_score

    scorers = {
        'f1': f1_score,
        'precision': precision_score,
        'recall': recall_score,
    }
    score_fn = scorers[metric]

    # Accept plain lists as well as arrays for the elementwise comparison.
    y_prob = np.asarray(y_prob)
    thresholds = np.arange(0.1, 0.9, 0.01)
    scores = [
        score_fn(y_true, (y_prob >= thresh).astype(int))
        for thresh in thresholds
    ]

    # argmax returns the first maximum, i.e. the lowest threshold on ties.
    best_idx = int(np.argmax(scores))
    return thresholds[best_idx], scores[best_idx]

from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import cross_val_predict

# Select the threshold on training data only. Tuning it on y_test and then
# reporting on the same y_test leaks test labels into model selection and
# makes the report below optimistic. Out-of-fold probabilities give an
# unbiased view of how the weighted model scores samples it was not fit on.
oof_prob_weighted = cross_val_predict(
    LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    X_train, y_train, cv=5, method='predict_proba'
)[:, 1]

# best_f1 is the cross-validated F1 on the training split, not a test score.
best_thresh, best_f1 = find_optimal_threshold(y_train, oof_prob_weighted, 'f1')
print(f"Optimal threshold (F1): {best_thresh:.2f}")
print(f"F1 at optimal threshold: {best_f1:.4f}")

# Apply the tuned threshold to the held-out test probabilities.
y_pred_optimal = (y_prob_weighted >= best_thresh).astype(int)
print("\nClassification report at optimal threshold:")
print(classification_report(y_test, y_pred_optimal))

Evaluation Metrics for Imbalanced Data

from sklearn.metrics import (
    auc,
    cohen_kappa_score,
    matthews_corrcoef,
    precision_recall_curve,
)

# Area under the precision-recall curve of the class-weighted model,
# integrated with recall on the x-axis.
precision, recall, thresholds = precision_recall_curve(y_test, y_prob_weighted)
pr_auc = auc(recall, precision)
print(f"PR-AUC: {pr_auc:.4f}")

# Matthews correlation coefficient of the thresholded predictions.
mcc = matthews_corrcoef(y_test, y_pred_optimal)
print(f"MCC: {mcc:.4f}")

# Cohen's kappa of the thresholded predictions.
kappa = cohen_kappa_score(y_test, y_pred_optimal)
print(f"Cohen's Kappa: {kappa:.4f}")

# Confusion matrix: rows are true classes, columns are predicted classes,
# so flattening row by row yields TN, FP, FN, TP.
cm = confusion_matrix(y_test, y_pred_optimal)
cm_tn, cm_fp, cm_fn, cm_tp = cm.ravel()
print("\nConfusion Matrix:")
print(f"  TN={cm_tn}, FP={cm_fp}")
print(f"  FN={cm_fn}, TP={cm_tp}")

Ensemble Methods for Imbalance

# Balanced random forest: 100 trees, resampling all classes
# (sampling_strategy='all') with replacement.
brf = BalancedRandomForestClassifier(
    random_state=42,
    n_estimators=100,
    replacement=True,
    sampling_strategy='all',
)
brf.fit(X_train, y_train)
y_prob_brf = brf.predict_proba(X_test)[:, 1]
brf_roc_auc = roc_auc_score(y_test, y_prob_brf)
print(f"Balanced RF ROC-AUC: {brf_roc_auc:.4f}")

# Balanced bagging: 10 base estimators, each fit on a rebalanced subset
# drawn without replacement.
bb = BalancedBaggingClassifier(
    random_state=42,
    n_estimators=10,
    replacement=False,
    sampling_strategy='auto',
)
bb.fit(X_train, y_train)
y_prob_bb = bb.predict_proba(X_test)[:, 1]
bb_roc_auc = roc_auc_score(y_test, y_prob_bb)
print(f"Balanced Bagging ROC-AUC: {bb_roc_auc:.4f}")

EasyEnsemble and BalanceCascade

from imblearn.ensemble import EasyEnsembleClassifier, BalancedBaggingClassifier

# EasyEnsemble – bagging with undersampling: 10 learners, each fit on a
# balanced subset obtained by randomly undersampling the majority class.
# NOTE: the keyword is `estimator`; the older `base_estimator` spelling was
# deprecated in imbalanced-learn 0.10 and removed in 0.12, where passing it
# raises a TypeError.
ee = EasyEnsembleClassifier(
    n_estimators=10, random_state=42,
    estimator=LogisticRegression(max_iter=1000)
)
ee.fit(X_train, y_train)
y_prob_ee = ee.predict_proba(X_test)[:, 1]
print(f"EasyEnsemble ROC-AUC: {roc_auc_score(y_test, y_prob_ee):.4f}")

Best Practices

- Use PR-AUC, not ROC-AUC – PR-AUC focuses on the minority class
- Tune the threshold – 0.5 is rarely optimal
- Combine over/under sampling – SMOTE+Tomek works well
- Cost-sensitive learning – simpler than resampling, often equally effective
- Ensemble methods – BalancedRandomForest and EasyEnsemble are strong baselines
- Evaluate on the original distribution – resampled data is for training only

Summary

Imbalanced learning requires special treatment. SMOTE and ADASYN generate minority samples, cost-sensitive learning adjusts penalties, and threshold tuning optimizes the decision boundary. Always use PR-AUC and F1 instead of accuracy for evaluation.