Imbalanced Data: SMOTE, Class Weights and Sampling

Imbalanced datasets are common in fraud detection, medical diagnosis, and anomaly detection. This lesson covers techniques to handle class imbalance.

The Imbalanced Data Problem

Resampling Techniques

import pandas as pd
import numpy as np
from sklearn.utils import resample

# Split data by class
df_majority = df[df['target'] == 0]
df_minority = df[df['target'] == 1]

# 1. Random Oversampling
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42
)
df_oversampled = pd.concat([df_majority, df_minority_upsampled])

# 2. Random Undersampling
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42
)
df_undersampled = pd.concat([df_majority_downsampled, df_minority])

# 3. Tomek Links (undersampling with cleaning)
from imblearn.under_sampling import TomekLinks
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_resample(X, y)

# 4. Edited Nearest Neighbors
from imblearn.under_sampling import EditedNearestNeighbors
enn = EditedNearestNeighbors()
X_resampled, y_resampled = enn.fit_resample(X, y)

SMOTE Algorithm

The SMOTE formula generates a synthetic sample along the line between a minority sample and one of its $k$ nearest neighbors:

x_{\text{new}} = x_i + \lambda \cdot (x_{nn} - x_i), \quad \lambda \sim U(0, 1)

from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE

# Basic SMOTE
smote = SMOTE(random_state=42, k_neighbors=5)
X_smote, y_smote = smote.fit_resample(X, y)

# Borderline SMOTE (focuses on decision boundary)
bsmote = BorderlineSMOTE(random_state=42)
X_bsmote, y_bsmote = bsmote.fit_resample(X, y)

# ADASYN (adaptive synthetic sampling)
adasyn = ADASYN(random_state=42)
X_adasyn, y_adasyn = adasyn.fit_resample(X, y)

# SMOTE with Tomek Links (combination)
from imblearn.combine import SMOTETomek
smt = SMOTETomek(random_state=42)
X_smt, y_smt = smt.fit_resample(X, y)

Class Weights

Class weights adjust the loss function to penalize minority class misclassifications more heavily:

w_c = \frac{N}{k \cdot N_c}

where $N$ is total samples, $N_c$ is samples in class $c$ , and $k$ is the number of classes.

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Method 1: 'balanced' parameter
model = LogisticRegression(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

# Method 2: Custom class weights
# Inversely proportional to class frequency
class_counts = np.bincount(y_train)
class_weights = len(y_train) / (2 * class_counts)
weight_dict = {0: class_weights[0], 1: class_weights[1]}

model = LogisticRegression(class_weight=weight_dict, random_state=42)

# Method 3: Sample weights in fit
sample_weights = np.where(y_train == 1, 10, 1)  # Higher weight for minority
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train, sample_weight=sample_weights)

# For Random Forest
rf = RandomForestClassifier(
    class_weight='balanced_subsample',  # Adjusts weights per bootstrap
    random_state=42
)

Evaluation Metrics

Key metrics for imbalanced data:

F_1 = 2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}

\text{MCC} = \frac{TP \cdot TN - FP \cdot FN}{\sqrt{(TP+FP)(TP+FN)(TN+FP)(TN+FN)}}

from sklearn.metrics import (
    classification_report, confusion_matrix,
    precision_recall_curve, roc_curve, auc,
    f1_score, roc_auc_score, average_precision_score,
    matthews_corrcoef, balanced_accuracy_score
)

# Avoid accuracy for imbalanced data
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Key metrics
print(classification_report(y_test, y_pred))

# F1 Score (harmonic mean of precision and recall)
f1 = f1_score(y_test, y_pred)

# AUROC (Area Under ROC Curve)
auroc = roc_auc_score(y_test, y_prob)

# AUPRC (Area Under Precision-Recall Curve)
auprc = average_precision_score(y_test, y_prob)

# Matthews Correlation Coefficient
mcc = matthews_corrcoef(y_test, y_pred)

# Balanced Accuracy
bal_acc = balanced_accuracy_score(y_test, y_pred)

# Visualize metrics
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_prob)
axes[0].plot(fpr, tpr, label=f'AUROC = {auroc:.3f}')
axes[0].plot([0, 1], [0, 1], 'k--')
axes[0].set_xlabel('False Positive Rate')
axes[0].set_ylabel('True Positive Rate')
axes[0].set_title('ROC Curve')
axes[0].legend()

# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, y_prob)
axes[1].plot(recall, precision, label=f'AUPRC = {auprc:.3f}')
axes[1].set_xlabel('Recall')
axes[1].set_ylabel('Precision')
axes[1].set_title('Precision-Recall Curve')
axes[1].legend()

plt.tight_layout()
plt.show()

Threshold Optimization

from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

# Find threshold that maximizes F1
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
optimal_threshold = thresholds[np.argmax(f1_scores)]

# Apply custom threshold
y_pred_custom = (y_prob >= optimal_threshold).astype(int)

print(f"Optimal threshold: {optimal_threshold:.3f}")
print(f"F1 at optimal threshold: {f1_score(y_test, y_pred_custom):.3f}")

# Cost-sensitive threshold (if misclassification costs differ)
cost_fp = 1  # Cost of false positive
cost_fn = 10  # Cost of false negative (higher for rare class)

total_costs = (precision * cost_fp + (1 - recall) * cost_fn)
optimal_cost_threshold = thresholds[np.argmin(total_costs)]

Ensemble Methods for Imbalanced Data

from imblearn.ensemble import (
    BalancedRandomForestClassifier,
    BalancedBaggingClassifier,
    EasyEnsembleClassifier,
    RUSBoostClassifier
)

# Balanced Random Forest
brf = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=42
)
brf.fit(X_train, y_train)

# Balanced Bagging
bbc = BalancedBaggingClassifier(
    n_estimators=10,
    random_state=42
)
bbc.fit(X_train, y_train)

# Easy Ensemble
ee = EasyEnsembleClassifier(
    n_estimators=10,
    random_state=42
)
ee.fit(X_train, y_train)

# RUSBoost
rusboost = RUSBoostClassifier(
    n_estimators=100,
    random_state=42
)
rusboost.fit(X_train, y_train)

Key Takeaways

Never use accuracy alone for imbalanced problems
Use SMOTE or class weights as first approaches
Optimize decision threshold for your cost function
Consider ensemble methods like BalancedRandomForest
Always evaluate with AUPRC for severely imbalanced data

Imbalanced Data: SMOTE, Class Weights and Sampling

Imbalanced Data: SMOTE, Class Weights and Sampling

The Imbalanced Data Problem

Resampling Techniques

SMOTE Algorithm

Class Weights

Evaluation Metrics

Threshold Optimization

Ensemble Methods for Imbalanced Data

Key Takeaways

Premium Content

Need Expert Data Science Help?