Tabular Deep Learning

Tabular data is everywhere. Learn when deep learning beats gradient boosting and how to apply modern architectures like TabNet and FT-Transformer.

When to Use Deep Learning for Tabular Data

TabNet: Attentive Interpretable Tabular Learning

import torch
import torch.nn as nn
import torch.nn.functional as F

class AttentiveTransformer(nn.Module):
    """Produce a sparse feature-selection mask for one decision step.

    The input is passed through a linear layer and batch normalisation,
    converted to a sparse probability vector with sparsemax, and finally
    multiplied element-wise by the tensor supplied by the caller.

    Args:
        input_dim: Width of the tensor given to ``forward`` as ``x``.
        output_dim: Number of mask entries produced per row.
        relaxation_factor: Positive divisor applied to the logits before
            sparsemax; larger values flatten the logits and therefore
            produce less sparse masks.
    """

    def __init__(self, input_dim, output_dim, relaxation_factor=1.5):
        super().__init__()
        self.fc = nn.Linear(input_dim, output_dim)
        self.bn = nn.BatchNorm1d(output_dim)
        self.relaxation_factor = relaxation_factor

    def forward(self, x, processed_features):
        """Return the sparsemax mask of ``x`` scaled by ``processed_features``.

        Args:
            x: Tensor whose last dimension is ``input_dim``.
            processed_features: Tensor broadcastable against the mask; it is
                multiplied element-wise with the mask.
        """
        logits = self.bn(self.fc(x))

        # Sparsemax attention
        mask = self._sparsemax(logits, self.relaxation_factor)

        return mask * processed_features

    def _sparsemax(self, x, relaxation_factor):
        """Sparsemax activation for feature selection.

        Projects each row of ``x / relaxation_factor`` onto the probability
        simplex. Unlike a ReLU-and-normalise shortcut, every output row sums
        to one (even when all logits are negative) and low-scoring entries
        become exactly zero.
        """
        z = x / relaxation_factor

        # Sort descending and find the largest k with 1 + k * z_(k) > sum_{j<=k} z_(j).
        z_sorted, _ = torch.sort(z, dim=-1, descending=True)
        z_cumsum = z_sorted.cumsum(dim=-1)
        ranks = torch.arange(1, z.size(-1) + 1, device=z.device, dtype=z.dtype)
        in_support = 1 + ranks * z_sorted > z_cumsum

        # The support is always a prefix of the sorted row and is never empty
        # (rank 1 satisfies the condition), so its size is the count of True.
        support_size = in_support.sum(dim=-1, keepdim=True)

        # Threshold that makes the clipped row sum to exactly one.
        tau = (z_cumsum.gather(-1, support_size - 1) - 1) / support_size.to(z.dtype)
        return torch.clamp(z - tau, min=0)

class FeatureTransformer(nn.Module):
    """Optional shared block followed by Linear -> BatchNorm1d -> ReLU.

    Args:
        input_dim: Width of the tensor given to ``forward``. It is used as
            the input width of the step-specific linear layer unless a
            ``shared`` module changes the width first (see below).
        output_dim: Width of the returned tensor.
        shared: Optional callable applied to the input before the
            step-specific layers. When it is an ``nn.Module`` containing
            layers with an integer ``out_features`` attribute (such as
            ``nn.Linear``), the last such value is taken as the width the
            step-specific linear layer must accept.
    """

    def __init__(self, input_dim, output_dim, shared=None):
        super().__init__()
        self.shared = shared
        # ``fc`` runs on the output of ``shared``, so it must be sized for
        # that output rather than for the raw input.
        self.fc = nn.Linear(self._width_after_shared(shared, input_dim), output_dim)
        self.bn = nn.BatchNorm1d(output_dim)

    @staticmethod
    def _width_after_shared(shared, default):
        """Return the feature width produced by ``shared``, or ``default`` if unknown."""
        if not isinstance(shared, nn.Module):
            return default
        width = default
        for layer in shared.modules():
            out_features = getattr(layer, "out_features", None)
            if isinstance(out_features, int):
                width = out_features
        return width

    def forward(self, x):
        """Apply the shared block (if any), then Linear, BatchNorm1d and ReLU."""
        if self.shared is not None:
            x = self.shared(x)
        x = self.fc(x)
        x = self.bn(x)
        return F.relu(x)

class TabNet(nn.Module):
    """Multi-step masked feature network with a linear output head.

    At every step the input is multiplied by the current attention mask,
    passed through that step's feature transformer, and the result is both
    added to a running sum and used to compute the next mask. The running
    sum is mapped to ``output_dim`` values by a final linear layer.

    Args:
        input_dim: Number of input features per row.
        output_dim: Number of values produced per row.
        n_steps: Number of decision steps.
        relaxation_factor: Forwarded to every ``AttentiveTransformer``.
        hidden_dim: Width of each step's transformed representation.
        n_independent: Accepted for interface compatibility; not used.
        n_shared: Steps with index below this value route their input
            through one shared Linear/BatchNorm/ReLU block first.
    """

    def __init__(self, input_dim, output_dim, n_steps=3, relaxation_factor=1.5,
                 hidden_dim=64, n_independent=2, n_shared=2):
        super().__init__()
        self.n_steps = n_steps
        self.hidden_dim = hidden_dim

        # Shared feature transformer
        shared_layers = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU()
        )

        # Step-specific transformers
        step_transformers = []
        for i in range(n_steps):
            if i < n_shared:
                # The step's own linear layer runs on the shared block's
                # output, so it must accept hidden_dim features, not input_dim.
                step_transformers.append(
                    FeatureTransformer(hidden_dim, hidden_dim, shared_layers)
                )
            else:
                step_transformers.append(
                    FeatureTransformer(input_dim, hidden_dim, None)
                )
        self.feature_transformers = nn.ModuleList(step_transformers)

        # Attention
        self.attention_maps = nn.ModuleList([
            AttentiveTransformer(hidden_dim, input_dim, relaxation_factor)
            for _ in range(n_steps)
        ])

        # Final output
        self.final_fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Run all decision steps on ``x`` of shape (batch, input_dim).

        Returns:
            Tensor of shape (batch, output_dim).
        """
        batch_size = x.size(0)

        # Initial decision: empty aggregate and an all-ones mask.
        aggregated = torch.zeros(batch_size, self.hidden_dim, device=x.device)
        attention = torch.ones(batch_size, x.size(1), device=x.device)

        for step in range(self.n_steps):
            # Transform the masked features
            transformed = self.feature_transformers[step](x * attention)

            # Next mask is derived from this step's output and the current mask
            attention = self.attention_maps[step](transformed, attention)

            # Aggregate
            aggregated = aggregated + transformed

        return self.final_fc(aggregated)

# Training TabNet
def train_tabnet(model, train_loader, val_loader, epochs=100,
                 checkpoint_path='best_tabnet.pth'):
    """Train a classifier and checkpoint the best validation accuracy.

    Uses Adam (lr=1e-3, weight_decay=1e-5), cosine learning-rate annealing
    over ``epochs`` and cross-entropy loss. After every epoch the model is
    evaluated on ``val_loader``; whenever the accuracy strictly improves,
    the model's ``state_dict`` is written to ``checkpoint_path``. A progress
    line is printed every tenth epoch.

    Args:
        model: Module mapping a feature batch to class logits.
        train_loader: Iterable yielding ``(X_batch, y_batch)`` pairs.
        val_loader: Iterable yielding ``(X_batch, y_batch)`` pairs.
        epochs: Number of passes over ``train_loader``.
        checkpoint_path: Destination for the best ``state_dict``.

    Returns:
        The best validation accuracy observed (0.0 if it never exceeded zero).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    criterion = nn.CrossEntropyLoss()

    best_val_acc = 0.0

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        # Count batches ourselves so loaders without __len__ work and an
        # empty loader cannot cause a division by zero when reporting.
        n_batches = 0

        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            n_batches += 1

        scheduler.step()

        # Validation
        model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                predicted = model(X_batch).argmax(dim=1)
                total += y_batch.size(0)
                correct += (predicted == y_batch).sum().item()

        # An empty validation loader counts as zero accuracy instead of crashing.
        val_acc = correct / total if total else 0.0

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), checkpoint_path)

        if (epoch + 1) % 10 == 0:
            mean_loss = train_loss / n_batches if n_batches else 0.0
            print(f"Epoch {epoch+1}: Loss={mean_loss:.4f}, Val Acc={val_acc:.4f}")

    return best_val_acc

FT-Transformer (Feature Tokenizer + Transformer)

import torch
import torch.nn as nn

class FeatureTokenizer(nn.Module):
    """Turn each numeric feature into one ``embed_dim``-sized token.

    Every feature owns a single learned embedding vector. The token for a
    feature is that vector plus the feature's value scaled by 0.1 (the same
    scalar is added to every embedding dimension).

    Args:
        num_features: Number of leading input columns to tokenize.
        embed_dim: Size of each token.
    """

    def __init__(self, num_features, embed_dim):
        super().__init__()
        self.embeddings = nn.ModuleList([
            nn.Embedding(1, embed_dim) for _ in range(num_features)
        ])
        self.num_features = num_features

    def forward(self, x):
        """Tokenize ``x`` of shape (batch, >= num_features).

        Returns:
            Tensor of shape (batch, num_features, embed_dim).

        Raises:
            ValueError: If ``x`` has fewer than ``num_features`` columns.
        """
        if x.size(1) < self.num_features:
            raise ValueError(
                f"expected at least {self.num_features} feature columns, got {x.size(1)}"
            )

        # Each embedding table has exactly one row, so read it directly.
        # Looking it up with a float index tensor (torch.zeros defaults to
        # float32) is rejected by nn.Embedding.
        per_feature = torch.stack([emb.weight[0] for emb in self.embeddings], dim=0)

        # (batch, num_features, 1) broadcast against (1, num_features, embed_dim)
        values = x[:, :self.num_features].unsqueeze(-1)
        return per_feature.unsqueeze(0) + values * 0.1  # Simple scaling

class FTTransformer(nn.Module):
    """Transformer encoder over per-feature tokens with a CLS readout.

    Features are tokenized, a learned CLS token is prepended, a learned
    positional parameter is added, and the sequence is run through a
    Transformer encoder. The layer-normalised CLS position is mapped to
    ``num_classes`` outputs by a linear head.

    Args:
        num_features: Number of input features (one token each).
        num_classes: Size of the output vector.
        embed_dim: Token width and Transformer model width.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.
        dropout: Dropout rate inside the encoder layers.
    """

    def __init__(self, num_features, num_classes, embed_dim=192,
                 num_heads=8, num_layers=6, dropout=0.1):
        super().__init__()

        # One token per input feature.
        self.tokenizer = FeatureTokenizer(num_features, embed_dim)

        # Learned CLS token and learned per-position offsets (CLS + features).
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.randn(1, num_features + 1, embed_dim))

        # Encoder stack; the feed-forward width is four times the model width.
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=embed_dim,
                nhead=num_heads,
                dim_feedforward=embed_dim * 4,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=num_layers,
        )

        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return one ``num_classes``-sized output row per row of ``x``."""
        n_rows = x.size(0)

        feature_tokens = self.tokenizer(x)

        # Prepend the CLS token to every row, then add positional offsets.
        cls = self.cls_token.expand(n_rows, -1, -1)
        sequence = torch.cat([cls, feature_tokens], dim=1) + self.pos_embed

        encoded = self.norm(self.transformer(sequence))

        # Position 0 holds the CLS token, which serves as the row summary.
        return self.head(encoded[:, 0])

Wide & Deep Learning

import torch
import torch.nn as nn

class WideAndDeep(nn.Module):
    """Sum of a linear ("wide") model and an embedding MLP ("deep") model.

    The wide path is a single linear layer on the continuous features. The
    deep path concatenates the continuous features with one embedding per
    categorical column and feeds them through Linear/BatchNorm/ReLU/Dropout
    blocks followed by a linear output layer.

    Args:
        num_features: Number of continuous input features.
        num_categories_per_feature: Cardinality of each categorical column;
            each embedding table gets one row more than the stated value.
        embed_dim: Embedding size used for every categorical column.
        hidden_dims: Widths of the hidden layers of the deep path.
        num_classes: Size of the output vector.
        dropout: Dropout rate after every hidden layer of the deep path.
    """

    def __init__(self, num_features, num_categories_per_feature,
                 embed_dim=16, hidden_dims=(256, 128, 64), num_classes=2,
                 dropout=0.3):
        super().__init__()

        # Wide component (linear)
        self.wide = nn.Linear(num_features, num_classes)

        # Deep component (embeddings + MLP)
        self.embeddings = nn.ModuleList([
            nn.Embedding(cat + 1, embed_dim) for cat in num_categories_per_feature
        ])

        deep_input_dim = num_features + len(num_categories_per_feature) * embed_dim

        layers = []
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(deep_input_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            deep_input_dim = hidden_dim

        layers.append(nn.Linear(deep_input_dim, num_classes))
        self.deep = nn.Sequential(*layers)

    def forward(self, x_continuous, x_categorical):
        """Combine both paths.

        Args:
            x_continuous: Tensor of shape (batch, num_features).
            x_categorical: Integer index tensor with one column per
                categorical feature, in the order given at construction.

        Returns:
            Tensor of shape (batch, num_classes): wide output plus deep output.
        """
        # Wide path
        wide_out = self.wide(x_continuous)

        # Deep path: embed column i with table i, then concatenate everything.
        embeds = [emb(x_categorical[:, i]) for i, emb in enumerate(self.embeddings)]
        deep_input = torch.cat([x_continuous] + embeds, dim=1)
        deep_out = self.deep(deep_input)

        # Combine
        return wide_out + deep_out

# Usage example: ten continuous inputs plus four categorical columns
num_continuous = 10
num_categorical_features = [100, 50, 200, 30]  # number of categories in each categorical column
model = WideAndDeep(
    num_features=num_continuous,
    num_categories_per_feature=num_categorical_features,
)

Model Comparison

import time
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def _score_predictions(y_true, y_pred, train_time):
    """Bundle accuracy, weighted F1 and training time into one result dict."""
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
        'train_time': train_time
    }


def benchmark_models(X_train, X_test, y_train, y_test, cat_features,
                     X_val=None, y_val=None):
    """Compare gradient boosting vs deep learning.

    Trains a LightGBM multiclass model and a ``TabNetClassifier`` and scores
    both on the test set.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used only for scoring.
        cat_features: Accepted for interface compatibility; not used by
            either model.
        X_val, y_val: Optional validation data for TabNet early stopping.
            When either is omitted, 20% of the training data is held out
            for that purpose, so TabNet is fitted on the remaining 80%
            while LightGBM still sees the full training set.

    Returns:
        Dict keyed by model name ('LightGBM', 'TabNet'), each holding
        'accuracy', 'f1' (weighted) and 'train_time' in seconds.
    """
    results = {}

    # LightGBM (baseline)
    import lightgbm as lgb
    start = time.time()

    train_data = lgb.Dataset(X_train, label=y_train)
    params = {'objective': 'multiclass', 'num_class': len(np.unique(y_train)), 'verbose': -1}
    model = lgb.train(params, train_data, num_boost_round=100)
    lgb_time = time.time() - start

    lgb_pred = model.predict(X_test).argmax(axis=1)
    results['LightGBM'] = _score_predictions(y_test, lgb_pred, lgb_time)

    # TabNet
    import torch
    from pytorch_tabnet.tab_model import TabNetClassifier

    # Early stopping must not look at the test set, otherwise the reported
    # test metrics are optimistically biased.
    if X_val is None or y_val is None:
        from sklearn.model_selection import train_test_split
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_train, y_train, test_size=0.2, random_state=0
        )
    else:
        X_fit, y_fit = X_train, y_train

    start = time.time()

    tabnet = TabNetClassifier(
        n_d=64, n_a=64,
        n_steps=3,
        optimizer_params=dict(lr=2e-2),
        # scheduler_params only take effect when a scheduler class is given.
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        scheduler_params={"step_size": 50, "gamma": 0.9},
        mask_type='entmax'
    )
    tabnet.fit(
        X_fit, y_fit,
        eval_set=[(X_val, y_val)],
        max_epochs=100,
        patience=10
    )
    tabnet_time = time.time() - start

    tabnet_pred = tabnet.predict(X_test)
    results['TabNet'] = _score_predictions(y_test, tabnet_pred, tabnet_time)

    return results

Best Practices

Start with LightGBM – it's the baseline for tabular data
Use DL when you have 100K+ samples and complex feature interactions
Feature tokenization is key for transformer-based tabular models
Embed categorical features rather than one-hot encoding for DL
Ensemble gradient boosting with DL for best performance