LLM Model Distillation: Knowledge Distillation and Compression
Model distillation reduces the size and computational cost of LLMs while preserving performance. Production systems use distillation to create smaller, faster models suitable for edge deployment and cost-sensitive applications.
Distillation Pipeline
Distillation Techniques
1. Knowledge Distillation
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional
class DistillationLoss(nn.Module):
    """Knowledge-distillation loss: softened KL term plus hard-label cross-entropy.

    The returned value is::

        alpha * T**2 * KL(teacher_T || student_T) + (1 - alpha) * CE(student, labels)

    where ``*_T`` are softmax distributions of the logits divided by the
    temperature ``T``.

    Args:
        temperature: Softening temperature; must be strictly positive.
        alpha: Weight of the soft (KL) term, in ``[0, 1]``. The hard
            cross-entropy term gets ``1 - alpha``.

    Raises:
        ValueError: If ``temperature <= 0`` or ``alpha`` is outside ``[0, 1]``.
    """

    def __init__(self, temperature: float = 2.0, alpha: float = 0.5):
        super().__init__()
        if temperature <= 0:
            raise ValueError(f"temperature must be > 0, got {temperature}")
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f"alpha must be in [0, 1], got {alpha}")
        self.temperature = temperature
        self.alpha = alpha

    def forward(self, student_logits: torch.Tensor,
                teacher_logits: torch.Tensor,
                labels: torch.Tensor) -> torch.Tensor:
        """Compute the blended distillation loss.

        Logits may have any number of leading dimensions, e.g.
        ``(batch, classes)`` or ``(batch, seq, classes)``; the class axis must
        be last. All leading dimensions are flattened so both terms are
        averaged over every position.

        Args:
            student_logits: Student logits, shape ``(..., classes)``.
            teacher_logits: Teacher logits, same shape as ``student_logits``.
            labels: Integer class indices of shape ``(...)``, or floating-point
                class probabilities of shape ``(..., classes)``.

        Returns:
            Scalar loss tensor.
        """
        # Flatten to (N, classes): F.cross_entropy expects the class axis at
        # dim 1, and "batchmean" divides by size(0), so without flattening a
        # (batch, seq, classes) input would either fail or be normalised by
        # batch only.
        student_flat = student_logits.reshape(-1, student_logits.size(-1))
        teacher_flat = teacher_logits.reshape(-1, teacher_logits.size(-1))
        if labels.is_floating_point():
            # Soft (probability) targets keep their class axis.
            labels_flat = labels.reshape(-1, student_logits.size(-1))
        else:
            labels_flat = labels.reshape(-1)

        # F.kl_div takes log-probabilities as input and probabilities as
        # target, which yields KL(teacher || student).
        soft_loss = F.kl_div(
            F.log_softmax(student_flat / self.temperature, dim=-1),
            F.softmax(teacher_flat / self.temperature, dim=-1),
            reduction="batchmean",
        ) * (self.temperature ** 2)  # T^2 offsets the 1/T^2 gradient scaling
        hard_loss = F.cross_entropy(student_flat, labels_flat)
        return self.alpha * soft_loss + (1 - self.alpha) * hard_loss
class DistillationTrainer:
    """Run distillation steps that train a student to match a teacher.

    The teacher is put in eval mode at construction and is only ever called
    under ``torch.no_grad()``; the student is put in train mode on every step.

    Args:
        teacher: Model producing the reference logits.
        student: Model being trained.
        temperature: Passed through to ``DistillationLoss``.
        alpha: Passed through to ``DistillationLoss``.
    """

    def __init__(self, teacher: nn.Module, student: nn.Module,
                 temperature: float = 2.0, alpha: float = 0.5):
        self.teacher = teacher
        self.student = student
        self.loss_fn = DistillationLoss(temperature, alpha)
        self.teacher.eval()

    @staticmethod
    def _logits(output):
        """Return ``output.logits`` when present, otherwise ``output`` itself.

        Lets the trainer work both with models that return a raw logits tensor
        and with models that return an output object carrying a ``logits``
        attribute.
        """
        return getattr(output, "logits", output)

    def train_step(self, batch: dict, optimizer) -> float:
        """Perform one optimisation step on the student.

        Args:
            batch: Mapping with ``"input_ids"`` (fed to both models) and
                ``"labels"`` (fed to the loss).
            optimizer: Optimizer over the student's parameters; must provide
                ``zero_grad()`` and ``step()``.

        Returns:
            The loss for this step as a Python float.
        """
        self.student.train()
        inputs = batch["input_ids"]

        # The teacher only supplies targets, so no graph is built for it.
        with torch.no_grad():
            teacher_logits = self._logits(self.teacher(inputs))
        student_logits = self._logits(self.student(inputs))

        loss = self.loss_fn(student_logits, teacher_logits, batch["labels"])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        return loss.item()
2. Quantization-Aware Training
class QuantizationConfig:
    """Settings and helpers for symmetric integer weight quantization.

    Args:
        bits: Bit width of the quantized values; must be at least 1
            (``quantize_weight`` additionally needs at least 2).
        group_size: Stored on the instance. ``quantize_weight`` does not use
            it; it computes one scale per slice along the last dimension.

    Raises:
        ValueError: If ``bits`` is less than 1.
    """

    def __init__(self, bits: int = 8, group_size: int = 128):
        if bits < 1:
            raise ValueError(f"bits must be >= 1, got {bits}")
        self.bits = bits
        self.group_size = group_size

    def _storage_dtype(self) -> torch.dtype:
        """Return the smallest signed integer dtype that can hold ``bits``-wide values."""
        if self.bits <= 8:
            return torch.int8
        if self.bits <= 16:
            return torch.int16
        if self.bits <= 32:
            return torch.int32
        return torch.int64

    def quantize_weight(self, weight: torch.Tensor) -> tuple:
        """Symmetrically quantize ``weight`` with one scale per last-dim slice.

        Args:
            weight: Floating-point tensor to quantize.

        Returns:
            ``(quantized, scale)`` where ``quantized`` is an integer tensor of
            the same shape as ``weight`` and ``scale`` keeps the last
            dimension with size 1, so ``quantized * scale`` approximates
            ``weight``.

        Raises:
            ValueError: If ``bits < 2`` (the positive range would be empty).
        """
        if self.bits < 2:
            raise ValueError(
                f"quantize_weight needs bits >= 2, got {self.bits}"
            )
        q_max = 2 ** (self.bits - 1) - 1
        q_min = -(2 ** (self.bits - 1))

        scale = weight.abs().max(dim=-1, keepdim=True).values / q_max
        # An all-zero slice has scale 0; dividing by it would give NaN codes.
        # Use 1 there instead: the codes are 0 and dequantize back to 0.
        scale = torch.where(scale == 0, torch.ones_like(scale), scale)

        quantized = torch.round(weight / scale).clamp(q_min, q_max)
        # Pick a dtype wide enough for the bit width; int8 would wrap for bits > 8.
        return quantized.to(self._storage_dtype()), scale

    def compute_compression_ratio(self, original_params: int) -> dict:
        """Estimate memory savings relative to 4-byte (FP32) parameters.

        Args:
            original_params: Number of parameters in the model.

        Returns:
            Dict with ``original_gb``, ``quantized_gb``, ``compression_ratio``
            and ``memory_saved_pct``.
        """
        original_bytes = original_params * 4
        quantized_bytes = original_params * self.bits / 8
        # The ratio depends only on bit width (32 bits -> self.bits), so
        # computing it directly avoids dividing by zero for an empty model.
        ratio = 32 / self.bits
        return {
            "original_gb": original_bytes / (1024**3),
            "quantized_gb": quantized_bytes / (1024**3),
            "compression_ratio": ratio,
            "memory_saved_pct": (1 - 1/ratio) * 100
        }
3. Structured Pruning
import numpy as np
from typing import List
class StructuredPruner:
    """Score weight rows / attention heads and report model sparsity.

    Args:
        sparsity: Target fraction to prune. Stored on the instance; the
            methods below take their own ratios and do not read it.
    """

    def __init__(self, sparsity: float = 0.3):
        self.sparsity = sparsity

    def compute_importance(self, weights: torch.Tensor,
                           method: str = "l2_norm") -> torch.Tensor:
        """Return one importance score per slice along the last dimension.

        Args:
            weights: Weight tensor; scores are reduced over its last dim.
            method: One of
                ``"l2_norm"``  - L2 norm of each slice;
                ``"magnitude"`` - mean absolute value of each slice;
                ``"gradient"`` - sum of ``|w * dL/dw|`` per slice, using the
                gradient stored in ``weights.grad``.
                Any other value yields uniform scores of 1 per first-dim entry.

        Returns:
            Tensor of importance scores.

        Raises:
            ValueError: If ``method == "gradient"`` and ``weights.grad`` is
                not populated (no backward pass has been run).
        """
        if method == "l2_norm":
            return torch.norm(weights, dim=-1)
        if method == "magnitude":
            return weights.abs().mean(dim=-1)
        if method == "gradient":
            grad = weights.grad
            if grad is None:
                raise ValueError(
                    "gradient importance requires weights.grad; "
                    "run a backward pass first"
                )
            # First-order sensitivity: how much the loss moves if the weight
            # is removed. Detach so the score carries no autograd graph.
            return (weights.detach() * grad).abs().sum(dim=-1)
        # Unknown method: every entry is treated as equally important.
        return torch.ones(weights.shape[0])

    def prune_heads(self, attention_weights: torch.Tensor,
                    num_heads: int, keep_ratio: float = 0.7) -> List[int]:
        """Select the most important attention heads to keep.

        Args:
            attention_weights: Weights whose elements can be split evenly
                into ``num_heads`` contiguous groups along the first
                dimension, e.g. ``(num_heads, d)`` or
                ``(num_heads * head_dim, hidden)``.
                NOTE(review): assumes a head-major layout - confirm against
                the projection being pruned.
            num_heads: Number of attention heads.
            keep_ratio: Fraction of heads to keep; the count is
                ``int(num_heads * keep_ratio)``.

        Returns:
            Indices of the heads to keep, most important first.
        """
        if num_heads <= 0:
            return []
        # Collapse everything belonging to one head into a single row so that
        # exactly one score per head is produced and every returned index is
        # a valid head index in [0, num_heads).
        per_head = attention_weights.reshape(num_heads, -1)
        head_importance = self.compute_importance(per_head)
        num_keep = int(num_heads * keep_ratio)
        _, indices = torch.sort(head_importance, descending=True)
        return indices[:num_keep].tolist()

    def compute_sparsity_stats(self, model: nn.Module) -> dict:
        """Count exactly-zero parameters in ``model``.

        Args:
            model: Module whose ``parameters()`` are inspected.

        Returns:
            Dict with ``total_params``, ``zero_params``, ``sparsity``
            (zero fraction) and ``compression_ratio`` (total / non-zero).
            Denominators are floored at 1 so empty or all-zero models do not
            divide by zero.
        """
        total_params = 0
        zero_params = 0
        for param in model.parameters():
            total_params += param.numel()
            zero_params += (param == 0).sum().item()
        return {
            "total_params": total_params,
            "zero_params": zero_params,
            "sparsity": zero_params / max(total_params, 1),
            "compression_ratio": total_params / max(total_params - zero_params, 1)
        }
Key Formulas
Distillation Loss
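$$\mathcal{L}_{\text{distill}} = \alpha \, \tau^{2} \, \mathrm{KL}\!\left(p_T^{(\tau)} \,\|\, p_S^{(\tau)}\right) + (1 - \alpha) \, \mathcal{L}_{\text{CE}}\!\left(y, p_S\right)$$
Here,
- $p_T^{(\tau)} = \mathrm{softmax}(z_T / \tau)$ = Teacher soft predictions at temperature $\tau$
- $p_S^{(\tau)} = \mathrm{softmax}(z_S / \tau)$ = Student soft predictions at temperature $\tau$
- $\tau$ = Temperature parameter
- $\alpha$ = Balance between soft and hard loss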
Perplexity Ratio
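$$\text{PPL Ratio} = \frac{\mathrm{PPL}_{\text{student}}}{\mathrm{PPL}_{\text{teacher}}}$$
Here,
- $\mathrm{PPL}_{\text{student}}$ = Student model perplexity
- $\mathrm{PPL}_{\text{teacher}}$ = Teacher model perplexity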
Compression Comparison
| Technique | Size Reduction | Speed Improvement | Quality Loss | Best For |
|---|---|---|---|---|
| INT8 Quantization | 4x | 2-3x | <1% | General deployment |
| INT4 Quantization | 8x | 3-4x | 1-3% | Edge devices |
| Knowledge Distillation | 5-10x | 5-10x | 2-5% | Production APIs |
| Pruning (50%) | 2x | 1.5x | 1-2% | Latency reduction |
| Combined | 10-20x | 5-10x | 3-8% | Extreme constraints |
Best Practices
- Start with post-training quantization, as it requires no retraining (quantization-aware training does)
- Use knowledge distillation for significant size reduction
- Evaluate on domain-specific benchmarks, not just general metrics
- Combine techniques for maximum compression
- Monitor quality degradation after each compression step