NLP Transfer Learning
Transfer learning enables NLP models trained on large general corpora to adapt to specific domains and tasks with minimal labeled data.
Transfer Learning Spectrum
Parameter-Efficient Fine-Tuning
Definition: LoRA (Low-Rank Adaptation)
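LoRA freezes the pretrained weight matrix $W_0$ and decomposes the weight update into a product of two low-rank matrices:
$$W = W_0 + \Delta W = W_0 + BA$$
where $B \in \mathbb{R}^{d \times r}$ and $A \in \mathbb{R}^{r \times k}$ with rank $r \ll \min(d, k)$.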
import torch
import torch.nn as nn
from transformers import AutoModel
class LoRALayer(nn.Module):
    """Linear projection with a frozen weight plus a trainable low-rank update.

    The forward pass computes
    ``original(x) + (x @ lora_A @ lora_B) * scaling`` where ``lora_A`` has
    shape ``(in_features, rank)`` and ``lora_B`` has shape
    ``(rank, out_features)``. ``lora_B`` starts at zero, so a fresh layer
    produces exactly the output of ``original``.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        rank: Inner dimension of the low-rank factors.
        alpha: Numerator of the scaling factor ``alpha / rank``.

    Attributes:
        merged: ``True`` once :meth:`merge_weights` has folded the update
            into ``original.weight``. It is a plain Python attribute and is
            therefore not part of ``state_dict()``.
    """

    def __init__(self, in_features, out_features, rank=8, alpha=16):
        super().__init__()
        self.original = nn.Linear(in_features, out_features, bias=False)
        self.original.weight.requires_grad = False

        # lora_A is filled by kaiming_uniform_ below, so it is allocated
        # uninitialised instead of being drawn from randn and then overwritten.
        self.lora_A = nn.Parameter(torch.empty(in_features, rank))
        self.lora_B = nn.Parameter(torch.zeros(rank, out_features))
        self.scaling = alpha / rank
        self.merged = False

        nn.init.kaiming_uniform_(self.lora_A)

    def forward(self, x):
        """Return the frozen projection of ``x`` plus the scaled LoRA update."""
        base_output = self.original(x)
        if self.merged:
            # The update already lives inside original.weight; adding the
            # low-rank path again would count it twice.
            return base_output
        lora_output = (x @ self.lora_A @ self.lora_B) * self.scaling
        return base_output + lora_output

    def merge_weights(self):
        """Merge LoRA weights into the original layer for inference.

        ``nn.Linear`` stores its weight as ``(out_features, in_features)``
        while ``lora_A @ lora_B`` is ``(in_features, out_features)``, hence
        the transpose. Calling this more than once is a no-op.
        """
        if self.merged:
            return
        with torch.no_grad():
            delta = (self.lora_A @ self.lora_B).T * self.scaling
            self.original.weight.add_(delta)
        self.merged = True
class LoRAModel(nn.Module):
    """Pretrained transformer with LoRA adapters on selected linear layers.

    The base model is loaded with ``AutoModel.from_pretrained`` and all of
    its parameters are frozen. For every ``nn.Linear`` whose module name
    contains one of ``target_modules``, a ``LoRALayer`` is created and a
    forward hook adds ``(x @ lora_A @ lora_B) * scaling`` to that layer's
    output. Only the adapter's low-rank factors and scaling are used; the
    adapter's own ``original`` projection is not.

    Args:
        base_model_name: Identifier passed to ``AutoModel.from_pretrained``.
        target_modules: Substrings matched against module names.
        rank: Rank passed to every ``LoRALayer``.

    Attributes:
        lora_layers: ``nn.ModuleDict`` of adapters, keyed by the target
            module's name with ``"."`` replaced by ``"__"``.
    """

    def __init__(self, base_model_name, target_modules, rank=8):
        from functools import partial

        super().__init__()
        self.base_model = AutoModel.from_pretrained(base_model_name)

        # Only the adapters should train; without this the whole base model
        # would still be reported (and updated) as trainable.
        for param in self.base_model.parameters():
            param.requires_grad = False

        self.lora_layers = nn.ModuleDict()
        # Handles are kept so the hooks can be removed later via .remove().
        self._lora_hook_handles = []

        for name, module in self.base_model.named_modules():
            if not isinstance(module, nn.Linear):
                continue
            if not any(target in name for target in target_modules):
                continue
            adapter = LoRALayer(
                module.in_features,
                module.out_features,
                rank=rank,
            )
            self.lora_layers[self._adapter_key(name)] = adapter
            # partial (rather than a closure) keeps the hook copyable and
            # picklable together with the adapter it is bound to.
            handle = module.register_forward_hook(
                partial(self._apply_lora, adapter)
            )
            self._lora_hook_handles.append(handle)

    @staticmethod
    def _adapter_key(module_name):
        """Return a ModuleDict-safe key; submodule names may not contain '.'."""
        return module_name.replace(".", "__")

    @staticmethod
    def _apply_lora(adapter, module, inputs, output):
        """Forward hook: add the adapter's scaled low-rank update to ``output``.

        Assumes the hooked layer receives its input tensor as the first
        positional argument, which is what ``inputs[0]`` reads.
        """
        update = (inputs[0] @ adapter.lora_A @ adapter.lora_B) * adapter.scaling
        return output + update

    def forward(self, **kwargs):
        """Run the base model; LoRA updates are injected by the forward hooks."""
        return self.base_model(**kwargs)

    def print_trainable_parameters(self):
        """Print the number and percentage of parameters with requires_grad."""
        trainable = 0
        total = 0
        for param in self.parameters():
            count = param.numel()
            total += count
            if param.requires_grad:
                trainable += count
        share = 100 * trainable / total if total else 0.0
        print(f"Trainable: {trainable:,} ({share:.2f}%)")
# Usage: build a LoRA-wrapped BERT targeting the "query" and "value" modules,
# then report the trainable-parameter share.
model = LoRAModel("bert-base-uncased", target_modules=["query", "value"], rank=8)
model.print_trainable_parameters()
Adapter Layers
| Method | Parameters | Speed | Performance | Use Case |
|---|---|---|---|---|
| Full Fine-Tune | 100% | Slow | Best | Large datasets |
| LoRA | 0.1-1% | Fast | Near-best | Resource-limited |
| Adapters | 1-5% | Fast | Good | Multi-task |
| Prefix Tuning | <1% | Fast | Good | Generation |
| Prompt Tuning | <0.1% | Fast | Moderate | Classification |
Domain Adaptation Strategies
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from torch.utils.data import DataLoader
class DomainAdapter:
    """Adapt a pretrained masked-language model to a target domain.

    Loads a tokenizer and an ``AutoModelForMaskedLM`` for ``model_name`` and
    offers vocabulary extension and continued masked-LM pre-training.

    Args:
        model_name: Identifier passed to both ``from_pretrained`` calls.
        domain_vocab: Stored on the instance as ``domain_vocab``; no method
            of this class reads it.
    """

    def __init__(self, model_name, domain_vocab):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
        self.domain_vocab = domain_vocab

    def extend_vocabulary(self, domain_texts):
        """Add domain-specific tokens to the vocabulary.

        Returns:
            The number of tokens reported as added by ``tokenizer.add_tokens``.
        """
        new_tokens = self._extract_domain_tokens(domain_texts)
        num_added = self.tokenizer.add_tokens(new_tokens)
        # Keep the embedding matrix in sync with the enlarged tokenizer.
        self.model.resize_token_embeddings(len(self.tokenizer))
        return num_added

    def _extract_domain_tokens(self, texts, min_freq=5):
        """Extract frequent whole words that are missing from the vocabulary.

        Words are counted at the whole-word level (lower-cased ``\\w+`` runs).
        Counting the tokenizer's own sub-word pieces would only ever yield
        pieces that are already in the vocabulary, so nothing would be added.

        Args:
            texts: Iterable of strings.
            min_freq: Minimum number of occurrences for a word to be kept.

        Returns:
            Words in first-seen order with ``freq >= min_freq`` that are not
            keys of ``tokenizer.get_vocab()``.
        """
        import re
        from collections import Counter

        word_freq = Counter()
        for text in texts:
            word_freq.update(re.findall(r"\w+", text.lower()))

        # Fetch the vocabulary once instead of once per candidate word.
        vocab = self.tokenizer.get_vocab()
        return [
            word
            for word, freq in word_freq.items()
            if freq >= min_freq and word not in vocab
        ]

    def continual_pretraining(self, domain_corpus, epochs=3, lr=5e-5):
        """Continue masked-LM pre-training on domain-specific data.

        Each batch is tokenized, 15% of the non-special, non-padding
        positions are replaced by the mask token, and the model is trained
        to recover them. The mean loss per processed batch is printed after
        every epoch.

        Args:
            domain_corpus: Iterable of batches accepted by the tokenizer; it
                is iterated once per epoch.
            epochs: Number of passes over ``domain_corpus``.
            lr: Learning rate for AdamW.
        """
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)
        self.model.train()

        for epoch in range(epochs):
            total_loss = 0.0
            num_batches = 0

            for batch in domain_corpus:
                inputs = self.tokenizer(
                    batch,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=512,
                    return_special_tokens_mask=True,
                )
                # The special-token mask is not a model input; remove it
                # before the encoding is unpacked into the forward call.
                not_maskable = inputs.pop("special_tokens_mask").bool()
                if "attention_mask" in inputs:
                    not_maskable |= inputs["attention_mask"] == 0

                labels = inputs["input_ids"].clone()
                mask_prob = torch.full(labels.shape, 0.15, device=labels.device)
                mask_prob.masked_fill_(not_maskable, 0.0)
                masked_indices = torch.bernoulli(mask_prob).bool()
                if not masked_indices.any():
                    # With every label ignored the loss would be NaN.
                    continue

                labels[~masked_indices] = -100
                inputs["input_ids"][masked_indices] = self.tokenizer.mask_token_id

                outputs = self.model(**inputs, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                total_loss += loss.item()
                num_batches += 1

            # Divide by processed batches; works for unsized or empty corpora.
            print(f"Epoch {epoch + 1}: Loss = {total_loss / max(num_batches, 1):.4f}")
Multi-Task Learning
Definition: Multi-Task Loss
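The combined loss across tasks with uncertainty-based weighting:
$$\mathcal{L}_{\text{total}} = \sum_{t} \left( e^{-s_t}\, \mathcal{L}_t + s_t \right)$$
where $s_t = \log \sigma_t^2$ are learned task-specific log-variances, so each task loss $\mathcal{L}_t$ is weighted by the learned task-specific weight $e^{-s_t}$.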
import torch
import torch.nn as nn
class MultiTaskNLPModel(nn.Module):
    """Shared encoder with one linear head and one learned log-variance per task.

    Args:
        base_model: Encoder called as ``base_model(input_ids, attention_mask)``;
            its result must expose ``last_hidden_state``.
        task_configs: Mapping ``task_name -> {"hidden_size": int,
            "num_classes": int}`` describing each head.
    """

    def __init__(self, base_model, task_configs):
        super().__init__()
        self.base_model = base_model
        self.task_heads = nn.ModuleDict()
        self.log_vars = nn.ParameterDict()

        for task_name, config in task_configs.items():
            self.task_heads[task_name] = nn.Linear(
                config["hidden_size"],
                config["num_classes"],
            )
            # Zero log-variance means an initial task weight of exp(0) = 1.
            self.log_vars[task_name] = nn.Parameter(torch.zeros(1))

    def forward(self, input_ids, attention_mask, task_name):
        """Return the ``task_name`` head's output for the position-0 hidden state."""
        base_output = self.base_model(input_ids, attention_mask)
        # Sequence position 0 is used as the pooled representation
        # (presumably the [CLS] token; depends on the encoder).
        hidden_states = base_output.last_hidden_state[:, 0, :]
        return self.task_heads[task_name](hidden_states)

    def compute_loss(self, outputs, labels, task_name):
        """Return ``(weighted_loss, raw_task_loss)`` for one task.

        Heads with a single output are treated as regression and use mean
        squared error, because cross-entropy over one class is identically
        zero and gives no training signal. All other heads use cross-entropy.
        The weighted loss is ``exp(-log_var) * task_loss + log_var``.
        """
        if outputs.shape[-1] == 1:
            targets = labels.reshape(-1).to(outputs.dtype)
            task_loss = nn.MSELoss()(outputs.reshape(-1), targets)
        else:
            task_loss = nn.CrossEntropyLoss()(outputs, labels)

        log_var = self.log_vars[task_name]
        precision = torch.exp(-log_var)
        total_loss = precision * task_loss + log_var
        return total_loss, task_loss

    def get_task_weights(self):
        """Get learned task weights, ``exp(-log_var)``, as floats keyed by task."""
        return {
            task: torch.exp(-log_var).item()
            for task, log_var in self.log_vars.items()
        }
# Task configurations: every head reads a 768-wide hidden state and differs
# only in its number of output classes.
task_configs = {
    task: {"hidden_size": 768, "num_classes": class_count}
    for task, class_count in (("sentiment", 3), ("ner", 9), ("similarity", 1))
}
Transfer Learning Best Practices
| Strategy | When to Use | Key Consideration |
|---|---|---|
| Feature extraction | Small target dataset | Freeze base model |
| Full fine-tuning | Large target dataset | Risk of catastrophic forgetting |
| LoRA/Adapters | Medium dataset, limited compute | Balance efficiency and performance |
| Continual pretraining | Domain shift | Preserve general knowledge |
| Multi-task learning | Related tasks available | Task balancing critical |
Key Takeaways
- Transfer learning reduces data requirements for domain-specific NLP tasks
- Parameter-efficient methods (LoRA, adapters) enable fine-tuning on consumer hardware
- Domain adaptation through continual pretraining improves domain-specific performance
- Multi-task learning shares representations across related NLP tasks
- Task weighting and regularization prevent negative transfer between tasks