Language Modeling
Language modeling is the task of assigning probabilities to sequences of tokens. It's the foundation of modern NLP, powering models like GPT and BERT.
Causal Language Modeling (CLM)
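Definition (Causal Language Model): the probability of a sequence is factorized left to right as $P(x_1, \dots, x_T) = \prod_{t=1}^{T} P(x_t \mid x_1, \dots, x_{t-1})$.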
The model predicts each token based only on previous tokens (left-to-right).
import torch
import torch.nn as nn
class CausalLM(nn.Module):
    """Transformer language model in which each position sees only its past.

    Token and learned positional embeddings are summed, passed through a
    stack of ``nn.TransformerDecoderLayer`` modules under a causal mask, and
    projected to vocabulary logits.

    Args:
        vocab_size: Size of the token embedding table and of the output
            projection.
        d_model: Width of the embeddings and of every Transformer layer.
        num_heads: Number of attention heads per layer.
        num_layers: Number of stacked decoder layers.
        max_len: Number of rows in the positional embedding table.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional = nn.Embedding(max_len, d_model)
        block = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=4 * d_model,
            batch_first=True,
        )
        self.decoder = nn.TransformerDecoder(block, num_layers)
        self.output = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return vocabulary logits for every position of ``x``.

        Args:
            x: Token ids; dimension 1 is read as the sequence length (the
                decoder layers are built with ``batch_first=True``).

        Returns:
            The output projection applied to the decoder states, so the last
            dimension has size ``vocab_size``.
        """
        n_tokens = x.size(1)
        device = x.device

        # Additive attention mask: -inf strictly above the diagonal (future
        # positions), 0 on and below it.
        future_mask = torch.triu(
            torch.full((n_tokens, n_tokens), float('-inf'), device=device),
            diagonal=1,
        )

        position_ids = torch.arange(n_tokens, device=device).unsqueeze(0)
        hidden = self.embedding(x) + self.positional(position_ids)

        # The decoder stack requires a ``memory`` input for cross-attention;
        # an all-zero tensor of the same shape is supplied.
        hidden = self.decoder(
            hidden,
            memory=torch.zeros_like(hidden),
            tgt_mask=future_mask,
        )
        return self.output(hidden)
# Training setup: the model instance and the loss that train_step applies.
model = CausalLM(
    vocab_size=30000,
    d_model=512,
    num_heads=8,
    num_layers=6,
    max_len=512,
)
# Cross-entropy with the library's default settings.
criterion = nn.CrossEntropyLoss()
def train_step(model, batch, optimizer, loss_fn=None):
    """Run one next-token-prediction update and return the scalar loss.

    The sequence in ``batch['input_ids']`` is split into inputs (all tokens
    but the last) and targets (all tokens but the first), so the logits at
    position ``t`` are scored against token ``t + 1``.

    Args:
        model: Callable mapping token ids to logits whose last dimension is
            the vocabulary size.
        batch: Mapping with an ``'input_ids'`` tensor indexed as
            ``[batch, sequence]``.
        optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.
        loss_fn: Callable ``(logits_2d, targets_1d) -> loss``. Defaults to
            the module-level ``criterion``.

    Returns:
        The loss of this step as a Python float.

    Raises:
        ValueError: If the sequences hold fewer than two tokens, which would
            leave no (input, target) pair to score.
    """
    token_ids = batch['input_ids']
    if token_ids.size(1) < 2:
        raise ValueError(
            "train_step needs sequences of at least 2 tokens, "
            f"got length {token_ids.size(1)}"
        )
    if loss_fn is None:
        loss_fn = criterion

    inputs = token_ids[:, :-1]
    targets = token_ids[:, 1:]  # Same sequence shifted by one position

    # Drop gradients left over from before this call so that they cannot
    # leak into this update.
    optimizer.zero_grad()

    logits = model(inputs)
    loss = loss_fn(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))
    loss.backward()
    optimizer.step()

    # Leave gradients cleared on return, as earlier versions of this function did.
    optimizer.zero_grad()
    return loss.item()
Masked Language Modeling (MLM)
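Definition (Masked Language Model): a random subset of the input tokens is hidden, and the model is trained to recover them from the surrounding context on both sides by minimizing
$$\mathcal{L}_{\text{MLM}} = -\sum_{i \in M} \log P(x_i \mid x_{\setminus M})$$
Where $M$ is the set of masked positions and $x_{\setminus M}$ is the input with those positions masked out.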
class MaskedLM(nn.Module):
    """Bidirectional Transformer encoder with a vocabulary projection head.

    Token and learned positional embeddings are summed, passed through a
    stack of ``nn.TransformerEncoderLayer`` modules with no causal mask, and
    projected to vocabulary logits.

    Args:
        vocab_size: Size of the token embedding table and of the output
            projection.
        d_model: Width of the embeddings and of every Transformer layer.
        num_heads: Number of attention heads per layer.
        num_layers: Number of stacked encoder layers.
        max_len: Number of rows in the positional embedding table.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional = nn.Embedding(max_len, d_model)
        block = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=4 * d_model,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(block, num_layers)
        self.output = nn.Linear(d_model, vocab_size)

    def forward(self, x, attention_mask=None):
        """Return vocabulary logits for every position of ``x``.

        Args:
            x: Token ids; dimension 1 is read as the sequence length.
            attention_mask: Optional tensor in which entries equal to 0 mark
                positions the encoder should ignore as attention keys.
                When omitted, no padding mask is applied.

        Returns:
            The output projection applied to the encoder states, so the last
            dimension has size ``vocab_size``.
        """
        n_tokens = x.size(1)
        position_ids = torch.arange(n_tokens, device=x.device).unsqueeze(0)
        hidden = self.embedding(x) + self.positional(position_ids)

        # The encoder expects True at positions to ignore, i.e. wherever the
        # attention mask is 0.
        padding_mask = None if attention_mask is None else (attention_mask == 0)

        hidden = self.encoder(hidden, src_key_padding_mask=padding_mask)
        return self.output(hidden)
# MLM training with BERT-style masking
def mask_tokens(input_ids, vocab_size, mask_token_id=103, mask_prob=0.15,
                special_token_ids=(101, 102, 0)):
    """Apply BERT-style masking and build the matching MLM labels.

    Each non-special position is selected with probability ``mask_prob``.
    Of the selected positions, about 80% are replaced by ``mask_token_id``,
    about 10% by a random id drawn from ``[0, vocab_size)``, and the rest
    keep their original token.

    Note:
        ``input_ids`` is modified in place; the returned tensor is the same
        object. Pass ``input_ids.clone()`` to keep the original.

    Args:
        input_ids: Integer tensor of token ids.
        vocab_size: Exclusive upper bound for random replacement ids.
        mask_token_id: Id written at positions chosen for masking.
        mask_prob: Probability of selecting each non-special position.
        special_token_ids: Ids that are never selected. The default
            ``(101, 102, 0)`` matches the ids this function always skipped.

    Returns:
        ``(input_ids, labels)``. ``labels`` holds the original id at every
        selected position and ``-100`` elsewhere, so the loss is computed
        only on selected tokens.
    """
    device = input_ids.device
    labels = input_ids.clone()

    # Create the sampling tensors on the input's device; an explicit float
    # dtype keeps torch.bernoulli valid even if mask_prob is given as an int.
    probability_matrix = torch.full(
        input_ids.shape, mask_prob, dtype=torch.float, device=device
    )

    # Don't mask special tokens
    special_mask = torch.zeros_like(input_ids, dtype=torch.bool)
    for token_id in special_token_ids:
        special_mask |= input_ids == token_id
    probability_matrix.masked_fill_(special_mask, value=0.0)

    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # Only compute loss on masked tokens

    # 80% of the selected positions -> mask token
    indices_replaced = (
        torch.bernoulli(torch.full(input_ids.shape, 0.8, device=device)).bool()
        & masked_indices
    )
    input_ids[indices_replaced] = mask_token_id

    # Half of the remaining 20% (10% overall) -> random token
    indices_random = (
        torch.bernoulli(torch.full(input_ids.shape, 0.5, device=device)).bool()
        & masked_indices
        & ~indices_replaced
    )
    random_words = torch.randint(
        vocab_size, input_ids.shape, dtype=input_ids.dtype, device=device
    )
    input_ids[indices_random] = random_words[indices_random]

    # The last 10% of selected positions keep their original token.
    return input_ids, labels
Perplexity
Perplexity measures how well a probability model predicts a sample. Lower perplexity indicates better modeling.
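Definition (Perplexity): for a sequence of $N$ tokens, $\mathrm{PPL}(X) = \exp\left(-\frac{1}{N} \sum_{i=1}^{N} \log P(x_i \mid x_{<i})\right)$.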
import math
def compute_perplexity(model, tokenizer, text):
    """Return the perplexity of ``text``, computed as ``exp(loss)``.

    Args:
        model: Callable invoked as ``model(input_ids, labels=input_ids)``
            whose result exposes a ``loss`` attribute with an ``.item()``
            method.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            whose result exposes ``input_ids``.
        text: The string to score.

    Returns:
        ``math.exp`` of the loss value reported by the model, as a float.
    """
    token_ids = tokenizer(text, return_tensors="pt").input_ids

    # Evaluation only: gradient tracking is switched off for the forward pass.
    with torch.no_grad():
        mean_loss = model(token_ids, labels=token_ids).loss

    return math.exp(mean_loss.item())
# Compare models: score a fluent sentence and a grammatical but nonsensical one.
# NOTE(review): `tokenizer` is not defined in the visible part of this file,
# and compute_perplexity calls `model(input_ids, labels=...)` and reads
# `.loss` -- confirm that `model` here exposes that interface.
texts = [
    "The quick brown fox jumps over the lazy dog.",
    "Colorless green ideas sleep furiously.",
]

for text in texts:
    ppl = compute_perplexity(model, tokenizer, text)
    # Only the first 30 characters of the sentence are echoed.
    print(f"PPL: {ppl:.2f} for '{text[:30]}...'")
Language Modeling Comparison
| Model | Type | Direction | Pre-training | Best For |
|---|---|---|---|---|
| GPT-2 | Causal | Left-to-right | LM | Generation |
| BERT | Masked | Bidirectional | MLM + NSP | Understanding |
| XLNet | Permutation | All orders | Permutation LM | Both |
| T5 | Prefix | Encoder-decoder | Span corruption | Multi-task |
| RoBERTa | Masked | Bidirectional | Dynamic MLM | Understanding |
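Perplexity is the inverse probability of the test set, normalized by the number of tokens (equivalently, the geometric mean of the inverse per-token probabilities). A perplexity of 100 means that, on average, the model is as uncertain as if it were choosing uniformly among 100 equally likely tokens at each position.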
Temperature and Perplexity
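Definition (Perplexity with Temperature): with logits $z_i$ at position $i$ and temperature $\tau > 0$, $\mathrm{PPL}_\tau(X) = \exp\left(-\frac{1}{N} \sum_{i=1}^{N} \log \mathrm{softmax}(z_i / \tau)_{x_{i+1}}\right)$, where $N$ is the number of predicted tokens.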
def perplexity_with_temperature(model, tokenizer, text, temperature=1.0):
    """Return the perplexity of ``text`` after scaling logits by a temperature.

    The logits are divided by ``temperature``, then the logits at position
    ``t`` are scored against token ``t + 1`` with mean cross-entropy, and the
    result is exponentiated.

    Args:
        model: Callable invoked as ``model(input_ids)`` whose result exposes
            ``logits`` indexed as ``[batch, position, vocab]``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            whose result exposes ``input_ids``.
        text: The string to score.
        temperature: Positive divisor applied to the logits; ``1.0`` leaves
            them unchanged.

    Returns:
        ``math.exp`` of the mean cross-entropy, as a float.

    Raises:
        ValueError: If ``temperature`` is not strictly positive. Zero would
            divide the logits by zero and a negative value would invert
            their ordering.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    input_ids = tokenizer(text, return_tensors="pt").input_ids

    with torch.no_grad():
        logits = model(input_ids).logits / temperature

    # Align predictions with targets: drop the last position's logits and
    # the first token's label.
    shift_logits = logits[:, :-1, :]
    shift_labels = input_ids[:, 1:]

    loss_fn = nn.CrossEntropyLoss()
    loss = loss_fn(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        shift_labels.reshape(-1),
    )
    return math.exp(loss.item())
Evaluation Metrics
| Metric | Description | Range | Good Value |
|---|---|---|---|
| Perplexity | Prediction uncertainty | [1, ∞) | < 50 |
| Bits per token | Information content | [0, ∞) | < 5 |
| Accuracy | Next token accuracy | [0, 100] | > 60 |
| Top-5 accuracy | Correct in top 5 | [0, 100] | > 80 |