Language Modeling

Language modeling is the task of assigning probabilities to sequences of tokens. It's the foundation of modern NLP, powering models like GPT and BERT.

Causal Language Modeling (CLM)

Definition: Causal Language Model

$$P(x_1, \dots, x_T) = \prod_{t=1}^{T} P(x_t \mid x_1, \dots, x_{t-1})$$

The model predicts each token based only on previous tokens (left-to-right).

import torch
import torch.nn as nn

class CausalLM(nn.Module):
    """Transformer language model that predicts each token from its left context.

    Token embeddings and learned position embeddings are summed, run through a
    stack of ``nn.TransformerDecoderLayer`` blocks under a causal attention
    mask, and projected to vocabulary logits.

    Args:
        vocab_size: Size of the token embedding table and of the output logits.
        d_model: Width of the embeddings and hidden states.
        num_heads: Number of attention heads per layer.
        num_layers: Number of stacked decoder layers.
        max_len: Number of learned position embeddings; input sequences index
            into this table, so they must not be longer than this.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional = nn.Embedding(max_len, d_model)

        block = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=4 * d_model,
            batch_first=True,
        )
        self.decoder = nn.TransformerDecoder(block, num_layers)
        self.output = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Compute vocabulary logits for every position of ``x``.

        Args:
            x: Token ids indexed as (batch, seq).

        Returns:
            Logits indexed as (batch, seq, vocab_size).
        """
        length = x.size(1)
        device = x.device

        # Additive attention mask: -inf strictly above the diagonal (future
        # positions), 0 on and below it.
        future_mask = torch.full(
            (length, length), float('-inf'), device=device
        ).triu(diagonal=1)

        position_ids = torch.arange(length, device=device).unsqueeze(0)
        hidden = self.embedding(x) + self.positional(position_ids)

        # The decoder layer requires a `memory` input for cross-attention;
        # there is no encoder here, so an all-zero tensor is supplied.
        placeholder_memory = torch.zeros_like(hidden)
        hidden = self.decoder(
            hidden, memory=placeholder_memory, tgt_mask=future_mask
        )
        return self.output(hidden)

# Training
# Causal LM with a 30,000-token vocabulary, 512-wide hidden states, 8 heads,
# 6 decoder layers and 512 learned positions.
model = CausalLM(vocab_size=30000, d_model=512, num_heads=8, num_layers=6, max_len=512)
# Cross-entropy over vocabulary logits; train_step reads this module-level name.
criterion = nn.CrossEntropyLoss()

def train_step(model, batch, optimizer):
    """Run one next-token-prediction update and return the loss value.

    Args:
        model: Callable mapping token ids to logits whose last dimension is
            the vocabulary.
        batch: Mapping with an ``'input_ids'`` entry indexed as (batch, seq).
        optimizer: Optimizer whose ``step()`` and ``zero_grad()`` are called.

    Returns:
        The loss as a Python float, computed with the module-level
        ``criterion``.
    """
    tokens = batch['input_ids']
    # Position t is trained to predict token t+1: the inputs drop the last
    # token and the targets drop the first.
    targets = tokens[:, 1:]
    inputs = tokens[:, :-1]

    logits = model(inputs)
    vocab_dim = logits.size(-1)
    # Flatten to one row of logits per target token.
    loss = criterion(logits.reshape(-1, vocab_dim), targets.reshape(-1))

    loss.backward()
    optimizer.step()
    # Clear gradients so the next call does not accumulate onto these.
    optimizer.zero_grad()
    return loss.item()

Masked Language Modeling (MLM)

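Definition: Masked Language Model

$$\mathcal{L}_{\text{MLM}} = -\sum_{i \in M} \log P(x_i \mid x_{\setminus M})$$

Where $M$ is the set of masked positions and $x_{\setminus M}$ is the input sequence with those positions masked out.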

class MaskedLM(nn.Module):
    """Bidirectional Transformer encoder with a vocabulary prediction head.

    Token embeddings and learned position embeddings are summed, passed
    through a stack of ``nn.TransformerEncoderLayer`` blocks, and projected to
    vocabulary logits at every position.

    Args:
        vocab_size: Size of the token embedding table and of the output logits.
        d_model: Width of the embeddings and hidden states.
        num_heads: Number of attention heads per layer.
        num_layers: Number of stacked encoder layers.
        max_len: Number of learned position embeddings; input sequences index
            into this table, so they must not be longer than this.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional = nn.Embedding(max_len, d_model)

        block = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=4 * d_model,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(block, num_layers)
        self.output = nn.Linear(d_model, vocab_size)

    def forward(self, x, attention_mask=None):
        """Compute vocabulary logits for every position of ``x``.

        Args:
            x: Token ids indexed as (batch, seq).
            attention_mask: Optional tensor aligned with ``x``; entries equal
                to 0 mark positions the encoder should ignore as keys.

        Returns:
            Logits indexed as (batch, seq, vocab_size).
        """
        length = x.size(1)
        position_ids = torch.arange(length, device=x.device).unsqueeze(0)
        hidden = self.embedding(x) + self.positional(position_ids)

        # The encoder wants True where a position is to be ignored, which is
        # the inverse of the 1 = keep / 0 = pad convention of attention_mask.
        padding_mask = None if attention_mask is None else (attention_mask == 0)

        hidden = self.encoder(hidden, src_key_padding_mask=padding_mask)
        return self.output(hidden)

# MLM training with BERT-style masking
def mask_tokens(input_ids, vocab_size, mask_token_id=103, mask_prob=0.15,
                special_token_ids=(101, 102, 0)):
    """Apply BERT-style random masking for masked-language-model training.

    Each non-special token is selected with probability ``mask_prob``. Of the
    selected tokens, about 80% are replaced by ``mask_token_id``, about 10% by
    a random id drawn from ``[0, vocab_size)``, and the rest are left as is.

    The caller's ``input_ids`` tensor is not modified; a masked copy is
    returned. All helper tensors are created on the device (and, for the
    random ids, with the dtype) of ``input_ids``.

    Args:
        input_ids: Integer tensor of token ids.
        vocab_size: Exclusive upper bound for the random replacement ids.
        mask_token_id: Id written at positions chosen for [MASK] replacement.
        mask_prob: Per-token probability of being selected for prediction.
        special_token_ids: Ids that are never selected. The default matches
            the ids 101, 102 and 0 that were previously hard-coded.

    Returns:
        A ``(masked_input_ids, labels)`` tuple. ``labels`` holds the original
        id at selected positions and -100 everywhere else, so that a loss with
        ``ignore_index=-100`` only scores the selected tokens.
    """
    device = input_ids.device
    labels = input_ids.clone()
    # Work on a copy so the tensor the caller passed in is left untouched.
    masked_inputs = input_ids.clone()

    probability_matrix = torch.full(input_ids.shape, mask_prob, device=device)

    # Special tokens get selection probability 0.
    special_mask = torch.zeros_like(input_ids, dtype=torch.bool)
    for token_id in special_token_ids:
        special_mask |= input_ids == token_id
    probability_matrix.masked_fill_(special_mask, value=0.0)

    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # Only compute loss on masked tokens

    # 80% of the selected positions become the mask token.
    indices_replaced = (
        torch.bernoulli(torch.full(input_ids.shape, 0.8, device=device)).bool()
        & masked_indices
    )
    masked_inputs[indices_replaced] = mask_token_id

    # Half of the remaining 20% (i.e. 10% overall) become a random token; the
    # other half keep their original id.
    indices_random = (
        torch.bernoulli(torch.full(input_ids.shape, 0.5, device=device)).bool()
        & masked_indices
        & ~indices_replaced
    )
    random_words = torch.randint(
        vocab_size, input_ids.shape, dtype=input_ids.dtype, device=device
    )
    masked_inputs[indices_random] = random_words[indices_random]

    return masked_inputs, labels

Perplexity

Perplexity measures how well a probability model predicts a sample. Lower perplexity indicates better modeling.

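Definition: Perplexity

$$\text{PPL}(x_1, \dots, x_N) = \exp\left(-\frac{1}{N} \sum_{i=1}^{N} \log P(x_i \mid x_1, \dots, x_{i-1})\right)$$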

import math

def compute_perplexity(model, tokenizer, text):
    """Return the perplexity of ``text`` as ``exp`` of the model's loss.

    Args:
        model: Callable invoked as ``model(input_ids, labels=input_ids)`` whose
            result exposes a scalar ``.loss`` tensor (presumably a Hugging
            Face-style causal LM returning mean token cross-entropy; confirm).
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            whose result exposes ``.input_ids``.
        text: The string to score.

    Returns:
        ``math.exp(loss)`` as a Python float.
    """
    token_ids = tokenizer(text, return_tensors="pt").input_ids

    # Scoring only, so no autograd graph is needed.
    with torch.no_grad():
        mean_loss = model(token_ids, labels=token_ids).loss

    return math.exp(mean_loss.item())

# Compare models
# NOTE(review): `tokenizer` is not defined in the visible part of this file,
# and compute_perplexity calls model(input_ids, labels=...) and reads `.loss`,
# which the CausalLM instance bound to `model` earlier does not support
# (its forward takes only `x` and returns a logits tensor). Presumably a
# Hugging Face-style model/tokenizer pair is intended here; confirm.
texts = [
    "The quick brown fox jumps over the lazy dog.",
    "Colorless green ideas sleep furiously."
]

# Print one perplexity per sentence, labelled with its first 30 characters.
for text in texts:
    ppl = compute_perplexity(model, tokenizer, text)
    print(f"PPL: {ppl:.2f} for '{text[:30]}...'")

Language Modeling Comparison

Model	Type	Direction	Pre-training	Best For
GPT-2	Causal	Left-to-right	LM	Generation
BERT	Masked	Bidirectional	MLM + NSP	Understanding
XLNet	Permutation	All orders	Permutation LM	Both
T5	Prefix	Encoder-decoder	Span corruption	Multi-task
RoBERTa	Masked	Bidirectional	Dynamic MLM	Understanding

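Perplexity is the inverse probability of the test set, normalized by the number of tokens (equivalently, the exponential of the average per-token negative log-likelihood). A perplexity of 100 means the model is, on average, as uncertain as if it were choosing uniformly among 100 equally likely tokens at each position.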

Temperature and Perplexity

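Definition: Perplexity with Temperature

$$P_\tau(x_i \mid x_{<i}) = \text{softmax}(z_i / \tau)_{x_i}, \qquad \text{PPL}_\tau = \exp\left(-\frac{1}{N} \sum_{i=1}^{N} \log P_\tau(x_i \mid x_{<i})\right)$$

where $z_i$ are the logits used to predict token $x_i$ and $\tau > 0$ is the temperature.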

def perplexity_with_temperature(model, tokenizer, text, temperature=1.0):
    """Return the perplexity of ``text`` after scaling logits by a temperature.

    The logits are divided by ``temperature`` and the logits at position t are
    scored against the token at position t + 1 with mean cross-entropy; the
    result is ``exp`` of that loss.

    Args:
        model: Callable invoked as ``model(input_ids)`` whose result exposes
            ``.logits`` indexed as (batch, seq, vocab).
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            whose result exposes ``.input_ids`` indexed as (batch, seq).
        text: The string to score.
        temperature: Positive divisor applied to the logits.

    Returns:
        The temperature-scaled perplexity as a Python float.

    Raises:
        ValueError: If ``temperature`` is not strictly positive (or is NaN),
            or if ``text`` tokenizes to fewer than two tokens, in which case
            there is no next-token target to score.
    """
    # Written as `not > 0` so that NaN is rejected as well as 0 and negatives,
    # all of which would otherwise yield inf/NaN or meaningless logits.
    if not temperature > 0:
        raise ValueError(f"temperature must be positive, got {temperature!r}")

    encodings = tokenizer(text, return_tensors="pt")
    input_ids = encodings.input_ids

    # With fewer than two tokens the shifted targets are empty and the mean
    # loss would be NaN.
    if input_ids.size(1) < 2:
        raise ValueError("text must tokenize to at least two tokens")

    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits / temperature

    # Align predictions with targets: logits at t predict the token at t + 1.
    shift_logits = logits[:, :-1, :]
    shift_labels = input_ids[:, 1:]

    loss_fn = nn.CrossEntropyLoss()
    loss = loss_fn(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        shift_labels.reshape(-1),
    )

    return math.exp(loss.item())

Evaluation Metrics

Metric	Description	Range	Good Value
Perplexity	Prediction uncertainty	[1, ∞)	< 50
Bits per token	Information content	[0, ∞)	< 5
Accuracy	Next token accuracy	[0, 100]	> 60
Top-5 accuracy	Correct in top 5	[0, 100]	> 80

Language Modeling

Language Modeling

Causal Language Modeling (CLM)

DfCausal Language Model

Masked Language Modeling (MLM)

DfMasked Language Model

Perplexity

DfPerplexity

Language Modeling Comparison

Temperature and Perplexity

DfPerplexity with Temperature

Evaluation Metrics

Perplexity Calculation

Premium Content

Need Expert NLP Help?