🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services · ℹ️ About · ✉️ Contact · View Pricing Plans from $10

Language Modeling

Conversational NLP · Perplexity, Masked LM, and Causal LM · 🟢 Free Lesson

Advertisement

Language Modeling

Language modeling is the task of assigning probabilities to sequences of tokens. It's the foundation of modern NLP, powering models like GPT and BERT.

Causal Language Modeling (CLM)

Definition: Causal Language Model

$$P(x_1, \dots, x_n) = \prod_{t=1}^{n} P(x_t \mid x_1, \dots, x_{t-1})$$

The model predicts each token based only on previous tokens (left-to-right).

import torch
import torch.nn as nn

class CausalLM(nn.Module):
    """Left-to-right Transformer language model.

    Token embeddings and learned position embeddings are summed, run through
    a stack of ``nn.TransformerDecoderLayer`` blocks under a causal mask, and
    projected to vocabulary-sized logits.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, max_len):
        """Create the embeddings, the decoder stack and the output projection.

        Args:
            vocab_size: Rows of the token embedding table and width of the
                output logits.
            d_model: Embedding / hidden width.
            num_heads: Attention heads in every decoder layer.
            num_layers: Number of stacked decoder layers.
            max_len: Rows of the position embedding table.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional = nn.Embedding(max_len, d_model)

        layer = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=4 * d_model,
            batch_first=True,
        )
        self.decoder = nn.TransformerDecoder(layer, num_layers)
        self.output = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Compute logits for every position of a batch of token ids.

        Args:
            x: Integer tensor of token ids; dimension 1 is the sequence axis
                (the layers are built with ``batch_first=True``).

        Returns:
            The output of the final linear layer; its last dimension has
            ``vocab_size`` entries.
        """
        length = x.size(1)
        device = x.device
        position_ids = torch.arange(length, device=device).unsqueeze(0)

        # -inf strictly above the diagonal, 0 on and below it, so a position
        # can only attend to itself and to earlier positions.
        future_mask = torch.full(
            (length, length), float('-inf'), device=device
        ).triu(diagonal=1)

        hidden = self.embedding(x) + self.positional(position_ids)
        # The decoder requires a ``memory`` argument; an all-zero tensor of
        # the same shape as the embedded input is supplied.
        zero_memory = torch.zeros_like(hidden)
        hidden = self.decoder(hidden, memory=zero_memory, tgt_mask=future_mask)
        return self.output(hidden)

# Training
# Causal LM instance: 30,000-token vocabulary, 512-dim hidden size, 8 heads,
# 6 decoder layers, position table with 512 rows.
model = CausalLM(vocab_size=30000, d_model=512, num_heads=8, num_layers=6, max_len=512)
# Module-level loss with default settings; train_step reads it as a global.
criterion = nn.CrossEntropyLoss()

def train_step(model, batch, optimizer):
    """Run one next-token-prediction optimisation step and return the loss.

    The loss is computed with the module-level ``criterion``.

    Args:
        model: Callable mapping a tensor of token ids to logits whose last
            dimension indexes the vocabulary.
        batch: Mapping with an ``'input_ids'`` tensor; it is sliced along
            dimension 1, which is treated as the sequence axis.
        optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.

    Returns:
        The scalar loss as a Python float.

    Raises:
        ValueError: If the sequences hold fewer than two tokens, so no
            (input, next-token) pair can be formed.
    """
    token_ids = batch['input_ids']
    if token_ids.size(1) < 2:
        raise ValueError(
            "train_step needs sequences of at least 2 tokens, "
            f"got length {token_ids.size(1)}"
        )

    # Targets are the tokens one position ahead of the inputs.
    labels = token_ids[:, 1:]
    input_ids = token_ids[:, :-1]

    # Clear gradients left over from any earlier backward pass so they
    # cannot leak into this update.
    optimizer.zero_grad()

    logits = model(input_ids)
    loss = criterion(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))

    loss.backward()
    optimizer.step()
    # Leave gradients cleared on return, as the function always did.
    optimizer.zero_grad()
    return loss.item()

Masked Language Modeling (MLM)

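Definition: Masked Language Model

$$\mathcal{L}_{\text{MLM}} = -\sum_{i \in M} \log P(x_i \mid x_{\setminus M})$$

Where $M$ is the set of masked positions and $x_{\setminus M}$ is the input sequence with those positions masked out.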

class MaskedLM(nn.Module):
    """Bidirectional Transformer encoder with a vocabulary-sized output head.

    Token embeddings and learned position embeddings are summed, run through
    a stack of ``nn.TransformerEncoderLayer`` blocks, and projected to logits.
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, max_len):
        """Create the embeddings, the encoder stack and the output projection.

        Args:
            vocab_size: Rows of the token embedding table and width of the
                output logits.
            d_model: Embedding / hidden width.
            num_heads: Attention heads in every encoder layer.
            num_layers: Number of stacked encoder layers.
            max_len: Rows of the position embedding table.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional = nn.Embedding(max_len, d_model)

        layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=4 * d_model,
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(layer, num_layers)
        self.output = nn.Linear(d_model, vocab_size)

    def forward(self, x, attention_mask=None):
        """Compute logits for every position of a batch of token ids.

        Args:
            x: Integer tensor of token ids; dimension 1 is the sequence axis
                (the layers are built with ``batch_first=True``).
            attention_mask: Optional tensor; positions whose value equals 0
                are flagged in the key-padding mask handed to the encoder.
                When omitted, no padding mask is used.

        Returns:
            The output of the final linear layer; its last dimension has
            ``vocab_size`` entries.
        """
        position_ids = torch.arange(x.size(1), device=x.device).unsqueeze(0)
        hidden = self.embedding(x) + self.positional(position_ids)

        # The encoder expects True at positions to ignore, i.e. where the
        # attention mask is 0.
        padding_mask = None if attention_mask is None else (attention_mask == 0)

        hidden = self.encoder(hidden, src_key_padding_mask=padding_mask)
        return self.output(hidden)

# MLM training with BERT-style masking
def mask_tokens(input_ids, vocab_size, mask_token_id=103, mask_prob=0.15,
                special_token_ids=(101, 102, 0)):
    """Apply BERT-style random masking and build the matching MLM labels.

    Each non-special position is selected with probability ``mask_prob``.
    Of the selected positions, about 80% are replaced by ``mask_token_id``,
    about 10% by a random token id in ``[0, vocab_size)``, and the rest are
    left unchanged.

    The caller's ``input_ids`` tensor is not modified; a masked copy is
    returned. All helper tensors are created on ``input_ids.device``.

    Args:
        input_ids: Integer tensor of token ids.
        vocab_size: Exclusive upper bound for randomly drawn replacement ids.
        mask_token_id: Id written at positions chosen for mask replacement.
        mask_prob: Per-position selection probability.
        special_token_ids: Ids that are never selected for masking. The
            default matches the ids that used to be hard-coded here.

    Returns:
        A ``(masked_input_ids, labels)`` tuple. ``labels`` holds the original
        id at selected positions and ``-100`` everywhere else, so only the
        selected positions contribute to a loss that ignores ``-100``.
    """
    device = input_ids.device
    labels = input_ids.clone()
    masked_inputs = input_ids.clone()  # never write into the caller's tensor

    probability_matrix = torch.full(input_ids.shape, mask_prob, device=device)

    # Don't mask special tokens
    special_mask = torch.zeros_like(input_ids, dtype=torch.bool)
    for token_id in special_token_ids:
        special_mask |= input_ids == token_id
    probability_matrix.masked_fill_(special_mask, value=0.0)

    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # Only compute loss on masked tokens

    # 80% of the selected positions become the mask token.
    indices_replaced = (
        torch.bernoulli(torch.full(input_ids.shape, 0.8, device=device)).bool()
        & masked_indices
    )
    masked_inputs[indices_replaced] = mask_token_id

    # Half of the remaining 20% (10% overall) get a random token; the other
    # half keep their original id.
    indices_random = (
        torch.bernoulli(torch.full(input_ids.shape, 0.5, device=device)).bool()
        & masked_indices
        & ~indices_replaced
    )
    # Same dtype and device as the ids so the indexed assignment is valid.
    random_words = torch.randint(
        vocab_size, input_ids.shape, dtype=input_ids.dtype, device=device
    )
    masked_inputs[indices_random] = random_words[indices_random]

    return masked_inputs, labels

Perplexity

Perplexity measures how well a probability model predicts a sample. Lower perplexity indicates better modeling.

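Definition: Perplexity

$$\text{PPL}(x_1, \dots, x_N) = \exp\left(-\frac{1}{N} \sum_{i=1}^{N} \log P(x_i \mid x_{<i})\right)$$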

import math

def compute_perplexity(model, tokenizer, text):
    """Return ``exp(loss)`` for ``text`` under ``model``.

    Args:
        model: Callable invoked as ``model(input_ids, labels=input_ids)``
            whose result exposes a scalar ``.loss`` tensor (presumably the
            mean per-token negative log-likelihood, as in Hugging Face
            causal LMs; confirm for the model in use).
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            whose result exposes ``.input_ids``.
        text: The string to score.

    Returns:
        The exponential of the reported loss, as a Python float.
    """
    token_ids = tokenizer(text, return_tensors="pt").input_ids

    # Scoring only: no gradients are needed.
    with torch.no_grad():
        reported_loss = model(token_ids, labels=token_ids).loss

    return math.exp(reported_loss.item())

# Compare models
# NOTE(review): `tokenizer` is not defined anywhere in the visible source, and
# compute_perplexity calls model(input_ids, labels=...) and reads `.loss`,
# which the CausalLM instance bound to `model` above does not support.
# Presumably a Hugging Face-style model/tokenizer pair is expected here; confirm.
texts = [
    "The quick brown fox jumps over the lazy dog.",
    "Colorless green ideas sleep furiously."
]

# Print the perplexity of each sample next to a preview of its text.
for text in texts:
    ppl = compute_perplexity(model, tokenizer, text)
    # Only the first 30 characters are shown; "..." is appended unconditionally.
    print(f"PPL: {ppl:.2f} for '{text[:30]}...'")

Language Modeling Comparison

| Model | Type | Direction | Pre-training | Best For |
|---|---|---|---|---|
| GPT-2 | Causal | Left-to-right | LM | Generation |
| BERT | Masked | Bidirectional | MLM + NSP | Understanding |
| XLNet | Permutation | All orders | Permutation LM | Both |
| T5 | Prefix | Encoder-decoder | Span corruption | Multi-task |
| RoBERTa | Masked | Bidirectional | Dynamic MLM | Understanding |

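Perplexity is the inverse probability of the test set, normalized by the number of tokens (equivalently, the exponential of the average per-token negative log-likelihood). A perplexity of 100 means the model is, on average, as uncertain as if it were choosing uniformly among 100 equally likely tokens at each position.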

Temperature and Perplexity

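Definition: Perplexity with Temperature

$$P_T(x_i \mid x_{<i}) = \text{softmax}\left(\frac{z_i}{T}\right), \qquad \text{PPL}_T = \exp\left(-\frac{1}{N} \sum_{i=1}^{N} \log P_T(x_i \mid x_{<i})\right)$$

where $z_i$ are the model's logits at position $i$ and $T > 0$ is the temperature.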

def perplexity_with_temperature(model, tokenizer, text, temperature=1.0):
    """Return the perplexity of ``text`` after scaling logits by a temperature.

    The model's logits are divided by ``temperature``. The logits at each
    position are then scored against the token at the next position with
    cross-entropy, and the exponential of the mean loss is returned.

    Args:
        model: Callable invoked as ``model(input_ids)`` whose result exposes
            ``.logits``, indexed here as (batch, position, vocabulary).
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            whose result exposes ``.input_ids``.
        text: The string to score.
        temperature: Positive divisor applied to the logits.

    Returns:
        The perplexity as a Python float.

    Raises:
        ValueError: If ``temperature`` is not positive, or if ``text``
            tokenizes to fewer than two tokens, leaving no next-token
            target to score. Both cases used to return NaN silently.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    encodings = tokenizer(text, return_tensors="pt")
    input_ids = encodings.input_ids
    if input_ids.size(1) < 2:
        raise ValueError(
            "need at least 2 tokens to compute perplexity, "
            f"got {input_ids.size(1)}"
        )

    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits / temperature

    # Logits at position t are scored against the token at position t + 1.
    shift_logits = logits[:, :-1, :]
    shift_labels = input_ids[:, 1:]

    loss_fn = nn.CrossEntropyLoss()
    loss = loss_fn(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        shift_labels.reshape(-1),
    )

    return math.exp(loss.item())

Evaluation Metrics

| Metric | Description | Range | Good Value |
|---|---|---|---|
| Perplexity | Prediction uncertainty | [1, ∞) | < 50 |
| Bits per token | Information content | [0, ∞) | < 5 |
| Accuracy | Next token accuracy (%) | [0, 100] | > 60 |
| Top-5 accuracy | Correct in top 5 (%) | [0, 100] | > 80 |

Perplexity Calculation

⭐

Premium Content

Language Modeling

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯 End-to-end Projects
💼 Interview Prep
📜 Certificates
🤝 Community Access

Already a member? Log in

Need Expert NLP Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement