Advanced NLP with Transformers

Transformers revolutionized NLP. Learn the architecture, attention mechanisms, and practical techniques for fine-tuning, prompting, and retrieval-augmented generation.

Transformer Architecture

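$$\text{Attention}(Q,K,V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$

Here $Q$, $K$ and $V$ are the query, key and value matrices and $d_k$ is the dimension of each key vector; dividing by $\sqrt{d_k}$ keeps the dot products from growing with the dimension and saturating the softmax.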

Self-Attention from Scratch

import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class SelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values, split into
    ``num_heads`` heads of size ``embed_dim // num_heads``, combined per head
    with ``softmax(Q K^T / sqrt(head_dim)) V`` and finally mixed by an output
    projection.

    Args:
        embed_dim: Size of the last dimension of the input and the output.
        num_heads: Number of attention heads; must be positive and divide
            ``embed_dim`` evenly.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``embed_dim``.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        # Raise rather than assert: asserts are stripped under ``python -O``,
        # which would defer the failure to a confusing ``view`` error later.
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.q_linear = nn.Linear(embed_dim, embed_dim)
        self.k_linear = nn.Linear(embed_dim, embed_dim)
        self.v_linear = nn.Linear(embed_dim, embed_dim)
        self.out_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch, seq_len, embed_dim)``.
            mask: Optional tensor broadcastable to
                ``(batch, num_heads, seq_len, seq_len)``. Positions where
                ``mask == 0`` are excluded from attention.

        Returns:
            Tensor of shape ``(batch, seq_len, embed_dim)``.
        """
        batch_size, seq_len, _ = x.size()

        def split_heads(projected):
            # (batch, seq_len, embed_dim) -> (batch, num_heads, seq_len, head_dim)
            return projected.view(
                batch_size, seq_len, self.num_heads, self.head_dim
            ).transpose(1, 2)

        # Linear projections and reshape
        Q = split_heads(self.q_linear(x))
        K = split_heads(self.k_linear(x))
        V = split_heads(self.v_linear(x))

        # Scaled dot-product attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is not None:
            # Use the most negative finite value instead of -inf: masked keys
            # still get exactly zero weight, but a row whose keys are ALL
            # masked yields uniform weights instead of NaN (softmax over a row
            # of -inf is NaN and would poison the output and the gradients).
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        attention = F.softmax(scores, dim=-1)
        context = torch.matmul(attention, V)

        # Concatenate heads
        context = context.transpose(1, 2).contiguous().view(
            batch_size, seq_len, self.embed_dim
        )

        return self.out_linear(context)

class TransformerBlock(nn.Module):
    """Encoder-style block: self-attention then a feed-forward network.

    Each sub-layer is wrapped in a residual connection, and LayerNorm is
    applied after the residual addition.

    Args:
        embed_dim: Feature size of the input and output.
        num_heads: Number of heads passed to ``SelfAttention``.
        ff_dim: Hidden size of the feed-forward network.
        dropout: Dropout probability used after attention and inside the
            feed-forward network.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attention = SelfAttention(embed_dim, num_heads)
        self.norm1, self.norm2 = nn.LayerNorm(embed_dim), nn.LayerNorm(embed_dim)

        # Expand to ff_dim, apply the non-linearity, project back to embed_dim.
        feed_forward_layers = [
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim, embed_dim),
            nn.Dropout(dropout),
        ]
        self.ff = nn.Sequential(*feed_forward_layers)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is forwarded to the attention layer."""
        # Attention sub-layer: residual add, then normalise.
        attended = self.dropout(self.attention(x, mask))
        hidden = self.norm1(x + attended)

        # Feed-forward sub-layer: residual add, then normalise.
        return self.norm2(hidden + self.ff(hidden))

# Smoke test: push random data through one block and compare the shapes
block = TransformerBlock(512, 8, ff_dim=2048)
x = torch.randn((2, 10, 512))  # (batch, seq_len, embed_dim) = (2, 10, 512)
out = block(x)
print(f"Input shape: {x.shape}, Output shape: {out.shape}")

Fine-Tuning with Hugging Face

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
import numpy as np

def fine_tune_classifier():
    """Fine-tune ``bert-base-uncased`` as a two-label classifier on IMDB.

    Loads the model and tokenizer, tokenizes the ``imdb`` dataset to
    fixed-length 512-token inputs, and trains for three epochs with the
    Hugging Face ``Trainer``. Evaluation and checkpointing run once per
    epoch, with ``load_best_model_at_end=True``.

    Returns:
        tuple: ``(model, tokenizer)`` after ``trainer.train()`` has finished.
    """
    import inspect

    import torch

    # Load model and tokenizer
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=2
    )

    # Load and preprocess dataset
    dataset = load_dataset("imdb")

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
        )

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # fp16 mixed precision needs GPU support; requesting it unconditionally
    # makes TrainingArguments reject the configuration on CPU-only machines.
    use_fp16 = torch.cuda.is_available()

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; pass whichever keyword the installed version accepts.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        save_strategy="epoch",
        load_best_model_at_end=True,
        fp16=use_fp16,
        **{eval_key: "epoch"},
    )

    # Metrics
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        accuracy = (predictions == labels).mean()
        return {"accuracy": accuracy}

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        compute_metrics=compute_metrics,
    )

    trainer.train()

    return model, tokenizer

# LoRA fine-tuning for efficiency
from peft import LoraConfig, get_peft_model, TaskType

def lora_fine_tune(model):
    """Wrap ``model`` with LoRA adapters for sequence classification.

    The adapters are attached to the modules named ``query`` and ``value``.
    The trainable-parameter summary is printed before the wrapped model is
    returned.
    """
    adapter_settings = {
        "task_type": TaskType.SEQ_CLS,
        "r": 8,
        "lora_alpha": 32,
        "lora_dropout": 0.1,
        "target_modules": ["query", "value"],
    }

    peft_model = get_peft_model(model, LoraConfig(**adapter_settings))
    peft_model.print_trainable_parameters()
    return peft_model

Prompt Engineering

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

class PromptEngine:
    """Prompt-building helpers around a causal language model.

    Each public method formats a prompt (zero-shot, few-shot or
    chain-of-thought) and returns the decoded text of the first generated
    sequence.
    """

    def __init__(self, model_name="gpt2"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        language_model = AutoModelForCausalLM.from_pretrained(model_name)
        language_model.eval()
        self.model = language_model

    def zero_shot(self, task, input_text):
        """Generate from a task description plus a single input."""
        return self._generate(f"{task}\n\nInput: {input_text}\nOutput:")

    def few_shot(self, task, examples, input_text):
        """Generate from a task description, worked examples and a new input.

        ``examples`` is an iterable of dicts with ``"input"`` and ``"output"``
        keys.
        """
        shots = []
        for ex in examples:
            shots.append(f"Input: {ex['input']}\nOutput: {ex['output']}")
        example_text = "\n".join(shots)

        return self._generate(
            f"{task}\n\n{example_text}\n\nInput: {input_text}\nOutput:"
        )

    def chain_of_thought(self, question):
        """Generate step-by-step reasoning for ``question`` (up to 300 new tokens)."""
        prompt = (
            "Let's think step by step.\n"
            "\n"
            f"Question: {question}\n"
            "\n"
            "Step 1:"
        )
        return self._generate(prompt, max_new_tokens=300)

    def _generate(self, prompt, max_new_tokens=100):
        """Tokenize ``prompt``, sample a continuation and decode it."""
        encoded = self.tokenizer(prompt, return_tensors="pt")
        sampling_options = {
            "max_new_tokens": max_new_tokens,
            "temperature": 0.7,
            "do_sample": True,
            "top_p": 0.9,
        }

        with torch.no_grad():
            sequences = self.model.generate(**encoded, **sampling_options)

        first_sequence = sequences[0]
        return self.tokenizer.decode(first_sequence, skip_special_tokens=True)

# Few-shot sentiment classification: three labelled reviews, one new review
engine = PromptEngine()

examples = [
    {"input": review, "output": label}
    for review, label in (
        ("This movie is fantastic!", "Positive"),
        ("Terrible waste of time.", "Negative"),
        ("An absolute masterpiece.", "Positive"),
    )
]

result = engine.few_shot(
    task="Classify the sentiment of the review as Positive or Negative.",
    examples=examples,
    input_text="The acting was wooden and the plot predictable.",
)
print(result)

RAG (Retrieval-Augmented Generation)

import numpy as np
from typing import List, Dict
import torch

class SimpleRAG:
    """Minimal in-memory retrieval-augmented generation pipeline.

    Documents are embedded, ranked against a query by cosine similarity, and
    the best matches are pasted into a prompt for the generation model.

    Args:
        embedding_model: Stored as ``self.embedder``. NOTE: the encoder below
            does not use it; embeddings are read from ``generation_model``'s
            hidden states.
        generation_model: Model that can be called with the tokenizer output
            plus ``output_hidden_states=True`` and that exposes ``generate``.
        tokenizer: Callable tokenizer that also provides ``decode``.
    """

    def __init__(self, embedding_model, generation_model, tokenizer):
        self.embedder = embedding_model
        self.generator = generation_model
        self.tokenizer = tokenizer
        self.documents: List[Dict] = []
        # Raw (un-normalised) embeddings, one row per document; stays None
        # until the first document has been added.
        self.embeddings = None

    def add_documents(self, documents: List[Dict]):
        """Add documents to the knowledge base.

        Args:
            documents: Dicts with a ``"text"`` key and an optional
                ``"metadata"`` key. An empty collection is a no-op.
        """
        documents = list(documents)
        if not documents:
            return

        texts = [doc["text"] for doc in documents]
        # Encode before touching any state so a failure cannot leave
        # ``documents`` and ``embeddings`` out of sync.
        new_embeddings = np.asarray(self._encode(texts)).reshape(len(texts), -1)

        if self.embeddings is None:
            self.embeddings = new_embeddings
        else:
            self.embeddings = np.vstack([self.embeddings, new_embeddings])
        self.documents.extend(documents)

    def _encode(self, texts: List[str]) -> np.ndarray:
        """Encode texts to embeddings, one row per text."""
        # Use a sentence transformer in practice
        embeddings = []
        for text in texts:
            inputs = self.tokenizer(text, return_tensors="pt",
                                    truncation=True, max_length=512)
            with torch.no_grad():
                output = self.generator(**inputs, output_hidden_states=True)
            # Last layer, first (only) sequence, first token -> 1-D vector.
            first_token = output.hidden_states[-1][0, 0, :]
            # ``.cpu()`` is a no-op on CPU and is required before ``.numpy()``
            # when the tensor lives on another device.
            embeddings.append(first_token.cpu().numpy())
        return np.array(embeddings)

    @staticmethod
    def _cosine_similarity(matrix: np.ndarray, vector: np.ndarray) -> np.ndarray:
        """Return the cosine similarity between each row of ``matrix`` and ``vector``."""
        dots = matrix @ vector
        norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vector)
        # A zero-length vector has a zero dot product too, so dividing by 1
        # gives a similarity of 0 instead of NaN.
        return dots / np.where(norms > 0, norms, 1.0)

    def retrieve(self, query: str, top_k: int = 3) -> List[Dict]:
        """Retrieve the ``top_k`` most similar documents, best first.

        Returns:
            A list of dicts with ``"text"``, ``"score"`` (cosine similarity)
            and ``"metadata"``. Empty when there are no documents or
            ``top_k`` is not positive.
        """
        if self.embeddings is None or not self.documents or top_k <= 0:
            return []

        query_vector = np.asarray(self._encode([query])).reshape(-1)
        document_matrix = np.atleast_2d(self.embeddings)

        # Cosine similarity. The result is always 1-D, so indexing works even
        # with a single stored document.
        similarities = self._cosine_similarity(document_matrix, query_vector)
        top_indices = np.argsort(similarities)[::-1][:top_k]

        return [
            {
                "text": self.documents[i]["text"],
                "score": float(similarities[i]),
                "metadata": self.documents[i].get("metadata", {}),
            }
            for i in top_indices
        ]

    def generate(self, query: str, top_k: int = 3) -> str:
        """Generate answer with retrieved context.

        Builds a prompt from the ``top_k`` retrieved texts and the query,
        then returns the decoded first sequence produced by the generator.
        """
        retrieved = self.retrieve(query, top_k)

        context = "\n\n".join([doc["text"] for doc in retrieved])

        prompt = f"""Answer the question based on the context below.

Context:
{context}

Question: {query}
Answer:"""

        inputs = self.tokenizer(prompt, return_tensors="pt",
                                truncation=True, max_length=2048)

        with torch.no_grad():
            outputs = self.generator.generate(
                **inputs,
                max_new_tokens=200,
                # temperature only takes effect when sampling is enabled
                do_sample=True,
                temperature=0.3
            )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

# Usage
# SimpleRAG calls the tokenizer and the generation model as soon as documents
# are added, so real objects are required here; None placeholders fail with
# "TypeError: 'NoneType' object is not callable".
from transformers import AutoModelForCausalLM, AutoTokenizer

rag_tokenizer = AutoTokenizer.from_pretrained("gpt2")
rag_model = AutoModelForCausalLM.from_pretrained("gpt2")
rag_model.eval()

# embedding_model stays None: the encoder reads hidden states from the
# generation model and never touches the embedder.
rag = SimpleRAG(
    embedding_model=None,
    generation_model=rag_model,
    tokenizer=rag_tokenizer,
)
rag.add_documents([
    {"text": "Machine learning is a subset of AI...", "metadata": {"source": "wiki"}},
    {"text": "Deep learning uses neural networks...", "metadata": {"source": "textbook"}}
])

answer = rag.generate("What is machine learning?")

Best Practices

Start with pre-trained models – fine-tune rather than train from scratch
Use LoRA/QLoRA for efficient fine-tuning on limited compute
Prompt engineering is often cheaper and faster than fine-tuning
RAG combines the best of retrieval and generation
Evaluate with human judgment – automated metrics don't capture everything