Advanced NLP with Transformers
Transformers revolutionized NLP. Learn the architecture, attention mechanisms, and practical techniques for fine-tuning, prompting, and retrieval-augmented generation.
Transformer Architecture
Self-Attention from Scratch
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        embed_dim: Size of the last dimension of the input and output.
        num_heads: Number of attention heads; must divide ``embed_dim``.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``embed_dim`` evenly.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        # Raise explicitly instead of asserting: asserts are stripped
        # when Python runs with -O, which would let bad sizes through.
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.q_linear = nn.Linear(embed_dim, embed_dim)
        self.k_linear = nn.Linear(embed_dim, embed_dim)
        self.v_linear = nn.Linear(embed_dim, embed_dim)
        self.out_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, embed_dim)``.
            mask: Optional tensor broadcastable to
                ``(batch_size, num_heads, seq_len, seq_len)``. Positions
                where ``mask == 0`` receive no attention weight.

        Returns:
            Tensor of shape ``(batch_size, seq_len, embed_dim)``.
        """
        batch_size, seq_len, _ = x.size()

        def split_heads(projected):
            # (batch, seq, embed) -> (batch, heads, seq, head_dim)
            return projected.view(
                batch_size, seq_len, self.num_heads, self.head_dim
            ).transpose(1, 2)

        Q = split_heads(self.q_linear(x))
        K = split_heads(self.k_linear(x))
        V = split_heads(self.v_linear(x))

        # Scaled dot-product attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

        blocked = None
        if mask is not None:
            blocked = mask == 0
            # Use the dtype's most negative finite value rather than -inf:
            # a row that is entirely masked would otherwise softmax to NaN.
            scores = scores.masked_fill(blocked, torch.finfo(scores.dtype).min)

        attention = F.softmax(scores, dim=-1)
        if blocked is not None:
            # Guarantee masked positions contribute exactly nothing, even
            # for fully masked rows (which softmax would make uniform).
            attention = attention.masked_fill(blocked, 0.0)

        context = torch.matmul(attention, V)

        # Concatenate heads
        context = context.transpose(1, 2).contiguous().view(
            batch_size, seq_len, self.embed_dim
        )
        return self.out_linear(context)
class TransformerBlock(nn.Module):
    """Self-attention followed by a feed-forward network.

    Each sub-layer is wrapped in a residual connection, and LayerNorm is
    applied after the residual sum.

    Args:
        embed_dim: Feature size of the input and output.
        num_heads: Number of heads passed to ``SelfAttention``.
        ff_dim: Hidden width of the feed-forward network.
        dropout: Dropout probability used after attention and inside the
            feed-forward network.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attention = SelfAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

        # Expand to ff_dim, apply ReLU, project back to embed_dim.
        feed_forward_layers = [
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim, embed_dim),
            nn.Dropout(dropout),
        ]
        self.ff = nn.Sequential(*feed_forward_layers)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is forwarded to the attention layer."""
        # Attention sub-layer: residual add, then normalise.
        attended = self.dropout(self.attention(x, mask))
        hidden = self.norm1(x + attended)

        # Feed-forward sub-layer: residual add, then normalise.
        return self.norm2(hidden + self.ff(hidden))
# Test: run one forward pass on random input and print both shapes.
batch_size, seq_len, embed_dim = 2, 10, 512
block = TransformerBlock(embed_dim=embed_dim, num_heads=8, ff_dim=2048)
x = torch.randn(batch_size, seq_len, embed_dim)
out = block(x)
print(f"Input shape: {x.shape}, Output shape: {out.shape}")
Fine-Tuning with Hugging Face
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
TrainingArguments,
Trainer
)
from datasets import load_dataset
import numpy as np
def fine_tune_classifier():
    """Fine-tune ``bert-base-uncased`` as a two-label classifier on IMDB.

    Loads the model, tokenizer and dataset, tokenizes every example to a
    fixed length of 512, trains for three epochs with ``Trainer``, and
    evaluates accuracy on the ``test`` split after each epoch.

    Returns:
        tuple: ``(model, tokenizer)`` after training.
    """
    import inspect

    import torch

    # Load model and tokenizer
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=2
    )

    # Load and preprocess dataset
    dataset = load_dataset("imdb")

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
        )

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Newer transformers releases renamed ``evaluation_strategy`` to
    # ``eval_strategy``; use whichever keyword this version accepts.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = (
        "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        save_strategy="epoch",
        load_best_model_at_end=True,
        # Mixed precision needs a CUDA device; requesting it on a
        # CPU-only machine makes TrainingArguments raise.
        fp16=torch.cuda.is_available(),
        **{eval_key: "epoch"},
    )

    # Metrics
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        # Plain float keeps the logged metric trivially serialisable.
        return {"accuracy": float((predictions == labels).mean())}

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        compute_metrics=compute_metrics,
    )
    trainer.train()

    return model, tokenizer
# LoRA fine-tuning for efficiency
from peft import LoraConfig, get_peft_model, TaskType
def lora_fine_tune(model):
    """Wrap ``model`` with LoRA adapters for sequence classification.

    Builds a ``LoraConfig`` (rank 8, alpha 32, dropout 0.1) targeting the
    modules named ``query`` and ``value``, applies it with
    ``get_peft_model``, and calls ``print_trainable_parameters`` on the
    result.

    Args:
        model: The base model to adapt.

    Returns:
        The model returned by ``get_peft_model``.
    """
    adapter_settings = {
        "task_type": TaskType.SEQ_CLS,
        "r": 8,
        "lora_alpha": 32,
        "lora_dropout": 0.1,
        "target_modules": ["query", "value"],
    }
    adapted = get_peft_model(model, LoraConfig(**adapter_settings))
    adapted.print_trainable_parameters()
    return adapted
Prompt Engineering
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
class PromptEngine:
    """Build zero-shot, few-shot and step-by-step prompts for a causal LM.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
    """

    def __init__(self, model_name="gpt2"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.model.eval()

    def zero_shot(self, task, input_text):
        """Prompt with the task description and a single input."""
        return self._generate(f"{task}\n\nInput: {input_text}\nOutput:")

    def few_shot(self, task, examples, input_text):
        """Prompt with the task, worked examples, then the new input.

        Each item of ``examples`` is a mapping with ``"input"`` and
        ``"output"`` keys.
        """
        demonstrations = "\n".join(
            f"Input: {demo['input']}\nOutput: {demo['output']}"
            for demo in examples
        )
        return self._generate(
            f"{task}\n\n{demonstrations}\n\nInput: {input_text}\nOutput:"
        )

    def chain_of_thought(self, question):
        """Prompt for step-by-step reasoning, allowing up to 300 new tokens."""
        prompt_lines = [
            "Let's think step by step.",
            f"Question: {question}",
            "Step 1:",
        ]
        return self._generate("\n".join(prompt_lines), max_new_tokens=300)

    def _generate(self, prompt, max_new_tokens=100):
        """Sample from the model and return the decoded first sequence."""
        encoded = self.tokenizer(prompt, return_tensors="pt")
        sampling = {
            "max_new_tokens": max_new_tokens,
            "temperature": 0.7,
            "do_sample": True,
            "top_p": 0.9,
        }
        with torch.no_grad():
            generated = self.model.generate(**encoded, **sampling)
        return self.tokenizer.decode(generated[0], skip_special_tokens=True)
# Few-shot example for classification: three labelled reviews, then one
# unlabelled review for the model to complete.
engine = PromptEngine()
examples = [
    {"input": review, "output": label}
    for review, label in [
        ("This movie is fantastic!", "Positive"),
        ("Terrible waste of time.", "Negative"),
        ("An absolute masterpiece.", "Positive"),
    ]
]
result = engine.few_shot(
    task="Classify the sentiment of the review as Positive or Negative.",
    examples=examples,
    input_text="The acting was wooden and the plot predictable.",
)
print(result)
RAG (Retrieval-Augmented Generation)
import numpy as np
from typing import List, Dict
import torch
class SimpleRAG:
    """Minimal retrieval-augmented generation over an in-memory store.

    Documents are embedded when added, ranked against a query by cosine
    similarity, and the best matches are placed in a prompt for the
    generation model.

    Args:
        embedding_model: Stored as ``self.embedder``. NOTE: no method of
            this class uses it; ``_encode`` derives embeddings from
            ``generation_model`` instead.
        generation_model: Called as
            ``model(**inputs, output_hidden_states=True)`` for embeddings
            and via ``model.generate(...)`` for answers.
        tokenizer: Called on text with ``return_tensors="pt"`` and must
            provide ``decode``.
    """

    def __init__(self, embedding_model, generation_model, tokenizer):
        self.embedder = embedding_model
        self.generator = generation_model
        self.tokenizer = tokenizer
        self.documents: List[Dict] = []
        # One row per entry in ``self.documents``; None until the first add.
        self.embeddings: "np.ndarray | None" = None

    def add_documents(self, documents: List[Dict]):
        """Add documents to the knowledge base.

        Args:
            documents: Mappings with a ``"text"`` key and an optional
                ``"metadata"`` key. An empty list is a no-op.
        """
        documents = list(documents)
        if not documents:
            return

        # Encode before touching state so a failure cannot leave
        # ``documents`` and ``embeddings`` with different lengths.
        new_embeddings = self._encode([doc["text"] for doc in documents])

        if self.embeddings is None:
            self.embeddings = new_embeddings
        else:
            self.embeddings = np.vstack([self.embeddings, new_embeddings])
        self.documents.extend(documents)

    def _encode(self, texts: List[str]) -> np.ndarray:
        """Encode texts to embeddings, one row per text.

        Takes the last entry of the model's ``hidden_states`` at index 0
        along dim 1 (presumably the first token of the sequence).
        """
        # Use a sentence transformer in practice
        vectors = []
        for text in texts:
            inputs = self.tokenizer(text, return_tensors="pt",
                                    truncation=True, max_length=512)
            with torch.no_grad():
                output = self.generator(**inputs, output_hidden_states=True)
            first_position = output.hidden_states[-1][:, 0, :]
            # ``.cpu()`` keeps this working when the model is not on CPU;
            # ``reshape(-1)`` always yields a 1-D vector.
            vectors.append(first_position.detach().cpu().numpy().reshape(-1))
        return np.array(vectors)

    @staticmethod
    def _unit_rows(matrix: np.ndarray) -> np.ndarray:
        """Scale each row to unit L2 norm, leaving all-zero rows as zeros."""
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        return matrix / np.maximum(norms, 1e-12)

    def retrieve(self, query: str, top_k: int = 3) -> List[Dict]:
        """Retrieve the documents most similar to ``query``.

        Args:
            query: Text to search for.
            top_k: Maximum number of results; non-positive values return
                an empty list.

        Returns:
            Up to ``top_k`` dicts with ``"text"``, ``"score"`` (cosine
            similarity) and ``"metadata"``, best match first. Empty when
            no documents have been added.
        """
        if self.embeddings is None or not self.documents or top_k <= 0:
            return []

        query_embedding = self._encode([query])

        # Cosine similarity: dot product of L2-normalised vectors.
        doc_units = self._unit_rows(np.atleast_2d(self.embeddings))
        query_unit = self._unit_rows(np.atleast_2d(query_embedding))[0]
        # Keep this 1-D so a single stored document does not collapse the
        # result to a scalar.
        similarities = (doc_units @ query_unit).reshape(-1)

        top_indices = np.argsort(similarities)[::-1][:top_k]

        return [
            {
                "text": self.documents[int(i)]["text"],
                "score": float(similarities[i]),
                "metadata": self.documents[int(i)].get("metadata", {}),
            }
            for i in top_indices
        ]

    def generate(self, query: str, top_k: int = 3) -> str:
        """Generate an answer with retrieved context.

        Returns:
            The decoded first output sequence of the generation model.
        """
        retrieved = self.retrieve(query, top_k)
        context = "\n\n".join(doc["text"] for doc in retrieved)

        prompt = (
            "Answer the question based on the context below.\n"
            "Context:\n"
            f"{context}\n"
            f"Question: {query}\n"
            "Answer:"
        )

        inputs = self.tokenizer(prompt, return_tensors="pt",
                                truncation=True, max_length=2048)
        with torch.no_grad():
            outputs = self.generator.generate(
                **inputs,
                max_new_tokens=200,
                temperature=0.3,
                # ``temperature`` only takes effect in sampling mode.
                do_sample=True,
            )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
# Usage
# SimpleRAG calls its tokenizer and generation model directly, so real
# objects are needed here: passing None makes add_documents raise
# TypeError. embedding_model may stay None because the class never uses it.
rag_tokenizer = AutoTokenizer.from_pretrained("gpt2")
rag_generator = AutoModelForCausalLM.from_pretrained("gpt2")
rag_generator.eval()

rag = SimpleRAG(
    embedding_model=None,
    generation_model=rag_generator,
    tokenizer=rag_tokenizer,
)
rag.add_documents([
    {"text": "Machine learning is a subset of AI...", "metadata": {"source": "wiki"}},
    {"text": "Deep learning uses neural networks...", "metadata": {"source": "textbook"}},
])
answer = rag.generate("What is machine learning?")
Best Practices
- Start with pre-trained models — fine-tune rather than train from scratch
- Use LoRA/QLoRA for efficient fine-tuning on limited compute
- Prompt engineering is often cheaper and faster than fine-tuning
- RAG combines the best of retrieval and generation
- Evaluate with human judgment — automated metrics don't capture everything