Text Summarization Models
Modern summarization leverages pre-trained transformer models fine-tuned for text summarization. Each architecture brings unique pre-training objectives tailored for generation tasks.
Model Comparison
| Model | Pre-training | Architecture | Max Length | Best For |
|---|---|---|---|---|
| BART | Denoising | Encoder-Decoder | 1024 | General summarization |
| T5 | Span corruption | Encoder-Decoder | 512 | Multi-task |
| PEGASUS | Gap sentences | Encoder-Decoder | 1024 | Summarization-specific |
| LED | Initialized from BART (Longformer attention) | Encoder-Decoder | 16384 | Long documents |
| ProphetNet | Future token prediction | Encoder-Decoder | 512 | Diverse generation |
| GRep | Generative-retrieval | Hybrid | 1024 | Open-domain |
BART for Summarization
BART (Bidirectional and Auto-Regressive Transformers) uses a denoising autoencoder pre-training objective.
Definition: BART Denoising Objective
BART corruption involves:
- Token deletion (a random fraction of tokens is removed)
- Text infilling (replace spans with [MASK])
- Sentence permutation
- Document rotation
from transformers import BartForConditionalGeneration, BartTokenizer
class BARTSummarizer:
    """Summarize text with a BART sequence-to-sequence checkpoint.

    The tokenizer and model are loaded once in the constructor and reused
    by every call to :meth:`summarize` and :meth:`summarize_batch`.
    """

    def __init__(self, model_name="facebook/bart-large-cnn"):
        """Load the tokenizer and model weights for ``model_name``."""
        self.tokenizer = BartTokenizer.from_pretrained(model_name)
        self.model = BartForConditionalGeneration.from_pretrained(model_name)

    def summarize(self, text, max_length=150, min_length=40, num_beams=4):
        """Return a summary of a single document.

        Args:
            text: The document to summarize. Input beyond 1024 tokens is
                truncated.
            max_length: Upper bound on the generated summary length.
            min_length: Lower bound on the generated summary length.
            num_beams: Beam width used during generation.

        Returns:
            The decoded summary string with special tokens removed.
        """
        inputs = self.tokenizer(
            text,
            max_length=1024,
            truncation=True,
            return_tensors="pt",
        )
        summary_ids = self.model.generate(
            inputs["input_ids"],
            max_length=max_length,
            min_length=min_length,
            num_beams=num_beams,
            length_penalty=2.0,
            no_repeat_ngram_size=3,
            early_stopping=True,
        )
        # generate() returns one sequence per input; there is only one here.
        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    def summarize_batch(self, texts, batch_size=8, max_length=150, num_beams=4):
        """Summarize many documents, ``batch_size`` at a time.

        Args:
            texts: Sequence of documents to summarize.
            batch_size: Number of documents tokenized and generated together.
            max_length: Upper bound on each generated summary length.
            num_beams: Beam width used during generation.

        Returns:
            A list of summary strings in the same order as ``texts``
            (empty when ``texts`` is empty).

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            # A negative step would make range() empty and silently drop
            # every document; fail loudly instead.
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        all_summaries = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            # Padding is required so the batch can be stacked into one tensor.
            inputs = self.tokenizer(
                batch,
                max_length=1024,
                truncation=True,
                padding=True,
                return_tensors="pt",
            )
            # **inputs forwards everything the tokenizer returned, so the
            # model receives whatever accompanies the padded input ids.
            summary_ids = self.model.generate(
                **inputs,
                max_length=max_length,
                num_beams=num_beams,
            )
            all_summaries.extend(
                self.tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
            )
        return all_summaries
# Build a summarizer with the default checkpoint ("facebook/bart-large-cnn");
# the constructor loads both the tokenizer and the model.
summarizer = BARTSummarizer()
T5 for Summarization
T5 (Text-to-Text Transfer Transformer) frames all NLP tasks as text-to-text.
from transformers import T5ForConditionalGeneration, T5Tokenizer
class T5Summarizer:
    """Summarize text with a T5 checkpoint via the ``summarize:`` task prefix."""

    def __init__(self, model_name="t5-small"):
        """Load the tokenizer and model weights for ``model_name``."""
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

    def summarize(self, text, max_length=150):
        """Return a summary of ``text`` no longer than ``max_length`` tokens.

        The prompt (task prefix included) is truncated to 512 tokens before
        generation.
        """
        # T5 is told which task to perform through a textual prefix.
        prompt = f"summarize: {text}"
        encoded = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=512,
        )
        generation_options = {
            "max_length": max_length,
            "num_beams": 4,
            "length_penalty": 2.0,
            "early_stopping": True,
        }
        generated = self.model.generate(**encoded, **generation_options)
        best_sequence = generated[0]
        return self.tokenizer.decode(best_sequence, skip_special_tokens=True)
# Example usage: build a summarizer with the default "t5-small" checkpoint
# and print the summary of one document.
t5 = T5Summarizer()
print(t5.summarize("Your long article text here..."))
# Other T5 tasks with prefixes:
# Translation: "translate English to French: ..."
# Classification: "sst2 sentence: ..."
# Question answering: "question: ... context: ..."
PEGASUS for Summarization
PEGASUS is pre-trained with Gap Sentences Generation (GSG) — masking entire sentences that are important.
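Definition: PEGASUS Pre-training Objective
$$\mathcal{L}_{\text{GSG}} = -\sum_{s \in G} \log P\left(s \mid D \setminus G\right)$$
Where $D$ is the input document and $G$ is the set of gap sentences (important sentences selected by ROUGE against remaining text).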
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
class PegasusSummarizer:
    """Summarize text with a PEGASUS checkpoint."""

    def __init__(self, model_name="google/pegasus-xsum"):
        """Load the tokenizer and model weights for ``model_name``."""
        self.tokenizer = PegasusTokenizer.from_pretrained(model_name)
        self.model = PegasusForConditionalGeneration.from_pretrained(model_name)

    def summarize(self, text, max_length=128):
        """Return a summary of ``text`` no longer than ``max_length`` tokens.

        The input is truncated to 1024 tokens before generation.
        """
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=1024,
        )
        generation_options = {
            "max_length": max_length,
            "num_beams": 8,
            "length_penalty": 0.6,
            "early_stopping": True,
        }
        generated = self.model.generate(**encoded, **generation_options)
        best_sequence = generated[0]
        return self.tokenizer.decode(best_sequence, skip_special_tokens=True)
# Load two PEGASUS checkpoints; going by their names, one targets XSum and
# the other CNN/DailyMail.
# XSum is more abstractive, CNN/DM is more extractive
pegasus_xsum = PegasusSummarizer("google/pegasus-xsum")
pegasus_cnn = PegasusSummarizer("google/pegasus-cnn-dailymail")
Long Document Summarization
import torch
from transformers import LEDForConditionalGeneration, LEDTokenizer
class LongDocSummarizer:
    """Summarize long documents with the "allenai/led-base-16384" checkpoint.

    LED (Longformer Encoder-Decoder) is exposed in ``transformers`` through
    ``LEDForConditionalGeneration`` and ``LEDTokenizer``; there is no
    ``LongformerForConditionalGeneration`` class.
    """

    def __init__(self):
        """Load the LED tokenizer and model weights."""
        self.tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")
        self.model = LEDForConditionalGeneration.from_pretrained(
            "allenai/led-base-16384"
        )

    def summarize(self, text, max_length=256):
        """Return a summary of ``text`` no longer than ``max_length`` tokens.

        Args:
            text: The document to summarize. Input beyond 16384 tokens is
                truncated.
            max_length: Upper bound on the generated summary length.

        Returns:
            The decoded summary string with special tokens removed.
        """
        inputs = self.tokenizer(
            text,
            max_length=16384,
            truncation=True,
            return_tensors="pt",
        )
        # Global attention only on the first position of each sequence;
        # every other position keeps a zero (non-global) mask entry.
        global_attention_mask = torch.zeros_like(inputs["input_ids"])
        global_attention_mask[:, 0] = 1
        outputs = self.model.generate(
            **inputs,
            global_attention_mask=global_attention_mask,
            max_length=max_length,
            num_beams=4,
        )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
Fine-Tuning for Summarization
from datasets import load_dataset
from transformers import (
AutoModelForSeq2SeqLM,
AutoTokenizer,
Seq2SeqTrainingArguments,
Seq2SeqTrainer,
DataCollatorForSeq2Seq,
)
def fine_tune_summarization(
    model_name,
    dataset_name="cnn_dailymail",
    dataset_config="3.0.0",
    text_column="article",
    summary_column="highlights",
    max_input_length=1024,
    max_target_length=128,
):
    """Fine-tune a seq2seq checkpoint on a summarization dataset.

    Args:
        model_name: Checkpoint passed to ``AutoModelForSeq2SeqLM`` and
            ``AutoTokenizer``.
        dataset_name: Dataset passed to ``load_dataset``.
        dataset_config: Dataset configuration name passed to ``load_dataset``.
        text_column: Dataset column holding the source documents.
        summary_column: Dataset column holding the reference summaries.
        max_input_length: Token limit for source documents (longer inputs
            are truncated).
        max_target_length: Token limit for reference summaries (longer
            targets are truncated).

    Returns:
        The fine-tuned model. Checkpoints are written to
        ``./summarization_model`` and logs to ``./logs``.
    """
    # Local import: torch is only needed here to probe for a CUDA device.
    import torch

    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    dataset = load_dataset(dataset_name, dataset_config)

    # Only T5-family models take a textual task prefix; for other
    # architectures it just spends input tokens.
    model_type = getattr(model.config, "model_type", "") or ""
    prefix = "summarize: " if "t5" in model_type else ""

    def preprocess(examples):
        """Tokenize a batch of documents and attach summary ids as labels."""
        inputs = [prefix + doc for doc in examples[text_column]]
        model_inputs = tokenizer(
            inputs, max_length=max_input_length, truncation=True
        )
        labels = tokenizer(
            examples[summary_column],
            max_length=max_target_length,
            truncation=True,
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized = dataset.map(preprocess, batched=True)

    training_args = Seq2SeqTrainingArguments(
        output_dir="./summarization_model",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        predict_with_generate=True,
        # Mixed precision needs a CUDA device; fall back to full precision
        # on CPU-only machines instead of failing at startup.
        fp16=torch.cuda.is_available(),
    )

    # No padding was applied in preprocess(); the collator is responsible
    # for batching the variable-length examples.
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized["train"],
        eval_dataset=tokenized["validation"],
        data_collator=data_collator,
    )
    trainer.train()
    return model
PEGASUS often achieves strong ROUGE scores on summarization benchmarks because its pre-training objective (gap sentence generation) directly mimics the summarization task. BART excels at abstractive summarization due to its flexible denoising objective.