The Evaluation Challenge
Evaluating LLMs is fundamentally different from evaluating traditional ML models. There is no single "correct" output for most prompts, making automated evaluation a complex, multi-dimensional problem.
Evaluation Taxonomy
1. Reference-Based Metrics
These compare model output against a reference answer using predefined formulas.
Definition: BLEU Score
BLEU measures n-gram overlap between generated and reference text:
$$\text{BLEU} = BP \cdot \exp\left(\sum_{n=1}^{N} w_n \log p_n\right)$$
Where BP is the brevity penalty, p_n is the modified n-gram precision, and w_n is the weight (typically 1/N).
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import sacrebleu
class ReferenceMetrics:
    """Surface-overlap metrics that score a hypothesis against reference text.

    Wraps NLTK sentence-level BLEU, ROUGE-1/2/L F-measures, and sacreBLEU.
    """

    def __init__(self):
        self.rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        # Smoothing avoids a hard zero when a short sentence has no matching
        # higher-order n-grams.
        self.smoothing = SmoothingFunction().method1

    def bleu(self, reference: str, hypothesis: str) -> float:
        """Return smoothed sentence-level BLEU for one reference/hypothesis pair.

        Both texts are tokenized by splitting on whitespace.
        """
        ref_tokens = reference.split()
        hyp_tokens = hypothesis.split()
        return sentence_bleu([ref_tokens], hyp_tokens, smoothing_function=self.smoothing)

    def rouge_scores(self, reference: str, hypothesis: str) -> dict:
        """Return a mapping of ROUGE variant name to its F-measure."""
        scores = self.rouge.score(reference, hypothesis)
        return {name: score.fmeasure for name, score in scores.items()}

    @staticmethod
    def _reference_streams(references) -> list:
        """Lay out the references of ONE hypothesis as sacreBLEU reference streams.

        sacreBLEU takes a list of streams, one per reference, and every stream
        must hold exactly one entry per hypothesis. For a single hypothesis
        with k references that means k streams of length 1.
        """
        if isinstance(references, str):
            # Tolerate a bare string instead of a one-element list.
            references = [references]
        return [[ref] for ref in references]

    def sacrebleu(self, references: list[str], hypothesis: str) -> float:
        """Return the sacreBLEU score of one hypothesis against its references.

        Args:
            references: Alternative reference texts for the same hypothesis.
            hypothesis: The generated text to score.
        """
        # Passing `[references]` would be a single stream of k segments against
        # one hypothesis, which is only valid when k == 1.
        streams = self._reference_streams(references)
        return sacrebleu.corpus_bleu([hypothesis], streams).score
# Usage: score one hypothesis against its reference.
metrics = ReferenceMetrics()
ref = "The model achieved 95% accuracy on the benchmark."
hyp = "The model reached 95% accuracy on the benchmark test."

bleu_value = metrics.bleu(ref, hyp)
print(f"BLEU: {bleu_value:.3f}")

rouge_values = metrics.rouge_scores(ref, hyp)
print(f"ROUGE: {rouge_values}")
2. Embedding-Based Metrics
Semantic similarity using sentence embeddings captures meaning rather than surface-level overlap.
Definition: Cosine Similarity
Semantic similarity between reference and hypothesis embeddings:
$$\text{sim}(a, b) = \frac{a \cdot b}{\|a\| \cdot \|b\|}$$
from sentence_transformers import SentenceTransformer
import numpy as np
class SemanticMetrics:
    """Embedding-based similarity metrics backed by a sentence-embedding model."""

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def cosine_similarity(self, text_a: str, text_b: str) -> float:
        """Return the cosine similarity between the embeddings of two texts."""
        # Encode both texts in one call, then pick the two rows apart.
        pair = self.model.encode([text_a, text_b])
        vec_a = pair[0]
        vec_b = pair[1]
        norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
        return float(np.dot(vec_a, vec_b) / norm_product)

    def bertscore(self, references: list[str], hypotheses: list[str]) -> dict:
        """Return mean BERTScore precision, recall and F1 over the given pairs."""
        # Imported lazily so bert_score is only required when this metric is used.
        from bert_score import score

        per_pair = score(hypotheses, references, lang="en", verbose=False)
        labels = ("precision", "recall", "f1")
        return {label: values.mean().item() for label, values in zip(labels, per_pair)}

    def semantic_similarity_batch(self, references: list[str], hypotheses: list[str]) -> list[float]:
        """Return the cosine similarity of each reference/hypothesis pair, in order."""
        ref_vectors = self.model.encode(references)
        hyp_vectors = self.model.encode(hypotheses)

        scores = []
        for ref_vec, hyp_vec in zip(ref_vectors, hyp_vectors):
            norm_product = np.linalg.norm(ref_vec) * np.linalg.norm(hyp_vec)
            scores.append(np.dot(ref_vec, hyp_vec) / norm_product)
        return np.array(scores).tolist()
3. LLM-as-Judge
Using a stronger LLM to evaluate outputs from a target model, providing nuanced quality assessment.
# Prompt template for the LLM judge. It is filled in with str.format(), which
# substitutes {question} and {response}; the doubled braces ({{ and }}) are
# format escapes, so the JSON skeleton reaches the model with single braces.
JUDGE_PROMPT = """Rate the following response on a scale of 1-5 for each criterion.
Question: {question}
Response: {response}
Rate on these criteria (1-5):
1. **Accuracy**: Is the information factually correct?
2. **Relevance**: Does it address the question asked?
3. **Completeness**: Does it cover all important aspects?
4. **Clarity**: Is it well-organized and easy to understand?
For each criterion, provide:
- Score (1-5)
- Brief justification
Format your response as JSON:
{{
"accuracy": {{"score": N, "reason": "..."}},
"relevance": {{"score": N, "reason": "..."}},
"completeness": {{"score": N, "reason": "..."}},
"clarity": {{"score": N, "reason": "..."}}
}}"""
class LLMJudge:
    """Scores responses by prompting a judge model.

    The judge model only needs a ``generate(prompt)`` method that returns the
    generated text.
    """

    def __init__(self, judge_model):
        self.model = judge_model

    @staticmethod
    def _parse_json(raw: str) -> dict:
        """Parse the judge output as JSON, tolerating text around the object.

        Judge models often wrap the JSON in markdown code fences or add a
        sentence before or after it. If the raw text is not valid JSON, retry
        on the span from the first "{" to the last "}".

        Raises:
            json.JSONDecodeError: If no JSON value can be parsed.
        """
        import json  # local import: the snippet has no file-level json import

        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            start = raw.find("{")
            end = raw.rfind("}")
            if start == -1 or end <= start:
                raise
            return json.loads(raw[start:end + 1])

    def evaluate(self, question: str, response: str) -> dict:
        """Rate a response on the criteria listed in JUDGE_PROMPT.

        Returns:
            The judge's JSON answer as a dict.

        Raises:
            json.JSONDecodeError: If the judge output contains no parseable JSON.
        """
        prompt = JUDGE_PROMPT.format(question=question, response=response)
        result = self.model.generate(prompt)
        return self._parse_json(result)

    def pairwise_compare(self, question: str, response_a: str, response_b: str) -> str:
        """Ask the judge which of two responses is better.

        Returns:
            The judge model's raw output; the prompt asks for "A", "B" or "tie".
        """
        prompt = (
            f"Compare these two responses to the question: {question}\n"
            f"Response A: {response_a}\n"
            f"Response B: {response_b}\n"
            'Which response is better? Answer "A" or "B" or "tie".'
        )
        return self.model.generate(prompt)
Benchmark Datasets
| Benchmark | Domain | Samples | Metrics |
|---|---|---|---|
| MMLU | Knowledge, reasoning | 14K | Accuracy |
| HumanEval | Code generation | 164 | Pass@k |
| GSM8K | Math reasoning | 8.5K | Accuracy |
| TruthfulQA | Truthfulness | 817 | % truthful |
| MT-Bench | Multi-turn chat | 80 | LLM-judge score |
| AlpacaEval | Instruction following | 805 | Win rate vs GPT-4 |
Composite Evaluation Framework
from dataclasses import dataclass
from typing import Optional
@dataclass
class EvalResult:
    """Evaluation record for a single question/response pair."""

    sample_id: str  # identifier for the sample
    question: str  # prompt that was given to the model under test
    response: str  # output of the model under test
    reference: Optional[str]  # gold answer, or None when there is none
    metrics: dict  # automated metric name -> value
    # The two fields below default to None so callers that have no judge or
    # human rating do not have to pass them explicitly.
    judge_scores: Optional[dict] = None  # LLM-judge output, if a judge ran
    human_score: Optional[float] = None  # human rating, if one was collected
class LLMEvaluator:
    """Combines reference, embedding and optional LLM-judge metrics."""

    def __init__(self, judge_model=None):
        self.ref_metrics = ReferenceMetrics()
        self.semantic = SemanticMetrics()
        # The judge is optional; without a judge model no judge scores are produced.
        self.judge = LLMJudge(judge_model) if judge_model else None

    def evaluate_single(self, question: str, response: str,
                        reference: Optional[str] = None) -> "EvalResult":
        """Evaluate one response and return its EvalResult.

        Args:
            question: Prompt the response answers.
            response: Model output to evaluate.
            reference: Gold answer. When missing or empty, the reference-based
                and embedding metrics are skipped and ``metrics`` stays empty.

        Returns:
            An EvalResult whose ``sample_id`` is the first 8 hex characters of
            the MD5 digest of the question, so identical questions share an id.
        """
        import hashlib  # local import: the snippet has no file-level hashlib import

        metrics = {}
        if reference:
            metrics["bleu"] = self.ref_metrics.bleu(reference, response)
            metrics.update(self.ref_metrics.rouge_scores(reference, response))
            metrics["cosine_sim"] = self.semantic.cosine_similarity(reference, response)

        judge_scores = None
        if self.judge:
            judge_scores = self.judge.evaluate(question, response)

        # MD5 is used only to derive a short identifier, not for security;
        # flagging that keeps the call usable on FIPS-restricted builds.
        digest = hashlib.md5(question.encode(), usedforsecurity=False).hexdigest()
        return EvalResult(
            sample_id=digest[:8],
            question=question,
            response=response,
            reference=reference,
            metrics=metrics,
            judge_scores=judge_scores,
            human_score=None,
        )

    def evaluate_batch(self, dataset: list[dict]) -> list["EvalResult"]:
        """Evaluate every sample in order.

        Each sample must have "question" and "response" keys; "reference" is
        optional.
        """
        return [
            self.evaluate_single(
                question=sample["question"],
                response=sample["response"],
                reference=sample.get("reference"),
            )
            for sample in dataset
        ]

    def aggregate(self, results: list["EvalResult"]) -> dict:
        """Summarize each automated metric across results.

        Returns:
            Mapping of metric name to {"mean", "std", "min", "max"} as plain
            Python floats. A metric is summarized over the results that contain
            it; an empty input yields an empty dict.
        """
        values_by_metric: dict = {}
        for result in results:
            for name, value in result.metrics.items():
                values_by_metric.setdefault(name, []).append(value)

        # float() turns NumPy scalars into built-in floats so the summary can
        # be serialized (e.g. with json) without a custom encoder.
        return {
            name: {
                "mean": float(np.mean(values)),
                "std": float(np.std(values)),
                "min": float(np.min(values)),
                "max": float(np.max(values)),
            }
            for name, values in values_by_metric.items()
        }
Evaluation Best Practices
| Practice | Why It Matters |
|---|---|
| Stratified evaluation | Ensure performance across subgroups |
| Adversarial testing | Test edge cases and failure modes |
| Temporal evaluation | Track quality over model updates |
| Multi-metric scoring | Avoid optimizing for a single metric |
| Human validation | Calibrate automated metrics |
Comprehensive evaluation combines automated metrics with human judgment to ensure LLM quality across diverse use cases.