RAG Evaluation Framework
Evaluating RAG systems requires assessing multiple components: retrieval quality, answer faithfulness, and overall usefulness. No single metric captures the full picture.
Evaluation Metrics
Faithfulness
Measures whether the generated answer is grounded in the retrieved context.
Definition: Faithfulness Score
The fraction of claims in the answer that are supported by the context:
\text{Faithfulness} = \frac{|\text{Supported Claims}|}{|\text{Total Claims}|}
class FaithfulnessEvaluator:
    """Score how much of a generated answer is backed by its context.

    The score is the fraction of claims extracted from the answer for which
    the LLM replies "yes" when asked whether the context supports the claim.
    """

    def __init__(self, llm):
        # ``llm`` only needs a ``generate(prompt)`` method whose result is
        # handled as a string.
        self.llm = llm

    def evaluate(self, question: str, answer: str, context: str) -> dict:
        """Return the faithfulness score together with the claim counts.

        Args:
            question: The original question (not used by this metric).
            answer: The generated answer that is broken into claims.
            context: The retrieved context each claim is checked against.

        Returns:
            A dict with ``faithfulness`` (supported / total, or 0 when no
            claims were extracted), ``total_claims`` and ``supported_claims``.
        """
        # Step 1: have the LLM list the answer's claims, one per line.
        extraction_prompt = (
            "Extract all factual claims from this answer.\n"
            "Return each claim as a separate line.\n"
            f"Answer: {answer}\n"
            "Claims:"
        )
        raw_lines = self.llm.generate(extraction_prompt).split('\n')
        claims = [text for text in map(str.strip, raw_lines) if text]

        # Step 2: one yes/no verification call per claim, in extraction order.
        supported_count = sum(
            1 for claim in claims if self._claim_is_supported(claim, context)
        )

        total = len(claims)
        score = supported_count / total if total else 0
        return {
            "faithfulness": score,
            "total_claims": total,
            "supported_claims": supported_count,
        }

    def _claim_is_supported(self, claim: str, context: str) -> bool:
        """Ask the LLM whether ``context`` supports ``claim``."""
        verification_prompt = (
            "Does the following context support this claim?\n"
            'Answer "yes" or "no" only.\n'
            f"Context: {context}\n"
            f"Claim: {claim}\n"
            "Supported:"
        )
        verdict = self.llm.generate(verification_prompt)
        # Any reply that begins with "yes" (case-insensitive) counts.
        return verdict.strip().lower().startswith("yes")
Answer Relevance
Measures whether the answer addresses the question.
class AnswerRelevanceEvaluator:
    """Score how well an answer addresses the question it was given.

    The LLM is asked to write questions the answer would be a good response
    to; the score is the mean cosine similarity between the embedding of the
    original question and the embeddings of those generated questions.
    """

    def __init__(self, llm, embedder):
        # ``llm`` needs ``generate(prompt)``; ``embedder`` needs
        # ``embed(list_of_texts)``.
        self.llm = llm
        self.embedder = embedder

    def evaluate(self, question: str, answer: str) -> float:
        """Return the mean similarity of generated questions to ``question``.

        Args:
            question: The original user question.
            answer: The generated answer being judged.

        Returns:
            The average cosine similarity, or 0.0 when the LLM produced no
            usable question lines.
        """
        prompt = (
            "Generate 3 questions that this answer would be a good response to.\n"
            f"Answer: {answer}\n"
            "Questions:"
        )
        generated = self.llm.generate(prompt)
        candidates = [q.strip() for q in generated.split('\n') if q.strip()]

        # The original question goes first so that its embedding is index 0.
        # Assumes ``embed`` returns one vector per input, in input order.
        embeddings = self.embedder.embed([question] + candidates)
        reference = embeddings[0]
        similarities = [
            self._cosine_similarity(reference, candidate)
            for candidate in embeddings[1:]
        ]
        if not similarities:
            return 0.0
        return sum(similarities) / len(similarities)

    def _cosine_similarity(self, a, b):
        """Return the cosine similarity of ``a`` and ``b`` as a float.

        Returns 0.0 when either vector has zero length; dividing by the zero
        norm product would otherwise give NaN and poison the average.
        """
        import numpy as np

        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            return 0.0
        return float(np.dot(a, b) / denominator)
Context Precision
Measures whether the retrieved documents are relevant to the question and whether the relevant ones are ranked ahead of the irrelevant ones.
\text{Context Precision@K} = \frac{\sum_{k=1}^{K} \text{Precision@k} \cdot \text{Relevancy}_k}{\sum_{k=1}^{K} \text{Relevancy}_k}
class ContextPrecisionEvaluator:
    """Rank-aware precision of a retrieved document list.

    Computes the sum of Precision@k over the ranks k that hold a relevant
    document, divided by the number of relevant documents.
    """

    def evaluate(self, question: str, retrieved_docs: list[dict]) -> float:
        """Return Context Precision@K for ``retrieved_docs``.

        Args:
            question: The question the documents were retrieved for.
            retrieved_docs: Documents in rank order; each dict must have a
                ``"content"`` key.

        Returns:
            A value in [0, 1]; 0.0 when the list is empty or no document is
            judged relevant.
        """
        weighted_precision_sum = 0.0
        relevant_count = 0
        for rank, doc in enumerate(retrieved_docs, start=1):
            # Relevance is treated as binary: any truthy judgement counts.
            if self._is_relevant(question, doc["content"]):
                relevant_count += 1
                # Precision@k only contributes at ranks holding a relevant doc.
                weighted_precision_sum += relevant_count / rank

        if relevant_count == 0:
            return 0.0
        # Normalise by the number of relevant documents, not by K; dividing
        # by K would penalise every irrelevant document a second time.
        return weighted_precision_sum / relevant_count

    def _is_relevant(self, question: str, content: str) -> int:
        """Return 1 if ``content`` is relevant to ``question``, else 0."""
        # Simplified; in production use LLM or embedding similarity
        return 1  # Placeholder
Context Recall
Measures whether the retrieved context contains the information needed to answer the question.
class ContextRecallEvaluator:
    """Score how many of an answer's key statements the context supports."""

    def __init__(self, llm):
        # ``llm`` only needs a ``generate(prompt)`` method whose result is
        # handled as a string.
        self.llm = llm

    def evaluate(self, question: str, answer: str, context: str) -> float:
        """Return the fraction of answer statements supported by ``context``.

        Args:
            question: The original question (not used by this metric).
            answer: The answer whose key statements are extracted.
            context: The retrieved context the statements are checked against.

        Returns:
            supported / total statements, or 0 when no statements were
            extracted.
        """
        extraction_prompt = (
            "Extract key factual statements from this answer.\n"
            f"Answer: {answer}\n"
            "Statements:"
        )
        extracted = self.llm.generate(extraction_prompt)
        statements = []
        for raw_line in extracted.split('\n'):
            cleaned = raw_line.strip()
            if cleaned:
                statements.append(cleaned)

        if not statements:
            return 0

        # One yes/no call per statement; a reply starting with "yes"
        # (case-insensitive) marks the statement as supported.
        verdicts = [
            self.llm.generate(
                "Does this context contain information that supports the statement?\n"
                'Answer "yes" or "no".\n'
                f"Context: {context}\n"
                f"Statement: {statement}\n"
                "Answer:"
            ).strip().lower().startswith("yes")
            for statement in statements
        ]
        return sum(verdicts) / len(statements)
RAGAS Framework
RAGAS provides a comprehensive evaluation framework for RAG systems.
from ragas import evaluate
from ragas.metrics import (
faithfulness,
answer_relevancy,
context_precision,
context_recall
)
from datasets import Dataset
# Assemble the evaluation columns. The four input lists (questions,
# generated_answers, retrieved_contexts, reference_answers) are not defined
# in this snippet and must be supplied by the surrounding code.
eval_data = dict(
    question=questions,
    answer=generated_answers,
    contexts=retrieved_contexts,
    ground_truth=reference_answers,
)
dataset = Dataset.from_dict(eval_data)

# Score the dataset with the four metrics imported from ragas.metrics.
result = evaluate(
    dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
)

print(result)
# Example output:
# {'faithfulness': 0.85, 'answer_relevancy': 0.92,
# 'context_precision': 0.78, 'context_recall': 0.88}
End-to-End Evaluation
| Metric | Measures | Range | Tool |
|---|---|---|---|
| Faithfulness | Answer grounded in context | 0-1 | RAGAS, DeepEval |
| Answer Relevance | Answer addresses question | 0-1 | RAGAS, custom |
| Context Precision | Retrieved docs are relevant | 0-1 | RAGAS, custom |
| Context Recall | Needed info is retrieved | 0-1 | RAGAS, custom |
| Answer Correctness | Answer matches ground truth | 0-1 | Exact match, LLM |
Evaluation Dataset Construction
class EvalDatasetBuilder:
    """Build a question/answer evaluation set from source documents via an LLM."""

    # Line prefixes looked for in the LLM response, and the result key each
    # one fills.
    _FIELD_LABELS = (
        ("Question:", "question"),
        ("Answer:", "answer"),
        ("Context:", "context"),
    )

    def __init__(self, llm, documents: list[str]):
        # ``llm`` only needs a ``generate(prompt)`` method returning a string.
        self.llm = llm
        self.documents = documents

    def generate_qa_pairs(self, num_pairs: int = 100) -> list[dict]:
        """Generate one QA pair for each of the first ``num_pairs`` documents.

        At most one pair is produced per document, so fewer than
        ``num_pairs`` entries are returned when there are fewer documents.

        Args:
            num_pairs: Maximum number of documents to generate pairs from.

        Returns:
            One parsed dict per processed document (see ``_parse_qa``).
        """
        pairs = []
        for doc in self.documents[:num_pairs]:
            prompt = (
                "Generate a question and its answer based on this document.\n"
                "Include the supporting context.\n"
                f"Document: {doc}\n"
                "Format:\n"
                "Question: ...\n"
                "Answer: ...\n"
                "Context: ..."
            )
            response = self.llm.generate(prompt)
            pairs.append(self._parse_qa(response, doc))
        return pairs

    def _parse_qa(self, response: str, source_doc: str) -> dict:
        """Parse a ``Question:/Answer:/Context:`` response into a dict.

        Args:
            response: Raw LLM output, expected to hold one labelled field per
                line.
            source_doc: The document the response was generated from.

        Returns:
            A dict that always has ``"source"`` and has ``"question"``,
            ``"answer"`` and ``"context"`` only for the labels found. If a
            label occurs more than once, the last occurrence wins.
        """
        result = {"source": source_doc}
        for raw_line in response.strip().split('\n'):
            # Strip each line so that indented or CRLF-terminated fields are
            # still recognised; stripping only the whole response would miss
            # every indented line after the first.
            line = raw_line.strip()
            for label, key in self._FIELD_LABELS:
                if line.startswith(label):
                    result[key] = line[len(label):].strip()
                    break
        return result
Comprehensive RAG evaluation requires measuring both retrieval and generation quality. Regular evaluation against curated test sets ensures system reliability as components evolve.