RAG Evaluation

RAG Evaluation Framework

Evaluating RAG systems requires assessing multiple components: retrieval quality, answer faithfulness, and overall usefulness. No single metric captures the full picture.

Evaluation Metrics

Faithfulness

Measures whether the generated answer is grounded in the retrieved context.

Definition: Faithfulness Score

The fraction of claims in the answer that are supported by the context:

\text{Faithfulness} = \frac{|\text{Supported Claims}|}{|\text{Total Claims}|}

class FaithfulnessEvaluator:
    """Score how much of a generated answer is backed by the retrieved context.

    The evaluator asks the LLM to list the claims made in the answer, then
    asks it once per claim whether the context supports that claim. The
    score is the supported fraction of all extracted claims.
    """

    def __init__(self, llm):
        # The only method this class calls on ``llm`` is ``generate(prompt)``.
        self.llm = llm

    def evaluate(self, question: str, answer: str, context: str) -> dict:
        """Compute the faithfulness of ``answer`` with respect to ``context``.

        Args:
            question: Accepted for interface symmetry; not used in the
                computation.
            answer: The generated answer whose claims are checked.
            context: The retrieved context the claims are checked against.

        Returns:
            A dict with ``"faithfulness"`` (supported / total, or 0 when no
            claims were extracted), ``"total_claims"`` and
            ``"supported_claims"``.
        """
        extraction_prompt = f"""Extract all factual claims from this answer.
        Return each claim as a separate line.

        Answer: {answer}

        Claims:"""

        # One claim per non-blank line of the LLM output.
        raw_lines = self.llm.generate(extraction_prompt).split('\n')
        claim_list = [text for text in map(str.strip, raw_lines) if text]

        # One verification call per claim; a reply counts as support only
        # when it begins with "yes" (case-insensitive, whitespace ignored).
        verdicts = []
        for item in claim_list:
            judge_prompt = f"""Does the following context support this claim?
            Answer "yes" or "no" only.

            Context: {context}
            Claim: {item}
            Supported:"""

            reply = self.llm.generate(judge_prompt)
            verdicts.append(reply.strip().lower().startswith("yes"))

        n_total = len(claim_list)
        n_supported = sum(verdicts)
        if n_total:
            score = n_supported / n_total
        else:
            score = 0

        return {
            "faithfulness": score,
            "total_claims": n_total,
            "supported_claims": n_supported,
        }

Answer Relevance

Measures whether the answer addresses the question.

class AnswerRelevanceEvaluator:
    """Score how well an answer addresses the question it was given.

    The LLM is asked to invent questions the answer would be a good response
    to; the score is the mean cosine similarity between the embedding of the
    original question and the embeddings of the generated questions.
    """

    def __init__(self, llm, embedder):
        # ``llm`` must provide ``generate(prompt)``; ``embedder`` must provide
        # ``embed(list_of_texts)`` returning one vector per input text.
        self.llm = llm
        self.embedder = embedder

    def evaluate(self, question: str, answer: str) -> float:
        """Return the mean similarity between ``question`` and questions
        reverse-generated from ``answer``.

        Args:
            question: The original user question.
            answer: The generated answer being evaluated.

        Returns:
            The average cosine similarity, or ``0.0`` when the LLM produced
            no non-blank question lines.
        """
        # Generate questions that the answer would address
        prompt = f"""Generate 3 questions that this answer would be a good response to.

        Answer: {answer}

        Questions:"""

        generated_questions = self.llm.generate(prompt)
        questions = [q.strip() for q in generated_questions.split('\n') if q.strip()]
        if not questions:
            # Nothing to compare against; skip the embedding call entirely.
            return 0.0

        # Embed the original question (index 0) together with the generated
        # ones so all vectors come from a single embedder call.
        embeddings = self.embedder.embed([question] + questions)

        similarities = [
            self._cosine_similarity(embeddings[0], generated)
            for generated in embeddings[1:]
        ]
        return sum(similarities) / len(similarities) if similarities else 0.0

    def _cosine_similarity(self, a, b):
        """Return the cosine similarity of vectors ``a`` and ``b``.

        A zero-norm vector has no direction, so the similarity is defined as
        ``0.0`` here instead of dividing by zero and returning NaN, which
        would otherwise turn the averaged score into NaN.
        """
        import numpy as np

        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            return 0.0
        return float(np.dot(a, b) / denominator)

Context Precision

Measures whether retrieved documents are relevant to the question.

\text{Context Precision@K} = \frac{\sum_{k=1}^{K} \text{Precision@k} \cdot \text{Relevancy}_k}{\sum_{k=1}^{K} \text{Relevancy}_k}

class ContextPrecisionEvaluator:
    """Score whether relevant documents are retrieved and ranked early.

    Implements Context Precision@K: the sum of Precision@k over the ranks k
    that hold a relevant document, divided by the number of relevant
    documents among the K retrieved.
    """

    def evaluate(self, question: str, retrieved_docs: list[dict]) -> float:
        """Compute Context Precision@K over ``retrieved_docs`` in rank order.

        Args:
            question: The question the documents were retrieved for.
            retrieved_docs: Ranked documents; each dict must have a
                ``"content"`` key.

        Returns:
            The precision score, or ``0.0`` when no documents are given or
            none of them is judged relevant.
        """
        relevant_count = 0
        weighted_precision_sum = 0.0

        for rank, doc in enumerate(retrieved_docs, start=1):
            if self._is_relevant(question, doc["content"]):
                relevant_count += 1
                # Precision@k contributes only at ranks holding a relevant
                # document (Relevancy_k = 1).
                weighted_precision_sum += relevant_count / rank

        # Normalise by the number of relevant documents, not by the number of
        # retrieved documents; dividing by the latter under-reports the score
        # whenever any irrelevant document is present.
        if relevant_count == 0:
            return 0.0
        return weighted_precision_sum / relevant_count

    def _is_relevant(self, question: str, content: str) -> int:
        """Return 1 if ``content`` is relevant to ``question``, else 0.

        Placeholder that treats every document as relevant.
        """
        # Simplified; in production use LLM or embedding similarity
        return 1  # Placeholder

Context Recall

Measures whether the retrieved context contains the information needed to answer the question.

class ContextRecallEvaluator:
    """Estimate how much of an answer's content is present in the context.

    Key statements are extracted from the answer by the LLM, and each one is
    checked against the context with a yes/no prompt.
    """

    def __init__(self, llm):
        # The only method this class calls on ``llm`` is ``generate(prompt)``.
        self.llm = llm

    def evaluate(self, question: str, answer: str, context: str) -> float:
        """Return the fraction of answer statements supported by ``context``.

        Args:
            question: Accepted for interface symmetry; not used in the
                computation.
            answer: Text from which key statements are extracted.
            context: The retrieved context each statement is checked against.

        Returns:
            supported / total statements, or 0 when no statements were
            extracted.
        """
        extraction_prompt = f"""Extract key factual statements from this answer.

        Answer: {answer}

        Statements:"""

        llm_output = self.llm.generate(extraction_prompt)

        # Keep one statement per non-blank output line.
        facts = []
        for raw_line in llm_output.split('\n'):
            cleaned = raw_line.strip()
            if cleaned:
                facts.append(cleaned)

        if not facts:
            return 0

        hits = 0
        for fact in facts:
            support_prompt = f"""Does this context contain information that supports the statement?
            Answer "yes" or "no".

            Context: {context}
            Statement: {fact}
            Answer:"""

            verdict = self.llm.generate(support_prompt).strip().lower()
            # A bool adds 0 or 1: only replies beginning with "yes" count.
            hits += verdict.startswith("yes")

        return hits / len(facts)

RAGAS Framework

RAGAS provides a comprehensive evaluation framework for RAG systems.

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall
)
from datasets import Dataset

# Assemble the evaluation columns. `questions`, `generated_answers`,
# `retrieved_contexts` and `reference_answers` must already be defined by
# the surrounding code (presumably parallel lists, one entry per sample —
# confirm against the caller).
eval_data = dict(
    question=questions,
    answer=generated_answers,
    contexts=retrieved_contexts,
    ground_truth=reference_answers,
)

dataset = Dataset.from_dict(eval_data)

# Score the dataset with the four imported RAGAS metrics in a single call.
result = evaluate(
    dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
)

print(result)
# Example output:
# {'faithfulness': 0.85, 'answer_relevancy': 0.92,
#  'context_precision': 0.78, 'context_recall': 0.88}

End-to-End Evaluation

Metric	Measures	Range	Tool
Faithfulness	Answer grounded in context	0-1	RAGAS, DeepEval
Answer Relevance	Answer addresses question	0-1	RAGAS, custom
Context Precision	Retrieved docs are relevant	0-1	RAGAS, custom
Context Recall	Needed info is retrieved	0-1	RAGAS, custom
Answer Correctness	Answer matches ground truth	0-1	Exact match, LLM

Evaluation Dataset Construction

class EvalDatasetBuilder:
    """Build a question/answer evaluation set from source documents via an LLM."""

    def __init__(self, llm, documents: list[str]):
        # ``llm`` must provide ``generate(prompt)`` returning text.
        self.llm = llm
        self.documents = documents

    def generate_qa_pairs(self, num_pairs: int = 100) -> list[dict]:
        """Generate one QA record per document for the first ``num_pairs`` documents.

        Args:
            num_pairs: Maximum number of documents to use. If fewer documents
                are available, fewer records are returned.

        Returns:
            A list of dicts as produced by ``_parse_qa``.
        """
        pairs = []
        for doc in self.documents[:num_pairs]:
            prompt = f"""Generate a question and its answer based on this document.
            Include the supporting context.

            Document: {doc}

            Format:
            Question: ...
            Answer: ...
            Context: ..."""

            response = self.llm.generate(prompt)
            # Parse response into structured format
            pairs.append(self._parse_qa(response, doc))
        return pairs

    def _parse_qa(self, response: str, source_doc: str) -> dict:
        """Parse a ``Question:/Answer:/Context:`` formatted LLM response.

        Labels are recognised at the start of a line after surrounding
        whitespace is removed, so indented labels are accepted. Lines that
        follow a label and carry no label themselves are treated as a
        continuation of that field, so multi-line values are kept whole
        rather than cut off after their first line. Text before the first
        label is ignored. If a label appears more than once, the last
        occurrence wins.

        Args:
            response: Raw LLM output.
            source_doc: The document the response was generated from.

        Returns:
            A dict with ``"source"`` plus whichever of ``"question"``,
            ``"answer"`` and ``"context"`` were found in the response.
        """
        labels = {
            "Question:": "question",
            "Answer:": "answer",
            "Context:": "context",
        }

        collected: dict[str, list[str]] = {}
        current_key = None
        for raw_line in response.strip().split('\n'):
            line = raw_line.strip()
            for label, key in labels.items():
                if line.startswith(label):
                    current_key = key
                    collected[key] = [line[len(label):].strip()]
                    break
            else:
                # No label on this line: it continues the active field, if any.
                if current_key is not None:
                    collected[current_key].append(line)

        result = {"source": source_doc}
        for key, parts in collected.items():
            result[key] = "\n".join(parts).strip()
        return result

Comprehensive RAG evaluation requires measuring both retrieval and generation quality. Regular evaluation against curated test sets ensures system reliability as components evolve.