🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services · ℹ️ About · ✉️ Contact · View Pricing Plans from $10

RAG Evaluation

RAGops · Evaluation Metrics · 🟢 Free Lesson

Advertisement

RAG Evaluation Framework

Evaluating RAG systems requires assessing multiple components: retrieval quality, answer faithfulness, and overall usefulness. No single metric captures the full picture.

Evaluation Metrics

Faithfulness

Measures whether the generated answer is grounded in the retrieved context.

Definition: Faithfulness Score

The fraction of claims in the answer that are supported by the context:

$$\text{Faithfulness} = \frac{|\text{Supported Claims}|}{|\text{Total Claims}|}$$

class FaithfulnessEvaluator:
    """Score how much of a generated answer is backed by the retrieved context.

    The wrapped ``llm`` only needs a ``generate(prompt)`` method returning text.
    """

    def __init__(self, llm):
        self.llm = llm

    def evaluate(self, question: str, answer: str, context: str) -> dict:
        """Return the faithfulness ratio together with the raw claim counts.

        The answer is first broken into claims (one per non-blank line of the
        model's reply), then each claim is checked against ``context`` with a
        separate yes/no prompt. ``question`` is accepted but not used.

        Returns:
            A dict with ``faithfulness`` (supported / total, or 0 when no
            claims were extracted), ``total_claims`` and ``supported_claims``.
        """
        # Pass 1: have the model enumerate the answer's claims.
        claims_prompt = f"""Extract all factual claims from this answer.
        Return each claim as a separate line.

        Answer: {answer}

        Claims:"""
        raw_lines = self.llm.generate(claims_prompt).split('\n')
        claims = [text for text in map(str.strip, raw_lines) if text]

        # Pass 2: one verification call per claim; a reply counts as support
        # only when it begins with "yes" (case-insensitive, after trimming).
        verdicts = []
        for claim in claims:
            verify_prompt = f"""Does the following context support this claim?
            Answer "yes" or "no" only.

            Context: {context}
            Claim: {claim}
            Supported:"""
            reply = self.llm.generate(verify_prompt)
            verdicts.append(reply.strip().lower().startswith("yes"))

        total = len(claims)
        supported = sum(verdicts)
        if total:
            faithfulness = supported / total
        else:
            faithfulness = 0

        return {
            "faithfulness": faithfulness,
            "total_claims": total,
            "supported_claims": supported,
        }

Answer Relevance

Measures whether the answer addresses the question.

class AnswerRelevanceEvaluator:
    """Estimate how well an answer addresses the question it was given.

    The answer is turned back into candidate questions by ``llm``; the score
    is the mean cosine similarity between the original question and those
    candidates, using vectors from ``embedder``.

    ``llm`` must expose ``generate(prompt) -> str`` and ``embedder`` must
    expose ``embed(list_of_texts)`` returning one vector per text, in order.
    """

    def __init__(self, llm, embedder):
        self.llm = llm
        self.embedder = embedder

    def evaluate(self, question: str, answer: str) -> float:
        """Return the mean similarity between ``question`` and back-generated questions.

        Returns 0.0 when the model produces no candidate questions.
        """
        # Generate questions that the answer would address
        prompt = f"""Generate 3 questions that this answer would be a good response to.

        Answer: {answer}

        Questions:"""

        generated_questions = self.llm.generate(prompt)
        questions = [q.strip() for q in generated_questions.split('\n') if q.strip()]
        if not questions:
            # Nothing to compare against, so skip the embedding call entirely.
            return 0.0

        # Embed everything in one call; index 0 is the original question.
        embeddings = self.embedder.embed([question] + questions)
        similarities = [
            self._cosine_similarity(embeddings[0], embeddings[i])
            for i in range(1, len(embeddings))
        ]

        return sum(similarities) / len(similarities) if similarities else 0.0

    def _cosine_similarity(self, a, b):
        """Return the cosine similarity of ``a`` and ``b`` as a float.

        A zero-length vector has no direction, so the similarity is defined
        as 0.0 here instead of dividing by zero (which would yield NaN and
        poison the averaged score).
        """
        import numpy as np

        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            return 0.0
        return float(np.dot(a, b) / denominator)

Context Precision

Measures whether retrieved documents are relevant to the question.

$$\text{Context Precision@K} = \frac{\sum_{k=1}^{K} \text{Precision@k} \cdot \text{Relevancy}_k}{\sum_{k=1}^{K} \text{Relevancy}_k}$$

class ContextPrecisionEvaluator:
    """Compute Context Precision@K over a ranked list of retrieved documents."""

    def evaluate(self, question: str, retrieved_docs: list[dict]) -> float:
        """Return the relevance-weighted mean of Precision@k.

        Implements ``sum(Precision@k * rel_k) / sum(rel_k)``: Precision@k is
        accumulated only at ranks holding a relevant document, and the total
        is normalised by the number of relevant documents, not by the number
        of retrieved documents.

        Args:
            question: The query the documents were retrieved for.
            retrieved_docs: Documents in rank order; each must have a
                ``"content"`` key.

        Returns:
            A value in [0, 1]; 0.0 when the list is empty or nothing is
            judged relevant.
        """
        weighted_precision_sum = 0.0
        relevant_count = 0

        for rank, doc in enumerate(retrieved_docs, start=1):
            if not self._is_relevant(question, doc["content"]):
                # Irrelevant ranks have rel_k = 0 and contribute nothing.
                continue
            relevant_count += 1
            # Precision@k = relevant documents seen so far / current rank.
            weighted_precision_sum += relevant_count / rank

        if relevant_count == 0:
            return 0.0
        return weighted_precision_sum / relevant_count

    def _is_relevant(self, question: str, content: str) -> int:
        """Return 1 if ``content`` is relevant to ``question``, else 0.

        Placeholder that marks every document relevant.
        """
        # Simplified; in production use LLM or embedding similarity
        return 1  # Placeholder

Context Recall

Measures whether the retrieved context contains the information needed to answer the question.

class ContextRecallEvaluator:
    """Estimate what share of an answer's key statements the context covers.

    The wrapped ``llm`` only needs a ``generate(prompt)`` method returning text.
    """

    def __init__(self, llm):
        self.llm = llm

    def evaluate(self, question: str, answer: str, context: str) -> float:
        """Return the fraction of statements from ``answer`` found in ``context``.

        Statements are the non-blank lines of the model's extraction reply;
        each is checked with its own yes/no prompt. ``question`` is accepted
        but not used. Returns 0 when no statements are extracted.

        NOTE(review): recall is usually measured against a reference answer;
        presumably callers pass the ground truth as ``answer`` — confirm.
        """
        # Break the answer into individual statements, one per line.
        statements_prompt = f"""Extract key factual statements from this answer.

        Answer: {answer}

        Statements:"""
        reply_lines = self.llm.generate(statements_prompt).split('\n')
        statements = [text for text in map(str.strip, reply_lines) if text]

        if not statements:
            return 0

        # Count the statements whose verification reply begins with "yes".
        hits = 0
        for statement in statements:
            check_prompt = f"""Does this context contain information that supports the statement?
            Answer "yes" or "no".

            Context: {context}
            Statement: {statement}
            Answer:"""
            verdict = self.llm.generate(check_prompt).strip().lower()
            hits += verdict.startswith("yes")

        return hits / len(statements)

RAGAS Framework

RAGAS provides a comprehensive evaluation framework for RAG systems.

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall
)
from datasets import Dataset

# Assemble the evaluation records column by column. The four input sequences
# must already be defined before this point (presumably index-aligned, one
# entry per evaluated sample — verify against the data-loading code).
eval_data = dict(
    question=questions,
    answer=generated_answers,
    contexts=retrieved_contexts,
    ground_truth=reference_answers,
)
dataset = Dataset.from_dict(eval_data)

# Score the dataset on all four imported RAGAS metrics in a single call.
result = evaluate(
    dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
)

print(result)
# Illustrative output:
# {'faithfulness': 0.85, 'answer_relevancy': 0.92,
#  'context_precision': 0.78, 'context_recall': 0.88}

End-to-End Evaluation

| Metric | Measures | Range | Tool |
| --- | --- | --- | --- |
| Faithfulness | Answer grounded in context | 0-1 | RAGAS, DeepEval |
| Answer Relevance | Answer addresses question | 0-1 | RAGAS, custom |
| Context Precision | Retrieved docs are relevant | 0-1 | RAGAS, custom |
| Context Recall | Needed info is retrieved | 0-1 | RAGAS, custom |
| Answer Correctness | Answer matches ground truth | 0-1 | Exact match, LLM |

Evaluation Dataset Construction

class EvalDatasetBuilder:
    """Build a synthetic question/answer evaluation set from source documents.

    ``llm`` must expose ``generate(prompt) -> str``.
    """

    # Response labels recognised by _parse_qa, paired with the result key each fills.
    _FIELD_LABELS = (
        ("Question:", "question"),
        ("Answer:", "answer"),
        ("Context:", "context"),
    )

    def __init__(self, llm, documents: list[str]):
        self.llm = llm
        self.documents = documents

    def generate_qa_pairs(self, num_pairs: int = 100) -> list[dict]:
        """Generate one QA record per document, for at most ``num_pairs`` documents.

        Fewer records are returned when there are fewer documents than
        ``num_pairs``. A negative ``num_pairs`` yields an empty list.
        """
        pairs = []
        # Clamp at zero: a negative bound would otherwise slice from the end
        # of the list and silently process almost every document.
        for doc in self.documents[:max(num_pairs, 0)]:
            prompt = f"""Generate a question and its answer based on this document.
            Include the supporting context.

            Document: {doc}

            Format:
            Question: ...
            Answer: ...
            Context: ..."""

            response = self.llm.generate(prompt)
            # Parse response into structured format
            pairs.append(self._parse_qa(response, doc))
        return pairs

    def _parse_qa(self, response: str, source_doc: str) -> dict:
        """Parse a labelled model response into a record.

        Each line is trimmed before matching, so indented or CRLF-terminated
        ``Question:`` / ``Answer:`` / ``Context:`` lines are still recognised.
        Only the text on the label's own line is captured, and if a label
        repeats the last occurrence wins.

        Returns:
            A dict that always has ``"source"``; ``"question"``, ``"answer"``
            and ``"context"`` are present only when their label was found.
        """
        result = {"source": source_doc}
        for raw_line in response.strip().split('\n'):
            line = raw_line.strip()
            for label, key in self._FIELD_LABELS:
                if line.startswith(label):
                    result[key] = line[len(label):].strip()
                    break
        return result

Comprehensive RAG evaluation requires measuring both retrieval and generation quality. Regular evaluation against curated test sets ensures system reliability as components evolve.

⭐

Premium Content

RAG Evaluation

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯 End-to-end Projects
💼 Interview Prep
📜 Certificates
🤝 Community Access

Already a member? Log in

Need Expert AI Ops & LLM Ops Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement