Advanced RAG Techniques

Beyond Basic RAG

Basic RAG retrieves the top-k most similar documents and passes them to the LLM. Advanced techniques improve retrieval quality through query transformation, re-ranking, and multi-stage retrieval pipelines.

Hypothetical Document Embeddings (HyDE)

HyDE generates a hypothetical answer to the query, then uses that answer's embedding for retrieval. This bridges the semantic gap between queries and documents.

HyDE Process

1. Generate a hypothetical document d' from the query q.
2. Embed d' to get the vector v'.
3. Retrieve similar real documents using v'.

class HyDERetriever:
    """Retrieve documents via Hypothetical Document Embeddings (HyDE).

    The query is not embedded directly. An LLM first writes a hypothetical
    answer, and the embedding of that answer is used for the vector search.

    Args:
        llm: Object exposing ``generate(prompt, max_tokens=...)``.
        embedder: Object exposing ``embed(list_of_texts)``; each item of the
            result must provide ``tolist()``.
        vector_store: Object exposing ``search(vector=..., top_k=...)``.
        k: Default number of documents to retrieve.
    """

    def __init__(self, llm, embedder, vector_store, k: int = 10):
        self.llm = llm
        self.embedder = embedder
        self.vector_store = vector_store
        self.k = k

    def retrieve(self, query: str, top_k: "int | None" = None) -> list[dict]:
        """Return the documents closest to a hypothetical answer for *query*.

        Args:
            query: The user question.
            top_k: Number of documents to request. ``None`` (the default)
                falls back to the ``k`` given at construction. The parameter
                exists so this class can be called as
                ``retrieve(query, top_k=...)`` like the other retrievers.

        Returns:
            Whatever ``vector_store.search`` returns, unchanged.
        """
        limit = self.k if top_k is None else top_k

        # Step 1: Generate hypothetical document
        hyde_prompt = f"""Write a detailed paragraph that would answer this question.
        Question: {query}
        Answer:"""

        hypothetical_doc = self.llm.generate(hyde_prompt, max_tokens=300)

        # Step 2: Embed the hypothetical document (a batch of one)
        hyde_embedding = self.embedder.embed([hypothetical_doc])[0]

        # Step 3: Retrieve real documents using HyDE embedding
        return self.vector_store.search(
            vector=hyde_embedding.tolist(),
            top_k=limit,
        )

Multi-Query Retrieval

Generate multiple query variations to increase recall and capture diverse relevant documents.

class MultiQueryRetriever:
    """Retrieve with several LLM-generated rewordings of one query.

    Args:
        llm: Object exposing ``generate(prompt)``; the reply is parsed as one
            query per line.
        embedder: Object exposing ``embed(list_of_texts)``; each item of the
            result must provide ``tolist()``.
        vector_store: Object exposing ``search(vector=..., top_k=...)`` and
            returning dicts that carry an ``"id"`` key.
        num_queries: Maximum number of generated variations to keep.
    """

    def __init__(self, llm, embedder, vector_store, num_queries: int = 5):
        self.llm = llm
        self.embedder = embedder
        self.vector_store = vector_store
        self.num_queries = num_queries

    @staticmethod
    def _clean_line(line: str) -> str:
        """Strip whitespace and one leading list marker ("1.", "2)", "-", "*", "•")."""
        import re

        # The marker must be followed by whitespace so text such as
        # "1.5 billion users" is left untouched.
        return re.sub(r"^\s*(?:\d+[.)]|[-*•])\s+", "", line).strip()

    def generate_queries(self, original_query: str) -> list[str]:
        """Ask the LLM for query variations and parse them line by line.

        List markers are removed and repeated lines are dropped, so only
        distinct query texts are returned.

        Returns:
            At most ``num_queries`` distinct, non-empty query strings in the
            order the LLM produced them.
        """
        prompt = f"""Generate {self.num_queries} different search queries
        that would find information relevant to: {original_query}

        Queries:"""

        response = self.llm.generate(prompt)
        cleaned = (self._clean_line(line) for line in response.split('\n'))
        # dict.fromkeys de-duplicates while keeping first-seen order.
        queries = list(dict.fromkeys(q for q in cleaned if q))
        return queries[:self.num_queries]

    def retrieve(self, query: str, top_k: int = 5) -> list[dict]:
        """Search with the original query plus its variations and merge hits.

        Each distinct query is embedded and searched once. A document's
        merged ``"score"`` is the sum of its scores over all queries that
        returned it, and ``"query_count"`` is how many queries returned it.

        Returns:
            Up to ``top_k`` merged result dicts, highest summed score first.
        """
        # Original query first; a variation identical to it (or to another
        # variation) would otherwise be searched twice and double-counted.
        queries = list(dict.fromkeys([query, *self.generate_queries(query)]))

        # Retrieve for each query
        all_results = {}
        for q in queries:
            embeddings = self.embedder.embed([q])
            results = self.vector_store.search(vector=embeddings[0].tolist(), top_k=top_k)
            for result in results:
                doc_id = result["id"]
                if doc_id not in all_results:
                    # Keep the first-seen payload; reset the accumulators.
                    all_results[doc_id] = {**result, "score": 0, "query_count": 0}
                all_results[doc_id]["score"] += result.get("score", 0)
                all_results[doc_id]["query_count"] += 1

        # Merge and deduplicate
        merged = sorted(all_results.values(), key=lambda x: -x["score"])
        return merged[:top_k]

Re-ranking

Re-ranking applies a more powerful model to re-score retrieved documents for precise ordering.

Cross-Encoder Re-ranking

from sentence_transformers import CrossEncoder

class CrossEncoderReranker:
    """Re-order retrieved documents with a cross-encoder relevance model.

    Args:
        model_name: Name passed to ``CrossEncoder`` to load the model.
    """

    def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
        self.model = CrossEncoder(model_name)

    def rerank(self, query: str, documents: list[dict], top_k: int = 5) -> list[dict]:
        """Score every document against *query* and keep the best ``top_k``.

        Args:
            query: The search query.
            documents: Dicts that each carry the text under ``"content"``.
            top_k: Maximum number of documents to return.

        Returns:
            Copies of the best documents, highest score first, each with an
            added float ``"rerank_score"``. An empty input yields ``[]``.
        """
        # Nothing to score: skip the model call, which is not guaranteed to
        # accept an empty batch.
        if not documents:
            return []

        # Create query-document pairs
        pairs = [(query, doc["content"]) for doc in documents]

        # Score each pair
        scores = self.model.predict(pairs)

        # Sort by cross-encoder score, best first (stable for ties)
        ranked = sorted(
            zip(documents, scores),
            key=lambda pair: pair[1],
            reverse=True,
        )

        return [
            {**doc, "rerank_score": float(score)}
            for doc, score in ranked[:top_k]
        ]

Cohere Re-rank API

import cohere

class CohereReranker:
    """Re-rank documents through the Cohere re-rank endpoint.

    Args:
        api_key: Key passed to ``cohere.Client``.
        model: Re-rank model name sent with every request.
    """

    def __init__(self, api_key: str, model: str = "rerank-english-v3.0"):
        self.client = cohere.Client(api_key)
        self.model = model

    def rerank(self, query: str, documents: list[str], top_k: int = 5) -> list[int]:
        """Return indices into *documents*, in the order the API ranked them.

        Args:
            query: The search query.
            documents: Candidate texts.
            top_k: Sent to the API as ``top_n``.

        Returns:
            The ``index`` of each entry in the response's ``results``. An
            empty *documents* list yields ``[]`` without calling the API.
        """
        # Nothing to rank: skip the network round-trip.
        if not documents:
            return []

        response = self.client.rerank(
            query=query,
            documents=documents,
            top_n=top_k,
            model=self.model,
        )
        return [result.index for result in response.results]

Step-Back Prompting

Retrieve information at a higher level of abstraction before focusing on specifics.

class StepBackRetriever:
    """Retrieve for a question and for a broader "step-back" version of it.

    Args:
        llm: Object exposing ``generate(prompt)``; the reply is used as the
            broader question.
        retriever: Object exposing ``retrieve(query, top_k=...)``.
    """

    def __init__(self, llm, retriever):
        self.llm = llm
        self.retriever = retriever

    def retrieve(self, query: str, top_k: int = 5) -> dict:
        """Run retrieval for *query* and for an LLM-generated broader question.

        Args:
            query: The specific user question.
            top_k: Number of results requested for each of the two searches.

        Returns:
            A dict with ``"specific"`` (results for *query*), ``"broader"``
            (results for the broader question, or ``[]`` when the LLM reply
            was blank) and ``"step_back_query"`` (the broader question with
            surrounding whitespace removed).
        """
        # Step 1: Generate step-back question
        step_back_prompt = f"""Given this specific question, generate a broader,
        more general question that provides useful background context.

        Specific question: {query}
        Broader question:"""

        # Strip the reply so stray whitespace or newlines do not end up in
        # the text that is searched for.
        broader_query = self.llm.generate(step_back_prompt).strip()

        # Step 2: Retrieve for both queries
        specific_results = self.retriever.retrieve(query, top_k=top_k)
        if broader_query:
            broader_results = self.retriever.retrieve(broader_query, top_k=top_k)
        else:
            # A blank reply gives nothing meaningful to search for.
            broader_results = []

        # Step 3: Merge results
        return {
            "specific": specific_results,
            "broader": broader_results,
            "step_back_query": broader_query,
        }

Query Expansion with LLM

class QueryExpander:
    """Use an LLM to widen a query or break it into sub-questions.

    Args:
        llm: Object exposing ``generate(prompt)``; the reply is parsed as one
            item per line.
    """

    def __init__(self, llm):
        self.llm = llm

    @staticmethod
    def _parse_list(response: str) -> list[str]:
        """Split an LLM reply into non-empty lines without list markers."""
        import re

        # Matches one leading "1.", "2)", "-", "*" or "•". The marker must be
        # followed by whitespace so text like "1.5x faster" is left alone.
        marker = re.compile(r"^\s*(?:\d+[.)]|[-*•])\s+")
        items = (marker.sub("", line).strip() for line in response.split('\n'))
        return [item for item in items if item]

    def expand(self, query: str) -> list[str]:
        """Return related queries the LLM proposes for *query*.

        Returns:
            One string per non-empty line of the reply, list markers removed.
        """
        prompt = f"""Expand this search query into 3-5 related queries
        that together would cover all aspects of the topic.

        Original query: {query}

        Expanded queries:"""

        return self._parse_list(self.llm.generate(prompt))

    def decompose(self, query: str) -> list[str]:
        """Return the sub-questions the LLM proposes for *query*.

        Returns:
            One string per non-empty line of the reply, list markers removed.
        """
        prompt = f"""Break this complex question into simpler sub-questions
        that can each be answered independently.

        Complex question: {query}

        Sub-questions:"""

        return self._parse_list(self.llm.generate(prompt))

Hybrid Search

Combining vector search with keyword search leverages the strengths of both approaches.

class HybridRetriever:
    """Fuse vector-search and BM25 results with weighted Reciprocal Rank Fusion.

    Args:
        vector_store: Object exposing ``search(query, top_k=...)``.
        bm25_index: Object exposing ``search(query, top_k=...)``.
        alpha: Weight of the vector ranking; the keyword ranking gets
            ``1 - alpha``.
    """

    def __init__(self, vector_store, bm25_index, alpha: float = 0.5):
        self.vector_store = vector_store
        self.bm25_index = bm25_index
        self.alpha = alpha  # Weight for vector vs keyword

    def retrieve(self, query: str, top_k: int = 10) -> list[dict]:
        """Return the ``top_k`` documents by fused rank score.

        Both back-ends are asked for ``top_k * 2`` candidates. Each document
        receives ``weight / (60 + rank)`` from every list it appears in
        (rank starting at 1), and the contributions are summed.

        Returns:
            Result dicts, highest fused score first. Each keeps the fields of
            the first hit seen for that document (vector hits are read before
            keyword hits), with ``"score"`` replaced by the fused score.
        """
        # NOTE(review): the raw query text is handed to the vector store, so
        # it is presumably expected to embed the text itself — confirm.
        vector_results = self.vector_store.search(query, top_k=top_k * 2)

        # BM25 keyword search
        keyword_results = self.bm25_index.search(query, top_k=top_k * 2)

        # Reciprocal Rank Fusion
        rrf_k = 60  # RRF constant
        fused_scores = {}
        documents = {}  # doc_id -> first-seen result, so payload fields survive fusion

        weighted_lists = (
            (vector_results, self.alpha),
            (keyword_results, 1 - self.alpha),
        )
        for results, weight in weighted_lists:
            for rank, result in enumerate(results, start=1):
                doc_id = result["id"]
                documents.setdefault(doc_id, result)
                fused_scores[doc_id] = fused_scores.get(doc_id, 0) + weight / (rrf_k + rank)

        # Sort by fused score
        ranked = sorted(fused_scores.items(), key=lambda item: -item[1])
        return [
            {**documents[doc_id], "id": doc_id, "score": score}
            for doc_id, score in ranked[:top_k]
        ]

Reciprocal Rank Fusion

RRF combines multiple ranking lists:

\text{RRF}(d) = \sum_{r \in R} \frac{1}{k + \text{rank}_r(d)}

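Here k is a smoothing constant (typically 60), R is the set of ranking lists, and rank_r(d) is the 1-based position of document d in ranking list r. A document that does not appear in a list contributes nothing for that list.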

Context Compression

Reduce retrieved context to only relevant portions, saving tokens and improving LLM focus.

class ContextCompressor:
    """Ask an LLM to keep only the query-relevant sentences of some documents.

    Args:
        llm: Object exposing ``generate(prompt)``.
    """

    def __init__(self, llm):
        self.llm = llm

    def compress(self, query: str, documents: list[str]) -> str:
        """Build the extraction prompt and return the LLM's reply unchanged.

        Args:
            query: The question the context must help answer.
            documents: Document texts; they are joined with newlines.

        Returns:
            Whatever ``llm.generate`` returns for the prompt.
        """
        # One document per line inside the prompt.
        joined_documents = "\n".join(documents)

        prompt = f"""Given the question and documents below, extract ONLY the
        sentences that are relevant to answering the question.
        Remove irrelevant information.

        Question: {query}

        Documents:
        {joined_documents}

        Relevant sentences:"""

        relevant_text = self.llm.generate(prompt)
        return relevant_text

Advanced RAG techniques significantly improve retrieval quality over basic similarity search. Combining multiple strategies — HyDE for query-document alignment, re-ranking for precision, and hybrid search for recall — produces the best production results.