Beyond Basic RAG
Basic RAG retrieves the top-k most similar documents and passes them to the LLM. Advanced techniques improve retrieval quality through query transformation, re-ranking, and multi-stage retrieval pipelines.
Hypothetical Document Embeddings (HyDE)
HyDE generates a hypothetical answer to the query, then uses that answer's embedding for retrieval. This bridges the semantic gap between queries and documents.
Definition: HyDE Process
- Generate hypothetical document d' from query q
- Embed d' to get vector v'
- Retrieve similar real documents using v'
class HyDERetriever:
    """Retrieve documents via Hypothetical Document Embeddings (HyDE).

    Rather than embedding the query itself, an LLM first writes a hypothetical
    answer to the query; the embedding of that answer is then used to search
    the vector store.

    Args:
        llm: Object exposing ``generate(prompt, max_tokens=...)`` returning text.
        embedder: Object exposing ``embed(texts)``; each returned item must
            provide ``tolist()``.
        vector_store: Object exposing ``search(vector=..., top_k=...)``.
        k: Default number of documents to request from the vector store.
    """

    def __init__(self, llm, embedder, vector_store, k: int = 10):
        self.llm = llm
        self.embedder = embedder
        self.vector_store = vector_store
        self.k = k

    def retrieve(self, query: str, top_k: "int | None" = None) -> list[dict]:
        """Return the documents most similar to a hypothetical answer.

        Args:
            query: The user's question.
            top_k: Optional per-call override for the number of documents to
                request. Defaults to the ``k`` given at construction, so
                existing ``retrieve(query)`` callers are unaffected while
                callers that pass ``top_k=...`` also work.

        Returns:
            Whatever ``vector_store.search`` returns for the HyDE embedding.
        """
        # Step 1: generate the hypothetical document.
        hyde_prompt = f"""Write a detailed paragraph that would answer this question.
Question: {query}
Answer:"""
        hypothetical_doc = self.llm.generate(hyde_prompt, max_tokens=300)

        # A blank generation carries no signal to embed; fall back to the
        # query itself so retrieval degrades to plain similarity search.
        if not hypothetical_doc or not hypothetical_doc.strip():
            hypothetical_doc = query

        # Step 2: embed the hypothetical document (single-item batch).
        hyde_embedding = self.embedder.embed([hypothetical_doc])[0]

        # Step 3: search for real documents near the hypothetical one.
        limit = self.k if top_k is None else top_k
        return self.vector_store.search(
            vector=hyde_embedding.tolist(),
            top_k=limit,
        )
Multi-Query Retrieval
Generate multiple query variations to increase recall and capture diverse relevant documents.
class MultiQueryRetriever:
    """Retrieve with several LLM-generated rewordings of the query.

    Each variation (plus the original query) is searched separately; hits are
    merged by document id, summing their scores and counting how many queries
    returned each document.

    Args:
        llm: Object exposing ``generate(prompt)`` returning text.
        embedder: Object exposing ``embed(texts)``; each returned item must
            provide ``tolist()``.
        vector_store: Object exposing ``search(vector=..., top_k=...)`` that
            returns dicts containing at least an ``"id"`` key.
        num_queries: Maximum number of generated variations to use.
    """

    def __init__(self, llm, embedder, vector_store, num_queries: int = 5):
        self.llm = llm
        self.embedder = embedder
        self.vector_store = vector_store
        self.num_queries = num_queries

    def generate_queries(self, original_query: str) -> list[str]:
        """Ask the LLM for query variations and parse one query per line.

        Blank lines are dropped and leading list markers such as ``1.``,
        ``2)``, ``-``, ``*`` are removed so they do not end up in the search
        text. At most ``num_queries`` queries are returned.
        """
        import re  # local import keeps this class self-contained

        prompt = f"""Generate {self.num_queries} different search queries
that would find information relevant to: {original_query}
Queries:"""
        response = self.llm.generate(prompt)

        # A marker only counts when followed by whitespace or end of line, so
        # queries that merely start with a number ("2024 trends") are kept.
        marker = re.compile(r"^\s*(?:\d+[.)]|[-*\u2022])(?:\s+|$)")
        queries = []
        for line in response.split("\n"):
            cleaned = marker.sub("", line).strip()
            if cleaned:
                queries.append(cleaned)
        return queries[: self.num_queries]

    def retrieve(self, query: str, top_k: int = 5) -> list[dict]:
        """Search with the original query and its variations, then merge.

        Args:
            query: The user's question; always searched first.
            top_k: Number of hits requested per query and returned overall.

        Returns:
            Up to ``top_k`` merged result dicts sorted by summed score
            (highest first). Each carries the fields of the first hit seen for
            that id, with ``"score"`` replaced by the summed score and a
            ``"query_count"`` of how many queries returned it.
        """
        # Original query first; dict.fromkeys drops exact duplicates while
        # keeping order, so a variation identical to the original (or to
        # another variation) is not searched and counted twice.
        queries = list(dict.fromkeys([query, *self.generate_queries(query)]))

        # One batched embedding call instead of one call per query.
        embeddings = self.embedder.embed(queries)

        all_results = {}
        for embedding in embeddings:
            hits = self.vector_store.search(vector=embedding.tolist(), top_k=top_k)
            for hit in hits:
                doc_id = hit["id"]
                if doc_id not in all_results:
                    all_results[doc_id] = {**hit, "score": 0, "query_count": 0}
                entry = all_results[doc_id]
                entry["score"] += hit.get("score", 0)
                entry["query_count"] += 1

        # Stable sort: ties keep first-seen order.
        merged = sorted(
            all_results.values(),
            key=lambda entry: entry["score"],
            reverse=True,
        )
        return merged[:top_k]
Re-ranking
Re-ranking applies a more powerful model to re-score retrieved documents for precise ordering.
Cross-Encoder Re-ranking
from sentence_transformers import CrossEncoder
class CrossEncoderReranker:
    """Re-score retrieved documents against the query with a cross-encoder.

    Args:
        model_name: Name passed to ``CrossEncoder`` to load the model.
    """

    def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"):
        self.model = CrossEncoder(model_name)

    def rerank(self, query: str, documents: list[dict], top_k: int = 5) -> list[dict]:
        """Return the ``top_k`` documents ordered by cross-encoder score.

        Args:
            query: The search query.
            documents: Candidate documents; each must have a ``"content"`` key.
            top_k: Maximum number of documents to return.

        Returns:
            Copies of the best-scoring documents (highest first), each with an
            added ``"rerank_score"`` float. Input dicts are not modified. An
            empty ``documents`` list yields ``[]`` without invoking the model.

        Raises:
            KeyError: If a document lacks the ``"content"`` key.
        """
        # Nothing to score: skip the model call entirely.
        if not documents:
            return []

        # One (query, text) pair per candidate document.
        pairs = [(query, doc["content"]) for doc in documents]
        scores = self.model.predict(pairs)

        # Stable descending sort: equal scores keep their input order.
        ranked = sorted(
            zip(documents, scores),
            key=lambda pair: pair[1],
            reverse=True,
        )
        return [
            {**doc, "rerank_score": float(score)}
            for doc, score in ranked[:top_k]
        ]
Cohere Re-rank API
import cohere
class CohereReranker:
    """Re-rank documents through the Cohere rerank endpoint.

    Args:
        api_key: Key used to construct the ``cohere.Client``.
        model: Rerank model name sent with every request. Defaults to the
            previously hard-coded ``"rerank-english-v3.0"``.
    """

    def __init__(self, api_key: str, model: str = "rerank-english-v3.0"):
        self.client = cohere.Client(api_key)
        self.model = model

    def rerank(self, query: str, documents: list[str], top_k: int = 5) -> list[int]:
        """Return indices into ``documents`` in the order the API ranked them.

        Args:
            query: The search query.
            documents: Candidate document texts.
            top_k: Passed to the API as ``top_n``.

        Returns:
            The ``index`` of each entry in the response's ``results``, in
            response order. An empty ``documents`` list yields ``[]`` without
            making a request.
        """
        # No candidates: avoid a pointless network call.
        if not documents:
            return []

        response = self.client.rerank(
            query=query,
            documents=documents,
            top_n=top_k,
            model=self.model,
        )
        return [result.index for result in response.results]
Step-Back Prompting
Retrieve information at a higher level of abstraction before focusing on specifics.
class StepBackRetriever:
    """Retrieve for both the question and a broader "step-back" version of it.

    Args:
        llm: Object exposing ``generate(prompt)`` returning text.
        retriever: Object exposing ``retrieve(query, top_k=...)``.
    """

    def __init__(self, llm, retriever):
        self.llm = llm
        self.retriever = retriever

    def retrieve(self, query: str, top_k: int = 5) -> dict:
        """Retrieve documents for the query and for a generated broader query.

        Args:
            query: The specific question to answer.
            top_k: Number of documents requested from the wrapped retriever
                for each of the two queries. Defaults to 5, the previously
                hard-coded value.

        Returns:
            A dict with ``"specific"`` (results for ``query``), ``"broader"``
            (results for the step-back question) and ``"step_back_query"``
            (the generated broader question, whitespace-trimmed).
        """
        # Step 1: ask the LLM for a more general version of the question.
        step_back_prompt = f"""Given this specific question, generate a broader,
more general question that provides useful background context.
Specific question: {query}
Broader question:"""
        # Trim so stray leading/trailing whitespace or newlines in the
        # completion do not become part of the search query.
        broader_query = self.llm.generate(step_back_prompt).strip()

        # Step 2: retrieve for both the specific and the broader question.
        specific_results = self.retriever.retrieve(query, top_k=top_k)
        broader_results = self.retriever.retrieve(broader_query, top_k=top_k)

        # Step 3: return both result sets side by side.
        return {
            "specific": specific_results,
            "broader": broader_results,
            "step_back_query": broader_query,
        }
Query Expansion with LLM
class QueryExpander:
    """Use an LLM to expand a query or break it into sub-questions.

    Args:
        llm: Object exposing ``generate(prompt)`` returning text.
    """

    def __init__(self, llm):
        self.llm = llm

    def expand(self, query: str) -> list[str]:
        """Return related queries generated for ``query``, one per output line."""
        prompt = f"""Expand this search query into 3-5 related queries
that together would cover all aspects of the topic.
Original query: {query}
Expanded queries:"""
        return self._parse_lines(self.llm.generate(prompt))

    def decompose(self, query: str) -> list[str]:
        """Return simpler sub-questions generated for ``query``, one per output line."""
        prompt = f"""Break this complex question into simpler sub-questions
that can each be answered independently.
Complex question: {query}
Sub-questions:"""
        return self._parse_lines(self.llm.generate(prompt))

    @staticmethod
    def _parse_lines(response: str) -> list[str]:
        """Split LLM output into clean, non-empty lines without list markers."""
        import re  # local import keeps this class self-contained

        # Strip leading "1.", "2)", "-", "*" or a bullet, but only when it is
        # followed by whitespace or end of line, so text that merely starts
        # with a number ("2024 trends") is left intact.
        marker = re.compile(r"^\s*(?:\d+[.)]|[-*\u2022])(?:\s+|$)")
        items = []
        for line in response.split("\n"):
            cleaned = marker.sub("", line).strip()
            if cleaned:
                items.append(cleaned)
        return items
Hybrid Search
Combining vector search with keyword search leverages the strengths of both approaches.
class HybridRetriever:
    """Fuse vector-search and BM25 keyword-search rankings with weighted RRF.

    Args:
        vector_store: Object exposing ``search(query, top_k=...)``; it is
            called with the raw query text.
        bm25_index: Object exposing ``search(query, top_k=...)``.
        alpha: Weight given to the vector ranking; the keyword ranking gets
            ``1 - alpha``.
        rrf_k: Reciprocal Rank Fusion constant added to each rank. Defaults
            to 60, the previously hard-coded value.
    """

    def __init__(self, vector_store, bm25_index, alpha: float = 0.5, rrf_k: int = 60):
        self.vector_store = vector_store
        self.bm25_index = bm25_index
        self.alpha = alpha  # Weight for vector vs keyword
        self.rrf_k = rrf_k

    def retrieve(self, query: str, top_k: int = 10) -> list[dict]:
        """Return the ``top_k`` documents by weighted reciprocal-rank score.

        Each backend is asked for ``top_k * 2`` candidates. A document at
        1-based rank ``r`` in a list contributes ``weight / (rrf_k + r)``.

        Args:
            query: The search query, passed unchanged to both backends.
            top_k: Maximum number of fused results to return.

        Returns:
            Result dicts sorted by fused score, highest first. Each keeps the
            fields of the first backend hit seen for that id (vector results
            are visited first), with ``"score"`` set to the fused score. The
            fields are kept so that downstream steps needing e.g. document
            text can consume the output directly.
        """
        # Over-fetch from both backends so fusion has enough candidates.
        vector_results = self.vector_store.search(query, top_k=top_k * 2)
        keyword_results = self.bm25_index.search(query, top_k=top_k * 2)

        fused_scores = {}
        payloads = {}
        weighted_lists = (
            (self.alpha, vector_results),
            (1 - self.alpha, keyword_results),
        )
        for weight, results in weighted_lists:
            for rank, result in enumerate(results, start=1):
                doc_id = result["id"]
                fused_scores[doc_id] = fused_scores.get(doc_id, 0) + weight / (self.rrf_k + rank)
                # Remember the first hit's fields for the final output.
                payloads.setdefault(doc_id, result)

        # Stable descending sort: ties keep first-seen order.
        ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
        return [
            {**payloads[doc_id], "id": doc_id, "score": score}
            for doc_id, score in ranked[:top_k]
        ]
Definition: Reciprocal Rank Fusion
RRF combines multiple ranking lists:
$$\text{RRF}(d) = \sum_{r \in R} \frac{1}{k + \text{rank}_r(d)}$$
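Here k is a constant (typically 60), R is the set of ranking lists, and rank_r(d) is the 1-based position of document d in list r. The weighted variant used in the code above replaces the numerator 1 with a per-list weight (alpha for vector search, 1 - alpha for keyword search).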
Context Compression
Reduce retrieved context to only relevant portions, saving tokens and improving LLM focus.
class ContextCompressor:
    """Use an LLM to keep only the query-relevant sentences of retrieved text.

    Args:
        llm: Object exposing ``generate(prompt)`` returning text.
    """

    def __init__(self, llm):
        self.llm = llm

    def compress(self, query: str, documents: list[str]) -> str:
        """Return the LLM's extraction of sentences relevant to ``query``.

        Args:
            query: The question the context should help answer.
            documents: Retrieved document texts; they are joined with newlines
                inside the prompt.

        Returns:
            The raw LLM output. An empty ``documents`` list yields ``""``
            without calling the LLM, since there is nothing to extract from.
        """
        # No context: skip the LLM call rather than prompt it with nothing.
        if not documents:
            return ""

        # Joined up front because a backslash escape cannot appear inside an
        # f-string expression on older Python versions.
        joined_documents = "\n".join(documents)
        prompt = f"""Given the question and documents below, extract ONLY the
sentences that are relevant to answering the question.
Remove irrelevant information.
Question: {query}
Documents:
{joined_documents}
Relevant sentences:"""
        return self.llm.generate(prompt)
Advanced RAG techniques significantly improve retrieval quality over basic similarity search. Combining multiple strategies — HyDE for query-document alignment, re-ranking for precision, and hybrid search for recall — produces the best production results.