Information Retrieval: RAG Architecture and Implementation (Free Lesson)
Advertisement
Retrieval-Augmented Generation
RAG combines the power of pre-trained language models with external knowledge retrieval, enabling models to generate answers grounded in factual documents.
RAG Pipeline
Stage
Component
Purpose
Indexing
Chunker + Embedder
Prepare documents for retrieval
Retrieval
Vector search
Find relevant documents
Augmentation
Prompt construction
Combine query + context
Generation
LLM
Generate answer
Document Chunking Strategies
from langchain.text_splitter import (
RecursiveCharacterTextSplitter,
CharacterTextSplitter,
)
from sentence_transformers import SentenceTransformer
# Strategy 1: Fixed-size chunks
def fixed_size_chunks(text, chunk_size=512, overlap=50):
    """Split *text* into fixed-size chunks using newline as the separator.

    ``chunk_size`` and ``overlap`` are forwarded to ``CharacterTextSplitter``
    as ``chunk_size`` and ``chunk_overlap`` respectively.

    Returns whatever ``CharacterTextSplitter.split_text`` returns for *text*.
    """
    return CharacterTextSplitter(
        separator="\n",
        chunk_overlap=overlap,
        chunk_size=chunk_size,
    ).split_text(text)
# Strategy 2: Recursive character splitting
def recursive_chunks(text, chunk_size=1000, overlap=200):
    """Split *text* with ``RecursiveCharacterTextSplitter``.

    The splitter is given an ordered list of separators, from paragraph
    breaks down to the empty string, together with ``chunk_size`` and
    ``overlap`` (forwarded as ``chunk_overlap``).

    Returns whatever ``RecursiveCharacterTextSplitter.split_text`` returns.
    """
    # Ordered from coarsest boundary (blank line) to finest (no separator).
    boundary_order = ["\n\n", "\n", ". ", " ", ""]
    text_splitter = RecursiveCharacterTextSplitter(
        separators=boundary_order,
        chunk_overlap=overlap,
        chunk_size=chunk_size,
    )
    pieces = text_splitter.split_text(text)
    return pieces
# Strategy 3: Semantic chunking
def semantic_chunks(text, model_name='all-MiniLM-L6-v2'):
    """Split *text* into chunks at points where adjacent sentences diverge.

    The text is split into sentences on ``'. '``, each sentence is embedded
    with a ``SentenceTransformer`` model, and the cosine similarity between
    every pair of neighbouring sentences is computed. A new chunk starts
    wherever that similarity falls below ``mean - std`` of all neighbour
    similarities.

    Args:
        text: The text to split.
        model_name: Name of the sentence-transformers model to load.

    Returns:
        A list of chunk strings, each made of consecutive sentences re-joined
        with ``'. '``. Text with fewer than two sentences is returned as a
        single chunk without loading the model.
    """
    # Local import keeps this snippet self-contained: the original referenced
    # `np` and `cosine_similarity` without importing either.
    import numpy as np

    sentences = text.split('. ')
    if len(sentences) < 2:
        # No neighbouring pair to compare, so there is nothing to split on
        # (and mean/std of an empty list would be NaN).
        return [text]

    model = SentenceTransformer(model_name)
    # assumes encode() yields one fixed-length vector per sentence — TODO confirm
    embeddings = np.asarray(model.encode(sentences), dtype=float)

    # Cosine similarity between each sentence and its successor, vectorised.
    norms = np.linalg.norm(embeddings, axis=1)
    dots = np.sum(embeddings[:-1] * embeddings[1:], axis=1)
    denom = norms[:-1] * norms[1:]
    # A zero-norm embedding has no direction; treat its similarity as 0.
    similarities = np.divide(
        dots, denom, out=np.zeros_like(dots), where=denom > 0
    )

    # Split at low similarity points
    threshold = similarities.mean() - similarities.std()

    chunks = []
    current_chunk = [sentences[0]]
    for sim, next_sentence in zip(similarities, sentences[1:]):
        if sim < threshold:
            chunks.append('. '.join(current_chunk))
            current_chunk = [next_sentence]
        else:
            current_chunk.append(next_sentence)
    chunks.append('. '.join(current_chunk))
    return chunks
# Strategy 4: Document-aware chunking
def document_aware_chunks(text, metadata):
    """Group the lines of *text* into sections delimited by ``#`` headings.

    A line starting with ``#`` closes the section collected so far and opens
    a new one. Each closed section becomes a dict with its full text and, as
    ``"section"``, its first line. Whatever remains at the end is emitted with
    the section label ``"final"``. ``metadata`` is accepted but not used.
    """
    sections = []
    pending = []  # lines (each with a trailing newline) of the open section

    for row in text.split('\n'):
        terminated = row + '\n'
        if not row.startswith('#'):
            pending.append(terminated)
            continue
        # A heading: flush the section gathered so far, then start afresh.
        if pending:
            body = ''.join(pending)
            sections.append({
                "text": body,
                "section": body.split('\n')[0]
            })
        pending = [terminated]

    if pending:
        sections.append({"text": ''.join(pending), "section": "final"})
    return sections
Complete RAG Implementation
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
class RAGSystem:
    """Minimal retrieval-augmented generation pipeline backed by FAISS.

    Documents are chunked, embedded and indexed in a FAISS vector store.
    A query retrieves the most similar chunks and combines them with the
    question into a prompt. No LLM is called by this class: the final
    generation step is left to the caller, who receives the prompt.
    """

    def __init__(self, embedding_model="all-MiniLM-L6-v2"):
        """Create the embedding model; nothing is indexed yet.

        Args:
            embedding_model: Model name passed to ``HuggingFaceEmbeddings``.
        """
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        self.vectorstore = None
        self.retriever = None

    def index_documents(self, documents, chunk_size=500, chunk_overlap=50):
        """Index documents into vector store.

        Splits *documents* with ``RecursiveCharacterTextSplitter``, builds a
        FAISS store from the chunks and keeps a similarity retriever (k=4)
        on ``self.retriever``. Prints a one-line summary.

        Args:
            documents: Documents accepted by ``split_documents``.
            chunk_size: Forwarded to the splitter.
            chunk_overlap: Forwarded to the splitter.
        """
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        chunks = splitter.split_documents(documents)
        self.vectorstore = FAISS.from_documents(chunks, self.embeddings)
        self.retriever = self.vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 4}
        )
        print(f"Indexed {len(chunks)} chunks from {len(documents)} documents")

    def retrieve(self, query, k=4):
        """Retrieve the *k* chunks most similar to *query*.

        Args:
            query: The search text.
            k: Number of chunks to return.

        Returns:
            The documents returned by the vector store's similarity search.

        Raises:
            RuntimeError: If no documents have been indexed yet.
        """
        if self.vectorstore is None:
            raise RuntimeError(
                "No documents indexed; call index_documents() first"
            )
        # Query the store directly so the caller's k is honoured; the stored
        # retriever is pinned to k=4 and previously made this argument a no-op.
        return self.vectorstore.similarity_search(query, k=k)

    def generate(self, query, context_docs):
        """Build the prompt for *query* from the retrieved documents.

        Despite the name, this only constructs the prompt text; it does not
        call a language model.

        Args:
            query: The user question.
            context_docs: Objects exposing a ``page_content`` attribute.

        Returns:
            The prompt string containing the joined context and the question.
        """
        context = "\n\n".join(doc.page_content for doc in context_docs)
        prompt = (
            "Answer the question based on the context below.\n"
            f"Context: {context}\n"
            f"Question: {query}\n"
            "Answer:"
        )
        return prompt

    def query(self, question, k=4):
        """Run retrieval and prompt construction for *question*.

        Args:
            question: The user question.
            k: Number of chunks to retrieve.

        Returns:
            A dict with the ``"question"``, the retrieved ``"context"`` texts
            and the assembled ``"prompt"``.
        """
        # Retrieve
        docs = self.retrieve(question, k=k)
        # Augment
        prompt = self.generate(question, docs)
        # Generate (left to the caller: the prompt is returned, not answered)
        return {
            "question": question,
            "context": [d.page_content for d in docs],
            "prompt": prompt
        }
# Usage: end-to-end example (build the system, load a file, index it, query it).
# Construct the pipeline with its default embedding model.
rag = RAGSystem()
# Load documents from a text file.
# NOTE(review): the path is relative, so presumably "knowledge_base.txt" must
# exist in the current working directory — confirm before running.
from langchain.document_loaders import TextLoader
loader = TextLoader("knowledge_base.txt")
documents = loader.load()
# Index: chunk, embed and store the loaded documents.
rag.index_documents(documents)
# Query: the result is a dict; its "context" entry lists the retrieved texts.
result = rag.query("What is machine learning?")
print(f"Retrieved {len(result['context'])} documents")
Advanced RAG Techniques
Technique
Description
Benefit
Hybrid search
Combine sparse + dense
Better recall
Reranking
Cross-encoder reranker
Better precision
Query expansion
Generate sub-queries
Better coverage
HyDE
Hypothetical document embedding
Better matching
Self-RAG
Model decides when to retrieve
Adaptive
# HyDE: Hypothetical Document Embeddings
def hyde_retrieval(query, llm, retriever):
    """Retrieve documents using an LLM-written hypothetical answer (HyDE).

    *llm* is called with a prompt asking for a detailed answer to *query*;
    its output, rather than the query itself, is then passed to
    ``retriever.get_relevant_documents``.

    Returns the documents the retriever returns for that hypothetical answer.
    """
    # Draft an answer first, then search with it instead of the raw query.
    draft_answer = llm(f"Write a detailed answer to: {query}")
    return retriever.get_relevant_documents(draft_answer)
# Query expansion
def query_expansion(query, llm, num_queries=3, retriever=None, max_docs=10):
    """Retrieve documents for several LLM-generated rewrites of *query*.

    *llm* is asked for ``num_queries`` alternative phrasings, one per line.
    Every non-blank line of its reply is used as a search query, and the
    results are merged with duplicates (by ``page_content``) removed.

    Args:
        query: The original user query.
        llm: Callable taking a prompt string and returning a string.
        num_queries: Number of rewrites requested in the prompt. The reply is
            not truncated to this number.
        retriever: Object exposing ``get_relevant_documents(text)``. If
            omitted, a module-level ``retriever`` is used when one exists,
            matching what the original implementation implicitly relied on.
        max_docs: Maximum number of documents to return; ``None`` disables
            the cap.

    Returns:
        Up to ``max_docs`` unique documents, in order of first appearance.

    Raises:
        ValueError: If no retriever is passed and none exists at module level.
    """
    if retriever is None:
        # Backward compatibility: earlier versions read a global `retriever`
        # (and raised NameError when it was missing).
        retriever = globals().get("retriever")
        if retriever is None:
            raise ValueError(
                "query_expansion requires a retriever; pass retriever=..."
            )

    prompt = (
        f"Generate {num_queries} different versions of this query:\n"
        f"{query}\n"
        "Return each on a new line."
    )
    expanded = llm(prompt)
    queries = [q.strip() for q in expanded.split('\n') if q.strip()]

    # Retrieve for each rewrite and deduplicate on the fly. A dict keeps the
    # first document seen for each distinct page_content, in insertion order.
    unique_docs = {}
    for q in queries:
        for doc in retriever.get_relevant_documents(q):
            unique_docs.setdefault(doc.page_content, doc)

    return list(unique_docs.values())[:max_docs]
Evaluation Metrics
Metric
Measures
Range
Context Precision
Relevant docs in top-k
0-1
Context Recall
Coverage of relevant docs
0-1
Faithfulness
Answer grounded in context
0-1
Answer Relevancy
Answer addresses query
0-1
Faithfulness Score
RAG reduces hallucination by grounding responses in retrieved documents. However, it can still generate incorrect answers if the retrieved context is insufficient or misleading.
RAG Query Processing
β
Premium Content
Retrieval-Augmented Generation
Unlock this lesson and 900+ advanced tutorials with a Premium plan.