Beyond Text-Only RAG
Real-world documents contain text, images, tables, charts, and code. Multi-modal RAG retrieves and processes content across these modalities to provide comprehensive context to LLMs.
Document Parsing for Multi-Modal Content
PDF with Images and Tables
import fitz # PyMuPDF
from PIL import Image
import io
class MultiModalPDFParser:
    """Split a PDF into text, image, and table items, page by page.

    Every item is a dict with a ``type`` (``"text"``, ``"image"`` or
    ``"table"``), the extracted ``content``, and the 1-based ``page`` number.
    Text items also carry the block ``bbox``; image items carry a
    ``metadata`` dict with the image ``format`` and byte ``size``.
    """

    def __init__(self):
        self.text_parser = PDFProcessor()
        self.image_parser = ImageProcessor()
        self.table_parser = TableExtractor()

    def parse(self, pdf_path: str) -> list[dict]:
        """Return all content items found in the PDF at ``pdf_path``.

        Items are ordered by page; within a page, text blocks come first,
        then embedded images, then tables.
        """
        content_items = []
        # The context manager closes the document even if extraction fails
        # part-way through a page.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc):
                content_items.extend(self._page_items(doc, page, page_num + 1))
        return content_items

    def _page_items(self, doc, page, page_number: int) -> list[dict]:
        """Collect the text, image, and table items of a single page."""
        items = []

        # Text blocks (type 0); image blocks (type 1) are handled below
        # through the page's image list.
        for block in page.get_text("dict")["blocks"]:
            if block["type"] != 0:
                continue
            text = self._block_text(block)
            if not text.strip():
                continue  # nothing useful to embed
            items.append({
                "type": "text",
                "content": text,
                "page": page_number,
                "bbox": block["bbox"],
            })

        # Embedded images, addressed by their xref (first tuple element).
        for img in page.get_images():
            base_image = doc.extract_image(img[0])
            if not base_image:
                continue  # extraction failed for this xref
            image_bytes = base_image["image"]
            items.append({
                "type": "image",
                "content": image_bytes,
                "page": page_number,
                "metadata": {
                    "format": base_image["ext"],
                    "size": len(image_bytes),
                },
            })

        # Tables, as returned by the injected table extractor.
        for table in self.table_parser.extract(page):
            items.append({
                "type": "table",
                "content": table,
                "page": page_number,
            })

        return items

    @staticmethod
    def _block_text(block: dict) -> str:
        """Join the span texts of a text block, one output line per line.

        In PyMuPDF's "dict" output a text block has no flat text field; the
        text lives in ``block["lines"][i]["spans"][j]["text"]``.
        """
        return "\n".join(
            "".join(span.get("text", "") for span in line.get("spans", ()))
            for line in block.get("lines", ())
        )
Image Embedding and Retrieval
CLIP for Image-Text Alignment
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
class CLIPEmbedder:
    """Embed images and text with a CLIP model so they can be compared."""

    def __init__(self, model_name: str = "openai/clip-vit-base-patch32"):
        """Load the CLIP model and its processor for ``model_name``."""
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)

    def embed_image(self, image: Image.Image) -> torch.Tensor:
        """Return the image features, L2-normalized along the last dimension."""
        inputs = self.processor(images=image, return_tensors="pt")
        with torch.no_grad():
            features = self.model.get_image_features(**inputs)
        return features / features.norm(dim=-1, keepdim=True)

    def embed_text(self, text: str) -> torch.Tensor:
        """Return the text features, L2-normalized along the last dimension.

        Input longer than the tokenizer's maximum length is truncated;
        without truncation, over-long text makes the text encoder fail.
        """
        inputs = self.processor(
            text=[text],
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        with torch.no_grad():
            features = self.model.get_text_features(**inputs)
        return features / features.norm(dim=-1, keepdim=True)

    def compute_similarity(self, image: Image.Image, text: str) -> float:
        """Return the dot product of the normalized image and text embeddings.

        Both embeddings are unit-length, so this is their cosine similarity.
        """
        img_emb = self.embed_image(image)
        text_emb = self.embed_text(text)
        return float(torch.dot(img_emb.flatten(), text_emb.flatten()))
Multi-Modal Vector Store
class MultiModalVectorStore:
    """Index text, image, and table items in one vector store.

    Text and tables are embedded with ``text_embedder``; images are embedded
    with the CLIP embedder. Each stored vector is tagged with a
    ``content_type`` metadata field so searches can filter by modality.
    """

    def __init__(self, clip_embedder, text_embedder, vector_store):
        self.clip = clip_embedder
        self.text_embedder = text_embedder
        self.store = vector_store

    def index_item(self, item: dict):
        """Embed ``item`` according to its ``type`` and upsert it.

        Raises:
            ValueError: if ``item["type"]`` is not text, image, or table.
        """
        kind = item["type"]
        if kind == "text":
            embedding = self.text_embedder.embed([item["content"]])[0]
        elif kind == "image":
            image = Image.open(io.BytesIO(item["content"]))
            embedding = self.clip.embed_image(image)
        elif kind == "table":
            # Tables are embedded through a plain-text rendering.
            table_text = self._table_to_text(item["content"])
            embedding = self.text_embedder.embed([table_text])[0]
        else:
            raise ValueError(f"Unsupported content type: {kind!r}")

        item_id = item["id"] if "id" in item else self._stable_id(item)
        self.store.upsert(vectors=[{
            "id": item_id,
            "values": self._to_vector(embedding),
            "metadata": {**item.get("metadata", {}), "content_type": kind},
        }])

    def search(self, query: str, modality: str = None, top_k: int = 10) -> list[dict]:
        """Search the store for ``query``, optionally limited to one modality.

        Image vectors are produced by CLIP, so an image-only search embeds
        the query with CLIP's text encoder to stay in the same space.
        Every other search uses the text embedder.

        NOTE(review): with ``modality=None`` the query is in the text
        embedder's space, so scores against image vectors are not
        meaningful unless both embedders share a space — confirm the
        intended behaviour for mixed searches.
        """
        if modality == "image":
            query_embedding = self.clip.embed_text(query)
        else:
            query_embedding = self.text_embedder.embed([query])[0]
        filter_dict = {"content_type": {"$eq": modality}} if modality else None
        return self.store.search(
            vector=self._to_vector(query_embedding),
            top_k=top_k,
            filter=filter_dict,
        )

    @staticmethod
    def _to_vector(embedding) -> list:
        """Convert an embedding (tensor, array, or sequence) to a flat list."""
        if hasattr(embedding, "flatten"):
            embedding = embedding.flatten()
        if hasattr(embedding, "tolist"):
            return embedding.tolist()
        return list(embedding)

    @staticmethod
    def _stable_id(item: dict) -> str:
        """Derive an id from the item's type, page, and content.

        A cryptographic digest is used instead of ``hash()`` because string
        hashes are salted per process and would change between runs.
        """
        import hashlib

        content = item["content"]
        if isinstance(content, (bytes, bytearray)):
            payload = bytes(content)
        else:
            payload = repr(content).encode("utf-8")
        prefix = f"{item['type']}|{item.get('page')}|".encode("utf-8")
        return f"item_{hashlib.sha256(prefix + payload).hexdigest()[:32]}"

    @staticmethod
    def _table_to_text(table) -> str:
        """Render a table as text: a header line, then one line per row.

        Assumes ``table`` is either already a string or a sequence of rows
        whose first row holds the column headers — TODO confirm against the
        table extractor's output format.
        """
        if isinstance(table, str):
            return table
        rows = [
            ["" if cell is None else str(cell) for cell in row]
            for row in table
        ]
        if not rows:
            return ""
        headers, *body = rows
        parts = [f"Table with columns: {', '.join(headers)}"]
        parts.extend(
            "; ".join(f"{h}: {v}" for h, v in zip(headers, row))
            for row in body
        )
        return "\n".join(parts)
Table Understanding
class TableProcessor:
    """Turn tabular data into text and answer questions about it via an LLM."""

    def __init__(self, llm):
        self.llm = llm

    def table_to_text(self, table_data: list[list[str]], max_rows: int = 20) -> str:
        """Convert a table to descriptive text for embedding.

        The first row is treated as the header. At most ``max_rows`` data
        rows are rendered, each as ``header: value`` pairs joined by ``"; "``.
        Cells that are ``None`` become empty strings and other non-string
        cells are converted with ``str``, so extractor output with missing
        cells does not break the join.

        Returns an empty string for an empty table.
        """
        if not table_data:
            return ""
        headers = [self._cell(h) for h in table_data[0]]
        text_parts = [f"Table with columns: {', '.join(headers)}"]
        for row in table_data[1:max_rows + 1]:
            # zip stops at the shorter of headers/row, as before.
            text_parts.append(
                "; ".join(f"{h}: {self._cell(v)}" for h, v in zip(headers, row))
            )
        return "\n".join(text_parts)

    def answer_from_table(self, question: str, table_data: list[list[str]]) -> str:
        """Use the LLM to answer ``question`` from the rendered table text."""
        table_text = self.table_to_text(table_data)
        prompt = (
            "Answer this question based on the table data.\n"
            "Table:\n"
            f"{table_text}\n"
            f"Question: {question}\n"
            "Answer:"
        )
        return self.llm.generate(prompt)

    @staticmethod
    def _cell(value) -> str:
        """Render one cell as text; ``None`` becomes an empty string."""
        return "" if value is None else str(value)
Chart and Diagram Understanding
class ChartProcessor:
    """Describe charts and extract their data with a vision-capable LLM."""

    _DESCRIBE_PROMPT = "\n".join((
        "Describe this chart in detail. Include:",
        "1. Chart type (bar, line, pie, etc.)",
        "2. Axis labels and units",
        "3. Key data points and trends",
        "4. Any notable patterns or outliers",
    ))

    _EXTRACT_PROMPT = "\n".join((
        "Extract the numerical data from this chart.",
        "Return as JSON with the following format:",
        "{",
        '"title": "Chart title",',
        '"x_axis": "label",',
        '"y_axis": "label",',
        '"data_series": [',
        '{"name": "series_name", "values": [{"x": ..., "y": ...}, ...]}',
        "]",
        "}",
    ))

    def __init__(self, vision_llm):
        self.llm = vision_llm

    def describe_chart(self, image: Image.Image) -> str:
        """Return the LLM's free-text description of the chart image."""
        return self.llm.generate_with_image(self._DESCRIBE_PROMPT, image)

    def extract_data_from_chart(self, image: Image.Image) -> dict:
        """Ask the LLM for the chart's data as JSON and return it parsed.

        Raises:
            json.JSONDecodeError: if no JSON can be parsed from the reply.
        """
        response = self.llm.generate_with_image(self._EXTRACT_PROMPT, image)
        return self._parse_json(response)

    @staticmethod
    def _parse_json(response: str):
        """Parse JSON from a model reply, tolerating fences and extra prose.

        The reply is parsed as-is first. If that fails, the span from the
        first ``{`` to the last ``}`` is parsed instead, which covers
        replies wrapped in Markdown code fences or explanatory text.
        """
        import json  # local import: the module is not imported at file level

        text = response.strip()
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            start, end = text.find("{"), text.rfind("}")
            if start == -1 or end <= start:
                raise
            return json.loads(text[start:end + 1])
Multi-Modal Generation
class MultiModalRAGGenerator:
    """Answer questions from retrieved text, image, and table context."""

    def __init__(self, retriever, llm, image_captioner):
        self.retriever = retriever
        self.llm = llm
        self.captioner = image_captioner

    def generate(self, query: str, top_k: int = 10) -> str:
        """Retrieve context for ``query`` and return the LLM's answer.

        Args:
            query: The user question.
            top_k: How many results to request from the retriever.

        Each result is rendered by its ``type``: text and table content is
        included as-is, and image content is replaced by a caption from the
        image captioner. Results of any other type are left out.
        """
        results = self.retriever.search(query, top_k=top_k)

        context_parts = []
        for result in results:
            kind = result["type"]
            if kind == "text":
                context_parts.append(f"[Text]: {result['content']}")
            elif kind == "image":
                # The LLM prompt is text-only, so images go in as captions.
                caption = self.captioner.describe(result["content"])
                context_parts.append(f"[Image]: {caption}")
            elif kind == "table":
                context_parts.append(f"[Table]: {result['content']}")

        context = "\n\n".join(context_parts)
        prompt = (
            "Answer the question using the provided context.\n"
            "The context may contain text, image descriptions, and tables.\n"
            "Context:\n"
            f"{context}\n"
            f"Question: {query}\n"
            "Answer:"
        )
        return self.llm.generate(prompt)
Multi-Modal Evaluation
| Metric | What It Measures | Applicable Modalities |
|---|---|---|
| Cross-modal retrieval Recall@K | Fraction of relevant items, of any modality, retrieved in the top K results | All |
| Image caption accuracy | Image understanding quality | Images |
| Table QA accuracy | Table comprehension | Tables |
| Multi-modal faithfulness | Grounding across modalities | All |
Multi-modal RAG enables comprehensive document understanding by retrieving and reasoning across text, images, tables, and other content types within a unified pipeline.