Multi-Modal RAG

Beyond Text-Only RAG

Real-world documents contain text, images, tables, charts, and code. Multi-modal RAG retrieves and processes content across these modalities to provide comprehensive context to LLMs.

Document Parsing for Multi-Modal Content

PDF with Images and Tables

import fitz  # PyMuPDF
from PIL import Image
import io

class MultiModalPDFParser:
    """Split a PDF into text, image and table items, page by page.

    Every item produced by :meth:`parse` is a dict with a ``type``
    ("text", "image" or "table"), its ``content`` and the 1-based ``page``
    number it was found on.
    """

    def __init__(self):
        self.text_parser = PDFProcessor()
        self.image_parser = ImageProcessor()
        self.table_parser = TableExtractor()

    @staticmethod
    def _block_text(block: dict) -> str:
        """Return the text of one PyMuPDF text block.

        In ``page.get_text("dict")`` output a text block has no flat text
        field: the text lives in ``block["lines"][i]["spans"][j]["text"]``.
        Spans of a line are concatenated; lines are joined with newlines.
        """
        return "\n".join(
            "".join(span.get("text", "") for span in line.get("spans", []))
            for line in block.get("lines", [])
        )

    def parse(self, pdf_path: str) -> list[dict]:
        """Extract all text blocks, embedded images and tables from a PDF.

        Args:
            pdf_path: Path of the PDF file to open with PyMuPDF.

        Returns:
            A list of content items in page order. Within a page, text
            blocks come first, then images, then tables.
        """
        content_items = []
        doc = fitz.open(pdf_path)
        try:
            for page_num, page in enumerate(doc, start=1):
                # Text blocks (type 0); other block types are skipped here.
                for block in page.get_text("dict")["blocks"]:
                    if block["type"] != 0:
                        continue
                    text = self._block_text(block)
                    if not text.strip():
                        # Whitespace-only blocks carry nothing to index.
                        continue
                    content_items.append({
                        "type": "text",
                        "content": text,
                        "page": page_num,
                        "bbox": block["bbox"],
                    })

                # Embedded images: the first entry of each tuple is the xref.
                for img in page.get_images():
                    base_image = doc.extract_image(img[0])
                    if not base_image:
                        # Nothing could be extracted for this xref.
                        continue
                    image_bytes = base_image["image"]
                    content_items.append({
                        "type": "image",
                        "content": image_bytes,
                        "page": page_num,
                        "metadata": {
                            "format": base_image["ext"],
                            "size": len(image_bytes),
                        },
                    })

                # Tables, as returned by the injected table extractor.
                for table in self.table_parser.extract(page):
                    content_items.append({
                        "type": "table",
                        "content": table,
                        "page": page_num,
                    })
        finally:
            # Release the file handle even if extraction fails part-way.
            doc.close()

        return content_items

Image Embedding and Retrieval

CLIP for Image-Text Alignment

import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

class CLIPEmbedder:
    """Embed images and text with a CLIP model.

    Both ``embed_*`` methods return L2-normalised features, so the dot
    product of an image embedding and a text embedding is their cosine
    similarity.
    """

    def __init__(self, model_name: str = "openai/clip-vit-base-patch32"):
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)

    def embed_image(self, image: Image.Image) -> torch.Tensor:
        """Return the unit-length CLIP feature tensor for ``image``."""
        inputs = self.processor(images=image, return_tensors="pt")
        with torch.no_grad():
            features = self.model.get_image_features(**inputs)
        return features / features.norm(dim=-1, keepdim=True)

    def embed_text(self, text: str) -> torch.Tensor:
        """Return the unit-length CLIP feature tensor for ``text``.

        The text is truncated to the tokenizer's maximum length. CLIP's
        text encoder has a short fixed context, and without truncation a
        long query or passage fails inside the model instead of being
        embedded.
        """
        inputs = self.processor(
            text=[text],
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        with torch.no_grad():
            features = self.model.get_text_features(**inputs)
        return features / features.norm(dim=-1, keepdim=True)

    def compute_similarity(self, image: Image.Image, text: str) -> float:
        """Return the cosine similarity between one image and one text."""
        img_emb = self.embed_image(image)
        text_emb = self.embed_text(text)
        # Both embeddings are already normalised, so a dot product suffices.
        return float(torch.dot(img_emb.flatten(), text_emb.flatten()))

Multi-Modal Vector Store

class MultiModalVectorStore:
    """Index text, image and table items in one vector store and query them.

    Text and tables are embedded with ``text_embedder``; images are embedded
    with ``clip_embedder``. Each stored vector carries a ``content_type``
    metadata field so searches can be restricted to one modality.
    """

    def __init__(self, clip_embedder, text_embedder, vector_store):
        self.clip = clip_embedder
        self.text_embedder = text_embedder
        self.store = vector_store

    @staticmethod
    def _as_list(vector) -> list:
        """Return ``vector`` as a plain list (tensor, array or list input)."""
        return vector.tolist() if hasattr(vector, "tolist") else list(vector)

    @staticmethod
    def _table_to_text(table) -> str:
        """Render a table as text for embedding.

        A string is returned unchanged. A sequence of rows is rendered with
        the first row as headers, one "header: value" line per data row.
        Anything else falls back to ``str(table)``.
        """
        if isinstance(table, str):
            return table
        try:
            rows = [
                ["" if cell is None else str(cell) for cell in row]
                for row in table
            ]
        except TypeError:
            # Not an iterable of rows; embed its string form instead.
            return str(table)
        if not rows:
            return ""
        headers, *body = rows
        parts = [f"Table with columns: {', '.join(headers)}"]
        parts.extend(
            "; ".join(f"{h}: {v}" for h, v in zip(headers, row))
            for row in body
        )
        return "\n".join(parts)

    @staticmethod
    def _default_id(item: dict) -> str:
        """Derive a deterministic id from the item's type, page and content.

        The built-in ``hash`` of a string is salted per process, so it cannot
        give ids that stay stable across runs; a content digest can, which
        lets re-indexing the same item overwrite it rather than duplicate it.
        """
        import hashlib

        content = item.get("content")
        if isinstance(content, (bytes, bytearray)):
            payload = bytes(content)
        else:
            payload = repr(content).encode("utf-8")
        digest = hashlib.sha256()
        digest.update(f"{item.get('type')}|{item.get('page')}|".encode("utf-8"))
        digest.update(payload)
        return f"item_{digest.hexdigest()[:32]}"

    def index_item(self, item: dict):
        """Embed one content item and upsert it into the vector store.

        Args:
            item: Dict with ``type`` ("text", "image" or "table") and
                ``content``; optional ``id`` and ``metadata``.

        Raises:
            ValueError: If ``item["type"]`` is not a supported modality.
        """
        kind = item["type"]
        if kind == "text":
            embedding = self.text_embedder.embed([item["content"]])[0]
        elif kind == "image":
            image = Image.open(io.BytesIO(item["content"]))
            # Kept as a tensor here; converted to a list once, below.
            embedding = self.clip.embed_image(image).flatten()
        elif kind == "table":
            # Convert table to text representation
            table_text = self._table_to_text(item["content"])
            embedding = self.text_embedder.embed([table_text])[0]
        else:
            raise ValueError(f"Unsupported item type: {kind!r}")

        item_id = item["id"] if "id" in item else self._default_id(item)
        self.store.upsert(vectors=[{
            "id": item_id,
            "values": self._as_list(embedding),
            "metadata": {**item.get("metadata", {}), "content_type": kind},
        }])

    def search(self, query: str, modality: str = None, top_k: int = 10) -> list[dict]:
        """Search the store with a text query.

        Args:
            query: Natural-language query.
            modality: If given, restrict results to this ``content_type``.
            top_k: Maximum number of results to request from the store.

        Returns:
            Whatever the underlying store's ``search`` returns.
        """
        if modality == "image":
            # Images are indexed with CLIP image features, so an image-only
            # query is embedded with the CLIP text encoder.
            # NOTE(review): assumes clip_embedder exposes embed_text, as
            # CLIPEmbedder does -- confirm for other embedders.
            query_embedding = self.clip.embed_text(query).flatten()
        else:
            # NOTE(review): with modality=None the query is embedded with
            # the text embedder only; scores against image vectors are then
            # not comparable -- confirm intended behaviour.
            query_embedding = self.text_embedder.embed([query])[0]
        filter_dict = {"content_type": {"$eq": modality}} if modality else None
        return self.store.search(
            vector=self._as_list(query_embedding),
            top_k=top_k,
            filter=filter_dict,
        )

Table Understanding

class TableProcessor:
    """Turn tabular data into text and answer questions about it with an LLM."""

    def __init__(self, llm):
        self.llm = llm

    @staticmethod
    def _cell(value) -> str:
        """Render one cell as text; ``None`` (an empty cell) becomes ""."""
        return "" if value is None else str(value)

    def table_to_text(self, table_data: list[list[str]], max_rows: int = 20) -> str:
        """Convert table to descriptive text for embedding.

        The first row is treated as the header row. Each of the following
        rows, up to ``max_rows``, becomes one "header: value; ..." line.

        Args:
            table_data: Rows of cells, header row first. Cells that are not
                strings are converted with ``str``; ``None`` is rendered
                as an empty string instead of raising on the header join.
            max_rows: Maximum number of data rows to include.

        Returns:
            The text rendering, or "" when ``table_data`` is empty.
        """
        if not table_data:
            return ""

        headers = [self._cell(h) for h in table_data[0]]
        rows = table_data[1:max_rows + 1]

        text_parts = [f"Table with columns: {', '.join(headers)}"]
        for row in rows:
            # zip stops at the shorter of headers/row, as before.
            row_desc = "; ".join(
                f"{h}: {self._cell(v)}" for h, v in zip(headers, row)
            )
            text_parts.append(row_desc)

        return "\n".join(text_parts)

    def answer_from_table(self, question: str, table_data: list[list[str]]) -> str:
        """Use LLM to answer questions about table data.

        Only the rows kept by ``table_to_text`` with its default ``max_rows``
        are shown to the model.
        """
        table_text = self.table_to_text(table_data)
        prompt = f"""Answer this question based on the table data.

        Table:
        {table_text}

        Question: {question}
        Answer:"""

        return self.llm.generate(prompt)

Chart and Diagram Understanding

class ChartProcessor:
    """Describe charts and extract their data with a vision-capable LLM."""

    def __init__(self, vision_llm):
        self.llm = vision_llm

    def describe_chart(self, image: Image.Image) -> str:
        """Return the model's free-text description of the chart image."""
        prompt = """Describe this chart in detail. Include:
        1. Chart type (bar, line, pie, etc.)
        2. Axis labels and units
        3. Key data points and trends
        4. Any notable patterns or outliers"""

        return self.llm.generate_with_image(prompt, image)

    @staticmethod
    def _extract_json_text(response: str) -> str:
        """Return the outermost ``{...}`` span of ``response``.

        Model replies often wrap the JSON in a Markdown code fence or add a
        sentence around it; slicing from the first "{" to the last "}"
        drops that wrapping. If no such span exists the stripped response
        is returned unchanged so the caller's JSON parse reports the error.
        """
        text = response.strip()
        start = text.find("{")
        end = text.rfind("}")
        if start != -1 and end > start:
            return text[start:end + 1]
        return text

    def extract_data_from_chart(self, image: Image.Image) -> dict:
        """Ask the model for the chart's data and parse the JSON reply.

        Raises:
            json.JSONDecodeError: If the reply contains no parseable JSON.
        """
        # Imported locally: this module's visible imports do not include json.
        import json

        prompt = """Extract the numerical data from this chart.
        Return as JSON with the following format:
        {
            "title": "Chart title",
            "x_axis": "label",
            "y_axis": "label",
            "data_series": [
                {"name": "series_name", "values": [{"x": ..., "y": ...}, ...]}
            ]
        }"""

        response = self.llm.generate_with_image(prompt, image)
        return json.loads(self._extract_json_text(response))

Multi-Modal Generation

class MultiModalRAGGenerator:
    """Answer a query from retrieved text, image and table results."""

    def __init__(self, retriever, llm, image_captioner):
        self.retriever = retriever
        self.llm = llm
        self.captioner = image_captioner

    def _render(self, hit):
        """Return the context line for one retrieved result.

        Results whose ``type`` is not text, image or table yield ``None``
        and are left out of the context.
        """
        kind = hit["type"]
        if kind == "text":
            return f"[Text]: {hit['content']}"
        if kind == "image":
            # Images enter the prompt as captions from the captioner.
            description = self.captioner.describe(hit["content"])
            return f"[Image]: {description}"
        if kind == "table":
            return f"[Table]: {hit['content']}"
        return None

    def generate(self, query: str) -> str:
        """Retrieve the top 10 results for ``query`` and ask the LLM to answer.

        Returns:
            The value returned by ``llm.generate`` for the assembled prompt.
        """
        hits = self.retriever.search(query, top_k=10)

        # Rendered lazily and in retrieval order; unsupported types drop out.
        rendered = (self._render(hit) for hit in hits)
        context = "\n\n".join(part for part in rendered if part is not None)

        prompt = f"""Answer the question using the provided context.
        The context may contain text, image descriptions, and tables.

        Context:
        {context}

        Question: {query}
        Answer:"""

        return self.llm.generate(prompt)

Multi-Modal Evaluation

Metric	What It Measures	Applicable Modalities
Cross-modal retrieval Recall@K	Whether relevant items of the correct modality appear in the top K results	All
Image caption accuracy	Image understanding quality	Images
Table QA accuracy	Table comprehension	Tables
Multi-modal faithfulness	Grounding across modalities	All

Multi-modal RAG enables comprehensive document understanding by retrieving and reasoning across text, images, tables, and other content types within a unified pipeline.