Embedding Models in RAG
Embedding models convert text into dense vector representations that capture semantic meaning. The quality of embeddings directly determines retrieval accuracy and downstream RAG performance.
Model Comparison
| Model | Dimensions | Max Tokens | Speed | Quality (MTEB) | Cost |
|---|---|---|---|---|---|
| text-embedding-3-large | 3072 | 8191 | Fast | 64.6 | $0.13/M tokens |
| text-embedding-3-small | 1536 | 8191 | Fast | 62.3 | $0.02/M tokens |
| embed-english-v3.0 | 1024 | 512 | Fast | 64.6 | $0.10/M tokens |
| all-MiniLM-L6-v2 | 384 | 256 | Very Fast | 56.3 | Free (self-host) |
| bge-large-en-v1.5 | 1024 | 512 | Medium | 64.2 | Free (self-host) |
| e5-large-v2 | 1024 | 512 | Medium | 62.2 | Free (self-host) |
OpenAI Embeddings
from openai import OpenAI
import numpy as np
# Module-level client; the standalone dimension-reduction snippet later in this
# document calls it directly (the embedder classes build their own OpenAI()).
# NOTE(review): "YOUR_API_KEY" looks like a placeholder — substitute a real key
# from configuration before running, and never commit a real key to source.
client = OpenAI(api_key="YOUR_API_KEY")
class OpenAIEmbedder:
    """Embed documents and queries with an OpenAI embedding model.

    Documents and queries go through the same request path, with the same
    model and the same optional ``dimensions`` setting, so the vectors they
    produce always have matching sizes and can be compared directly.
    """

    def __init__(self, model: str = "text-embedding-3-small",
                 dimensions: "int | None" = None):
        """Create the embedder.

        Args:
            model: Name of the embedding model to request.
            dimensions: Optional reduced output size. When ``None`` (default)
                the parameter is not sent and the model returns its native
                size (1536 for the default model, per the comparison table).
        """
        self.model = model
        self.dimensions = dimensions
        self.client = OpenAI()

    def _request(self, inputs: list[str]) -> list:
        """Send one embeddings request and return the raw vectors in order."""
        extra = {}
        # Only send ``dimensions`` when explicitly configured, so documents
        # and queries can never end up with different vector sizes.
        if self.dimensions is not None:
            extra["dimensions"] = self.dimensions
        response = self.client.embeddings.create(
            model=self.model,
            input=inputs,
            **extra,
        )
        return [item.embedding for item in response.data]

    def embed(self, texts: list[str], batch_size: int = 100) -> np.ndarray:
        """Embed ``texts`` in batches.

        Args:
            texts: Texts to embed.
            batch_size: Maximum number of texts sent per API request.

        Returns:
            A 2-D array with one row per input text. Empty input yields an
            empty 2-D array without calling the API.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        if len(texts) == 0:
            # Keep the result 2-D so callers can rely on ``.shape[0]``.
            return np.empty((0, self.dimensions or 0))

        vectors = []
        for start in range(0, len(texts), batch_size):
            vectors.extend(self._request(texts[start:start + batch_size]))
        return np.array(vectors)

    def embed_query(self, query: str) -> np.ndarray:
        """Embed a single query string and return its 1-D vector."""
        return np.array(self._request([query])[0])
# Usage: embed two sample documents and one query with the default model
embedder = OpenAIEmbedder()
doc_embeddings = embedder.embed(texts=["Document 1", "Document 2"])
query_embedding = embedder.embed_query(query="Search query")
Self-Hosted Embeddings with sentence-transformers
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
class LocalEmbedder:
    """Embed texts locally with a sentence-transformers model."""

    def __init__(self, model_name: str = "BAAI/bge-large-en-v1.5"):
        """Load ``model_name`` onto CUDA when available, otherwise the CPU."""
        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"
        self.model = SentenceTransformer(model_name, device=self.device)
        self.dimension = self.model.get_sentence_embedding_dimension()

    def embed(self, texts: list[str], batch_size: int = 64,
              show_progress: bool = True) -> np.ndarray:
        """Encode ``texts`` with the loaded model.

        Normalization is always requested and encoding runs on the device
        chosen at construction time.
        """
        encode_options = {
            "batch_size": batch_size,
            "show_progress_bar": show_progress,
            "normalize_embeddings": True,
            "device": self.device,
        }
        return self.model.encode(texts, **encode_options)

    def embed_with_metadata(self, documents: list[dict]) -> list[dict]:
        """Return copies of ``documents`` with an added ``"embedding"`` list.

        Each document must carry its text under the ``"content"`` key; the
        input dictionaries themselves are not modified.
        """
        vectors = self.embed([record["content"] for record in documents])
        enriched = []
        for record, vector in zip(documents, vectors):
            enriched.append({**record, "embedding": vector.tolist()})
        return enriched
# GPU-accelerated embedding (LocalEmbedder selects CUDA when it is available).
# large_document_list is not defined in this snippet; supply your own list of texts.
embedder = LocalEmbedder(model_name="BAAI/bge-large-en-v1.5")
embeddings = embedder.embed(texts=large_document_list, batch_size=256)
Embedding Dimension Reduction
Reducing dimensions saves storage and speeds up search with minimal quality loss.
Dimension Reduction Trade-off
The relationship between dimension d and retrieval quality:
\text{Quality}(d) \approx Q_{max} \cdot (1 - e^{-\alpha \cdot d})
Where α is a model-specific constant that controls how quickly quality saturates as d grows. Most models retain >95% quality at 50% dimensions.
# OpenAI dimension reduction (built-in): ask the API for shorter vectors directly
reduction_request = {
    "model": "text-embedding-3-large",
    "input": ["text"],
    "dimensions": 256,  # Reduced from 3072
}
response = client.embeddings.create(**reduction_request)
# Manual PCA reduction for self-hosted models
from sklearn.decomposition import PCA
class DimensionReducer:
    """Reduce embedding dimensionality with PCA.

    Fit the reducer on a sample of embeddings, then project embeddings of
    the same width down to ``target_dim`` components.
    """

    def __init__(self, target_dim: int = 256):
        """Store the number of output components; no model is fitted yet."""
        self.target_dim = target_dim
        self.pca = None  # populated only after a successful fit()

    def fit(self, embeddings: np.ndarray):
        """Fit the PCA projection on ``embeddings``.

        Args:
            embeddings: 2-D array of shape (n_samples, n_features).

        Returns:
            The reducer itself, so ``reducer.fit(x).transform(x)`` works.

        Raises:
            ValueError: If ``embeddings`` is not 2-D, or ``target_dim`` is
                larger than ``min(n_samples, n_features)``.
        """
        shape = np.shape(embeddings)
        if len(shape) != 2:
            raise ValueError(
                f"embeddings must be 2-D (n_samples, n_features), got shape {shape}"
            )
        max_components = min(shape)
        if self.target_dim > max_components:
            raise ValueError(
                f"target_dim={self.target_dim} exceeds "
                f"min(n_samples, n_features)={max_components}"
            )

        # Fit into a local first: if fitting fails, self.pca stays untouched
        # and transform() keeps reporting that fit() is still required.
        pca = PCA(n_components=self.target_dim)
        pca.fit(embeddings)
        self.pca = pca
        return self

    def transform(self, embeddings: np.ndarray) -> np.ndarray:
        """Project ``embeddings`` with the fitted PCA.

        Raises:
            ValueError: If fit() has not completed successfully yet.
        """
        if self.pca is None:
            raise ValueError("Must call fit() first")
        return self.pca.transform(embeddings)
Matryoshka Embeddings
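Some modern embedding models (those trained with Matryoshka representation learning) support variable-dimension output: a shorter vector can be requested from the same model, allowing a trade-off between accuracy and efficiency.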
# Matryoshka dimension selection
class MatryoshkaEmbedder:
    """Embed texts at a caller-chosen dimension and compare rankings."""

    def __init__(self, model_name: str = "text-embedding-3-large"):
        """Store the model name and create the API client."""
        self.model = model_name
        self.client = OpenAI()

    def embed(self, texts: list[str], dimensions: int = 256) -> np.ndarray:
        """Embed ``texts`` at the requested size.

        Returns:
            A 2-D array with one row of length ``dimensions`` per text.
        """
        response = self.client.embeddings.create(
            model=self.model,
            input=texts,
            dimensions=dimensions,
        )
        return np.array([item.embedding for item in response.data])

    def search_at_multiple_dims(self, query: str, documents: list[str],
                                dims: "list[int] | None" = None) -> dict:
        """Rank ``documents`` against ``query`` at each dimension in ``dims``.

        Args:
            query: Query text.
            documents: Candidate texts to rank.
            dims: Embedding sizes to evaluate; defaults to
                ``[256, 1024, 3072]`` when omitted.

        Returns:
            A dict mapping each dimension to the document indices sorted by
            descending cosine similarity. Empty ``documents`` yields an
            empty ranking per dimension without any API call.
        """
        # None sentinel instead of a list default: a list default would be a
        # single object shared (and mutable) across every call.
        if dims is None:
            dims = [256, 1024, 3072]
        if len(documents) == 0:
            return {d: [] for d in dims}

        results = {}
        for d in dims:
            query_emb = self.embed([query], dimensions=d)[0]
            doc_embs = self.embed(documents, dimensions=d)
            # Cosine similarity between every document row and the query.
            norms = np.linalg.norm(doc_embs, axis=1) * np.linalg.norm(query_emb)
            similarities = np.dot(doc_embs, query_emb) / norms
            results[d] = np.argsort(similarities)[::-1].tolist()
        return results
Embedding Evaluation
| Benchmark | Metric | What It Measures |
|---|---|---|
| MTEB | NDCG@10 | Retrieval quality across tasks |
| BEIR | MAP | Zero-shot retrieval performance |
| STS-B | Pearson/Spearman | Semantic similarity accuracy |
| LCC | Kendall τ | Long-context coherence |
\text{NDCG@K} = \frac{\text{DCG@K}}{\text{IDCG@K}} = \frac{\sum_{i=1}^{K} \frac{2^{rel_i} - 1}{\log_2(i+1)}}{\sum_{i=1}^{K} \frac{2^{rel_i^*} - 1}{\log_2(i+1)}}
Selecting the right embedding model balances quality, latency, cost, and deployment constraints. For most production RAG systems, API-based models (OpenAI, Cohere) offer the best quality-to-ops ratio, while self-hosted models provide cost savings at scale.