Why Prompt Management Matters
Prompts are the primary interface between applications and LLMs. Unlike traditional code, prompts are highly sensitive to small wording changes. A prompt management system provides version control, testing, and optimization capabilities for prompt artifacts.
Prompt Versioning System
Every prompt change should be versioned, tagged, and traceable to production outcomes.
import hashlib
import json
from datetime import datetime
from dataclasses import dataclass, field
from typing import Optional
def _utc_now_iso() -> str:
    """Return the current UTC time as a naive ISO-8601 string.

    Produces the same format as ``datetime.utcnow().isoformat()`` without
    calling ``utcnow()``, which is deprecated since Python 3.12.
    """
    # Local import: only `datetime` itself is imported at the top of the file.
    from datetime import timezone

    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()


@dataclass
class PromptVersion:
    """One immutable-by-convention revision of a named prompt template."""

    name: str
    version: int
    template: str
    variables: list[str]
    model: str
    model_params: dict
    metadata: dict = field(default_factory=dict)
    created_at: str = field(default_factory=_utc_now_iso)
    checksum: str = ""

    def __post_init__(self):
        """Derive ``checksum`` from the template, replacing any value passed in."""
        # First 12 hex digits of the SHA-256 of the template text.
        self.checksum = hashlib.sha256(self.template.encode()).hexdigest()[:12]


class PromptStore:
    """In-memory registry of prompt versions with a per-prompt production pointer."""

    def __init__(self):
        self.prompts: dict[str, list[PromptVersion]] = {}
        self.production: dict[str, str] = {}  # name -> version

    def register(self, name: str, template: str, variables: list[str],
                 model: str = "gpt-4",
                 model_params: Optional[dict] = None) -> PromptVersion:
        """Append a new version of ``name`` and return it.

        Version numbers start at 1 and increase by one per registration.
        When ``model_params`` is omitted (``None``) a default of
        ``{"temperature": 0.7, "max_tokens": 512}`` is used; an explicitly
        passed empty dict is kept as-is.
        """
        versions = self.prompts.setdefault(name, [])
        # `is None` rather than truthiness so an explicit `{}` is respected.
        if model_params is None:
            model_params = {"temperature": 0.7, "max_tokens": 512}
        prompt = PromptVersion(
            name=name,
            version=len(versions) + 1,
            template=template,
            variables=variables,
            model=model,
            model_params=model_params,
        )
        versions.append(prompt)
        return prompt

    @staticmethod
    def _parse_version(version) -> int:
        """Convert a version spec (``"v2"``, ``"2"`` or ``2``) to its number."""
        if isinstance(version, int):
            return version
        text = str(version).strip()
        if text[:1] in ("v", "V"):
            text = text[1:]
        try:
            return int(text)
        except ValueError:
            raise ValueError(
                f"Invalid version spec {version!r}; expected 'latest', 'v<N>' or '<N>'"
            ) from None

    def get(self, name: str, version: str = "latest") -> PromptVersion:
        """Return the requested version of prompt ``name``.

        ``version`` may be ``"latest"``, a string such as ``"v2"`` / ``"2"``,
        or an int.

        Raises:
            ValueError: if the prompt or the requested version does not exist,
                or the version spec cannot be parsed.
        """
        versions = self.prompts.get(name)
        if not versions:
            raise ValueError(f"Prompt '{name}' not found")
        if version == "latest":
            return versions[-1]
        number = self._parse_version(version)
        for candidate in versions:
            if candidate.version == number:
                return candidate
        # An explicit error instead of a StopIteration leaking out of next().
        raise ValueError(f"Version '{version}' of prompt '{name}' not found")

    def promote(self, name: str, version: str):
        """Mark ``version`` of ``name`` as the one ``render`` should use.

        Raises:
            ValueError: if the prompt or version is unknown, so a bad
                promotion fails here rather than on the next render.
        """
        self.get(name, version)  # validation only
        self.production[name] = version

    def render(self, name: str, variables: dict) -> str:
        """Fill the production (or, if none promoted, latest) template with ``variables``."""
        prompt = self.get(name, self.production.get(name, "latest"))
        return prompt.template.format(**variables)
Prompt Templates
System Prompt Architecture
# Modular prompt construction
class PromptBuilder:
    """Assemble system messages from named sections, highest priority first."""

    def __init__(self):
        # name -> {"content": str, "priority": int}
        self.sections = {}

    def add_section(self, name: str, content: str, priority: int = 0):
        """Add a section, replacing any existing section with the same name."""
        self.sections[name] = {"content": content, "priority": priority}

    def build(self, max_tokens: int = 4096, chars_per_token: int = 3) -> list[dict]:
        """Return system messages in descending priority within a size budget.

        The budget is approximated in characters as
        ``max_tokens * chars_per_token``. Sections are taken in priority
        order (ties keep insertion order) and assembly stops at the first
        section that would exceed the budget, so a lower-priority section is
        never included ahead of a skipped higher-priority one.

        Args:
            max_tokens: Token budget for the combined sections.
            chars_per_token: Characters assumed per token when converting the
                token budget to a character budget.

        Returns:
            A list of ``{"role": "system", "content": ...}`` dicts.
        """
        char_budget = max_tokens * chars_per_token
        ranked = sorted(self.sections.values(), key=lambda s: -s["priority"])

        messages = []
        used_chars = 0
        for section in ranked:
            text = section["content"]
            if used_chars + len(text) > char_budget:
                break
            messages.append({"role": "system", "content": text})
            used_chars += len(text)
        return messages
# Usage
builder = PromptBuilder()
# (section name, content, priority) -- higher priority is emitted first.
for section_name, section_text, section_priority in (
    ("role", "You are a senior data engineer.", 10),
    ("constraints", "Always use type hints. Never use global variables.", 8),
    ("examples", "Example: def process(data: pd.DataFrame) -> dict:", 5),
    ("format", "Return JSON with keys: result, confidence", 3),
):
    builder.add_section(section_name, section_text, priority=section_priority)
messages = builder.build()
Jinja2 Template Engine
from jinja2 import Template
# Production-grade prompt template
summarization_prompt = Template("""
You are an expert summarizer. Follow these rules:
{% for rule in rules %}
- {{ rule }}
{% endfor %}
Document to summarize:
---
{{ document }}
---
Provide a {{ length }} summary focusing on: {{ focus_area }}
""")
# Template.render accepts a single mapping as well as keyword arguments;
# the context is passed here as one dict.
rendered = summarization_prompt.render({
    "rules": [
        "Maintain factual accuracy",
        "Use clear, concise language",
        "Include key metrics and dates",
    ],
    "document": article_text,
    "length": "3-paragraph",
    "focus_area": "financial performance",
})
A/B Testing Prompts
Systematic A/B testing identifies which prompt variants produce the best results in production.
import random
import time
from dataclasses import dataclass
from typing import Callable
@dataclass
class PromptVariant:
    """One arm of an A/B experiment."""

    name: str
    prompt: str  # template formatted with `query=` in PromptABTest.run
    weight: float  # Traffic percentage (0-1)
    model_params: dict  # forwarded to the LLM callable as keyword arguments


@dataclass
class ExperimentResult:
    """Outcome of a single experiment call."""

    variant: str
    response: str
    latency_ms: float
    tokens_used: int
    quality_score: float
    timestamp: float


class PromptABTest:
    """Route queries across weighted prompt variants and collect results."""

    def __init__(self, experiment_name: str, variants: list[PromptVariant]):
        """Store the variants and validate their weights.

        Raises:
            ValueError: if any weight is negative or the weights do not sum
                to 1.0 (within 1e-6).
        """
        self.experiment_name = experiment_name
        self.variants = variants
        self.results: list[ExperimentResult] = []
        self._validate_weights()

    def _validate_weights(self):
        """Check that weights are non-negative and sum to 1.0."""
        # Explicit raises instead of `assert`, which is stripped under `python -O`.
        for variant in self.variants:
            if variant.weight < 0:
                raise ValueError(
                    f"Weight for variant '{variant.name}' must be non-negative, got {variant.weight}"
                )
        total = sum(v.weight for v in self.variants)
        if abs(total - 1.0) >= 1e-6:
            raise ValueError(f"Weights must sum to 1.0, got {total}")

    def select_variant(self) -> PromptVariant:
        """Pick a variant at random, proportionally to its weight."""
        # random.choices never picks a zero-weight entry, unlike a manual
        # `r <= cumulative` scan which can when r == 0.0.
        weights = [v.weight for v in self.variants]
        return random.choices(self.variants, weights=weights, k=1)[0]

    def run(self, query: str, llm_func: Callable, evaluator: Callable) -> ExperimentResult:
        """Send ``query`` through one randomly selected variant and record the result.

        Args:
            query: Value substituted for ``{query}`` in the variant's prompt.
            llm_func: Called as ``llm_func(prompt, **variant.model_params)``;
                its return value is used as the response text.
            evaluator: Called as ``evaluator(query, response)``; its return
                value is stored as the quality score.

        Returns:
            The recorded ExperimentResult (also appended to ``self.results``).
        """
        variant = self.select_variant()
        # perf_counter is monotonic, so latency is unaffected by clock adjustments.
        started = time.perf_counter()
        response = llm_func(variant.prompt.format(query=query), **variant.model_params)
        latency_ms = (time.perf_counter() - started) * 1000
        quality_score = evaluator(query, response)
        result = ExperimentResult(
            variant=variant.name,
            response=response,
            latency_ms=latency_ms,
            # Whitespace-separated word count, used as a rough token proxy.
            tokens_used=len(response.split()),
            quality_score=quality_score,
            timestamp=time.time(),
        )
        self.results.append(result)
        return result

    def analyze(self) -> dict:
        """Summarize results per variant.

        Returns:
            Mapping of variant name to ``count``, ``avg_quality`` and
            ``avg_latency_ms``. Variants with no recorded results are omitted.
        """
        # Single pass over results: name -> [count, quality_sum, latency_sum].
        totals = {variant.name: [0, 0, 0] for variant in self.variants}
        for result in self.results:
            bucket = totals.get(result.variant)
            if bucket is None:
                continue
            bucket[0] += 1
            bucket[1] += result.quality_score
            bucket[2] += result.latency_ms
        return {
            name: {
                "count": count,
                "avg_quality": quality_sum / count,
                "avg_latency_ms": latency_sum / count,
            }
            for name, (count, quality_sum, latency_sum) in totals.items()
            if count
        }
# Setup experiment
# Two arms with an even 50/50 traffic split.
experiment = PromptABTest(
    "summarizer-v2",
    [
        PromptVariant(
            name="concise",
            prompt="Summarize concisely: {query}",
            weight=0.5,
            model_params={"temperature": 0.3},
        ),
        PromptVariant(
            name="detailed",
            prompt="Provide a detailed summary: {query}",
            weight=0.5,
            model_params={"temperature": 0.5},
        ),
    ],
)
Prompt Testing
Unit Tests for Prompts
import pytest
class TestPromptQuality:
    """Rendering and output-quality checks for the ``summarizer`` prompt."""

    def setup_method(self):
        """Create a fresh store with one ``summarizer`` version before each test."""
        self.store = PromptStore()
        self.store.register(
            name="summarizer",
            template="Summarize in {num_sentences} sentences: {text}",
            variables=["num_sentences", "text"]
        )

    def test_prompt_renders_correctly(self):
        """Both variables are substituted into the rendered prompt."""
        result = self.store.render("summarizer", {"num_sentences": "3", "text": "Test article"})
        assert "3 sentences" in result
        assert "Test article" in result

    def test_prompt_length(self):
        """A 10,000-character input still renders under the length cap."""
        long_text = "x" * 10000
        result = self.store.render("summarizer", {"num_sentences": "1", "text": long_text})
        assert len(result) < 15000  # Within token limits

    def test_no_unresolved_variables(self):
        """No placeholder braces survive rendering."""
        result = self.store.render("summarizer", {"num_sentences": "1", "text": "test"})
        # Both brace kinds must be absent; `or` would pass when only one leaks.
        assert "{" not in result and "}" not in result

    @pytest.mark.parametrize("text,expected_keywords", [
        ("The company reported $1M revenue", ["revenue", "financial"]),
        ("New AI model released", ["AI", "model", "release"]),
    ])
    def test_output_quality(self, text, expected_keywords):
        """The model response mentions at least one expected keyword (case-insensitive)."""
        prompt = self.store.render("summarizer", {"num_sentences": "1", "text": text})
        response = call_llm(prompt)
        assert any(kw.lower() in response.lower() for kw in expected_keywords)
Prompt Monitoring
Track prompt performance metrics in production to detect quality degradation.
import logging
from dataclasses import dataclass
logger = logging.getLogger("prompt_monitor")


@dataclass
class PromptMetrics:
    """Running aggregates for one prompt version."""

    prompt_name: str
    version: str
    request_count: int
    avg_latency_ms: float
    avg_tokens: float
    error_rate: float
    user_satisfaction: float  # From feedback loop
    # Number of requests that carried a satisfaction score; needed so
    # user_satisfaction averages only over requests with feedback.
    satisfaction_count: int = 0


def _running_mean(previous: float, count: int, sample: float) -> float:
    """Return the mean after adding ``sample`` as the ``count``-th observation."""
    return (previous * (count - 1) + sample) / count


class PromptMonitor:
    """Accumulate per-version request metrics and warn when they degrade."""

    def __init__(self):
        # Keyed by "<prompt_name>:v<version>".
        self.metrics: dict[str, PromptMetrics] = {}

    def record(self, prompt_name: str, version: str, latency_ms: float,
               tokens: int, success: bool, satisfaction: float = None):
        """Fold one request into the running averages for its prompt version.

        Args:
            prompt_name: Name of the prompt that served the request.
            version: Version identifier; combined with the name to form the key.
            latency_ms: Request latency in milliseconds.
            tokens: Token count for the request.
            success: False counts the request as an error.
            satisfaction: Optional feedback score. When given it is averaged
                into ``user_satisfaction``; requests without it leave that
                average untouched.
        """
        key = f"{prompt_name}:v{version}"
        m = self.metrics.get(key)
        if m is None:
            m = self.metrics[key] = PromptMetrics(
                prompt_name=prompt_name, version=version,
                request_count=0, avg_latency_ms=0, avg_tokens=0,
                error_rate=0, user_satisfaction=0
            )
        m.request_count += 1
        n = m.request_count
        m.avg_latency_ms = _running_mean(m.avg_latency_ms, n, latency_ms)
        m.avg_tokens = _running_mean(m.avg_tokens, n, tokens)
        m.error_rate = _running_mean(m.error_rate, n, 0 if success else 1)
        if satisfaction is not None:
            m.satisfaction_count += 1
            m.user_satisfaction = _running_mean(
                m.user_satisfaction, m.satisfaction_count, satisfaction
            )

    def alert_on_degradation(self, threshold: float = 0.1,
                             latency_threshold_ms: float = 5000):
        """Log a warning for every tracked version that exceeds a limit.

        Args:
            threshold: Error rate above which a warning is logged.
            latency_threshold_ms: Average latency (ms) above which a warning
                is logged.
        """
        for key, m in self.metrics.items():
            if m.error_rate > threshold:
                logger.warning(f"High error rate for {key}: {m.error_rate:.2%}")
            if m.avg_latency_ms > latency_threshold_ms:
                logger.warning(f"High latency for {key}: {m.avg_latency_ms:.0f}ms")
Prompt Optimization Strategies
| Strategy | Description | When to Use |
|---|---|---|
| Few-shot examples | Include input/output pairs | Task-specific formatting needed |
| Chain-of-thought | Ask for reasoning steps | Complex reasoning tasks |
| Self-consistency | Sample multiple, majority vote | High-stakes decisions |
| Dynamic prompting | Adapt based on input complexity | Variable query difficulty |
| Prompt compression | Remove redundant instructions | Token budget constraints |
Effective prompt management is essential for maintaining quality as LLM applications scale. Treat prompts as code — version them, test them, and monitor them in production.