Prompt Management

Why Prompt Management Matters

Prompts are the primary interface between applications and LLMs. Unlike traditional code, prompts are highly sensitive to small wording changes. A prompt management system provides version control, testing, and optimization capabilities for prompt artifacts.

Prompt Versioning System

Every prompt change should be versioned, tagged, and traceable to production outcomes.

import hashlib
import json
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional

@dataclass
class PromptVersion:
    """A single numbered revision of a named prompt template.

    ``checksum`` is always recomputed from ``template`` after construction,
    so any value passed for it is overwritten.
    """

    name: str
    version: int
    template: str
    variables: list[str]
    model: str
    model_params: dict
    metadata: dict = field(default_factory=dict)
    # Timezone-aware UTC timestamp in ISO 8601 form. datetime.utcnow() is
    # deprecated and returns a naive value that is easily mistaken for local time.
    created_at: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    checksum: str = ""

    def __post_init__(self):
        """Derive the checksum: first 12 hex chars of the template's SHA-256."""
        digest = hashlib.sha256(self.template.encode("utf-8")).hexdigest()
        self.checksum = digest[:12]

class PromptStore:
    """In-memory registry of versioned prompts with a production pointer per name.

    Versions are numbered from 1 in registration order. ``production`` maps a
    prompt name to the version label that ``render`` should use; names without
    an entry render their latest version.
    """

    def __init__(self):
        self.prompts: dict[str, list[PromptVersion]] = {}
        self.production: dict[str, str] = {}  # name -> promoted version label

    def register(self, name: str, template: str, variables: list[str],
                 model: str = "gpt-4", model_params: dict = None) -> PromptVersion:
        """Append a new version of ``name`` and return it.

        Args:
            name: Prompt identifier; created on first registration.
            template: ``str.format``-style template text.
            variables: Names of the placeholders the template expects.
            model: Model identifier stored with the version.
            model_params: Model parameters; ``None`` or an empty dict selects
                ``{"temperature": 0.7, "max_tokens": 512}``.

        Returns:
            The newly created ``PromptVersion``.
        """
        history = self.prompts.setdefault(name, [])
        prompt = PromptVersion(
            name=name,
            version=len(history) + 1,
            template=template,
            variables=variables,
            model=model,
            model_params=model_params or {"temperature": 0.7, "max_tokens": 512},
        )
        history.append(prompt)
        return prompt

    def get(self, name: str, version: str = "latest") -> PromptVersion:
        """Look up a version of ``name``.

        Args:
            name: Prompt identifier.
            version: ``"latest"``, a label such as ``"v2"`` or ``"2"``, or a
                plain integer version number.

        Returns:
            The matching ``PromptVersion``.

        Raises:
            ValueError: If the prompt is unknown, the label cannot be parsed,
                or no version with that number exists.
        """
        if name not in self.prompts:
            raise ValueError(f"Prompt '{name}' not found")
        history = self.prompts[name]
        if version == "latest":
            if not history:
                raise ValueError(f"Prompt '{name}' has no registered versions")
            return history[-1]

        number = self._parse_version(version)
        # An explicit loop instead of a bare next(): an unknown version must
        # surface as ValueError, not as a StopIteration leaking to the caller.
        for candidate in history:
            if candidate.version == number:
                return candidate
        raise ValueError(f"Prompt '{name}' has no version {number}")

    @staticmethod
    def _parse_version(version) -> int:
        """Convert a version label (``"v3"``, ``"3"`` or ``3``) to its number."""
        label = str(version).strip().lower().removeprefix("v")
        try:
            return int(label)
        except ValueError:
            raise ValueError(f"Invalid version label: {version!r}") from None

    def promote(self, name: str, version: str):
        """Mark ``version`` as the production version of ``name``.

        The label is stored as given and is only resolved when ``render``
        looks it up.
        """
        self.production[name] = version

    def render(self, name: str, variables: dict) -> str:
        """Format the production (or, if none is promoted, latest) template.

        Args:
            name: Prompt identifier.
            variables: Placeholder values passed to ``str.format``.

        Returns:
            The rendered prompt text.

        Raises:
            ValueError: If the prompt or its promoted version cannot be found.
            KeyError: If the template names a placeholder missing from
                ``variables``.
        """
        prompt = self.get(name, self.production.get(name, "latest"))
        return prompt.template.format(**variables)

Prompt Templates

System Prompt Architecture

# Modular prompt construction
class PromptBuilder:
    """Assemble system messages from named sections, highest priority first."""

    def __init__(self):
        self.sections = {}

    def add_section(self, name: str, content: str, priority: int = 0):
        """Add a section, replacing any existing section with the same name.

        Args:
            name: Section key.
            content: Text emitted as the message content.
            priority: Larger values are emitted earlier by ``build``.
        """
        self.sections[name] = {"content": content, "priority": priority}

    def build(self, max_tokens: int = 4096, chars_per_token: int = 3) -> list[dict]:
        """Return system messages in descending priority within a size budget.

        The budget is a character count, ``max_tokens * chars_per_token``,
        used as a rough stand-in for real tokenization. Sections with equal
        priority keep their insertion order.

        Args:
            max_tokens: Approximate token budget for all sections combined.
            chars_per_token: Characters assumed per token when converting the
                token budget to a character budget.

        Returns:
            A list of ``{"role": "system", "content": ...}`` dicts.
        """
        char_budget = max_tokens * chars_per_token
        by_priority = sorted(
            self.sections.values(),
            key=lambda section: -section["priority"],
        )
        messages = []
        used_chars = 0
        for section in by_priority:
            content = section["content"]
            if used_chars + len(content) > char_budget:
                # Stop at the first section that does not fit, so a
                # lower-priority section never appears without the
                # higher-priority ones ahead of it.
                break
            messages.append({"role": "system", "content": content})
            used_chars += len(content)
        return messages

# Usage: sections may be added in any order; each carries its own priority
builder = PromptBuilder()
builder.add_section(
    name="role",
    content="You are a senior data engineer.",
    priority=10,
)
builder.add_section(
    name="constraints",
    content="Always use type hints. Never use global variables.",
    priority=8,
)
builder.add_section(
    name="examples",
    content="Example: def process(data: pd.DataFrame) -> dict:",
    priority=5,
)
builder.add_section(
    name="format",
    content="Return JSON with keys: result, confidence",
    priority=3,
)

messages = builder.build()

Jinja2 Template Engine

from jinja2 import Template

# Production-grade prompt template.
# NOTE(review): `article_text` is not defined in this snippet - presumably it
# is supplied by the surrounding application; confirm before running.
summarization_prompt = Template("""
You are an expert summarizer. Follow these rules:
{% for rule in rules %}
- {{ rule }}
{% endfor %}

Document to summarize:
---
{{ document }}
---

Provide a {{ length }} summary focusing on: {{ focus_area }}
""")

# Template.render accepts the same arguments as dict(), so the whole context
# is passed as a single mapping.
rendered = summarization_prompt.render({
    "rules": [
        "Maintain factual accuracy",
        "Use clear, concise language",
        "Include key metrics and dates",
    ],
    "document": article_text,
    "length": "3-paragraph",
    "focus_area": "financial performance",
})

A/B Testing Prompts

Systematic A/B testing identifies which prompt variants produce the best results in production.

import random
import time
from dataclasses import dataclass
from typing import Callable

@dataclass
class PromptVariant:
    """One candidate prompt in an A/B experiment."""
    name: str  # Label stored on each result and used to group results in analysis
    prompt: str  # Template text; PromptABTest.run fills it via str.format(query=...)
    weight: float  # Traffic fraction (0-1); an experiment's weights must sum to 1.0
    model_params: dict  # Passed as keyword arguments to the LLM callable

@dataclass
class ExperimentResult:
    """Outcome of a single experiment run for one selected variant."""
    variant: str  # Name of the PromptVariant that served the request
    response: str  # Value returned by the LLM callable
    latency_ms: float  # Elapsed time of the LLM call, in milliseconds
    tokens_used: int  # PromptABTest.run stores a whitespace word count here, not a tokenizer count
    quality_score: float  # Value returned by the evaluator callable
    timestamp: float  # time.time() taken when the result was created

class PromptABTest:
    """Split traffic across weighted prompt variants and collect the results.

    Args:
        experiment_name: Label for the experiment.
        variants: Candidate prompts; their ``weight`` values must sum to 1.0.

    Raises:
        ValueError: If the weights do not sum to 1.0 (tolerance 1e-6).
    """

    def __init__(self, experiment_name: str, variants: list[PromptVariant]):
        self.experiment_name = experiment_name
        self.variants = variants
        self.results: list[ExperimentResult] = []
        self._validate_weights()

    def _validate_weights(self):
        """Reject weight sets that do not add up to a full traffic split."""
        total = sum(v.weight for v in self.variants)
        # Raise explicitly instead of using assert: assert statements are
        # removed under ``python -O``, which would disable this check.
        # Written as ``not (... < tol)`` so a NaN total is rejected too.
        if not abs(total - 1.0) < 1e-6:
            raise ValueError(f"Weights must sum to 1.0, got {total}")

    def select_variant(self) -> PromptVariant:
        """Pick a variant at random with probability equal to its weight."""
        r = random.random()  # uniform in [0.0, 1.0)
        cumulative = 0
        for variant in self.variants:
            cumulative += variant.weight
            # Strict comparison: with ``<=`` a zero-weight variant at the
            # front of the list would be chosen whenever r == 0.0.
            if r < cumulative:
                return variant
        # Floating-point rounding can leave the running sum just under 1.0.
        return self.variants[-1]

    def run(self, query: str, llm_func: Callable, evaluator: Callable) -> ExperimentResult:
        """Serve one query with a randomly selected variant and record the outcome.

        Args:
            query: Text substituted for ``{query}`` in the variant's prompt.
            llm_func: Called as ``llm_func(prompt_text, **variant.model_params)``;
                its return value must support ``.split()``.
            evaluator: Called as ``evaluator(query, response)``; its return
                value is stored as the quality score.

        Returns:
            The ``ExperimentResult`` that was appended to ``self.results``.
        """
        variant = self.select_variant()

        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments during the call.
        started = time.perf_counter()
        response = llm_func(variant.prompt.format(query=query), **variant.model_params)
        latency_ms = (time.perf_counter() - started) * 1000

        quality_score = evaluator(query, response)

        result = ExperimentResult(
            variant=variant.name,
            response=response,
            latency_ms=latency_ms,
            # Whitespace word count as a rough stand-in for a token count.
            tokens_used=len(response.split()),
            quality_score=quality_score,
            timestamp=time.time(),
        )
        self.results.append(result)
        return result

    def analyze(self) -> dict:
        """Summarize recorded results per variant.

        Returns:
            A dict keyed by variant name with ``count``, ``avg_quality`` and
            ``avg_latency_ms``. Variants without results are omitted.
        """
        analysis = {}
        for variant in self.variants:
            variant_results = [r for r in self.results if r.variant == variant.name]
            if not variant_results:
                continue
            count = len(variant_results)
            analysis[variant.name] = {
                "count": count,
                "avg_quality": sum(r.quality_score for r in variant_results) / count,
                "avg_latency_ms": sum(r.latency_ms for r in variant_results) / count,
            }
        return analysis

# Setup experiment: two variants, each receiving half of the traffic
experiment = PromptABTest(
    experiment_name="summarizer-v2",
    variants=[
        PromptVariant(
            name="concise",
            prompt="Summarize concisely: {query}",
            weight=0.5,
            model_params={"temperature": 0.3},
        ),
        PromptVariant(
            name="detailed",
            prompt="Provide a detailed summary: {query}",
            weight=0.5,
            model_params={"temperature": 0.5},
        ),
    ],
)

Prompt Testing

Unit Tests for Prompts

import pytest

class TestPromptQuality:
    """Pytest checks for a ``summarizer`` prompt registered in a PromptStore."""

    def setup_method(self):
        """Create a store holding one ``summarizer`` version before each test."""
        self.store = PromptStore()
        self.store.register(
            name="summarizer",
            template="Summarize in {num_sentences} sentences: {text}",
            variables=["num_sentences", "text"]
        )

    def test_prompt_renders_correctly(self):
        """Both placeholder values appear in the rendered prompt."""
        result = self.store.render("summarizer", {"num_sentences": "3", "text": "Test article"})
        assert "3 sentences" in result
        assert "Test article" in result

    def test_prompt_length(self):
        """A long input keeps the rendered prompt under the size cap."""
        long_text = "x" * 10000
        result = self.store.render("summarizer", {"num_sentences": "1", "text": long_text})
        assert len(result) < 15000  # Character-count proxy for the token limit

    def test_no_unresolved_variables(self):
        """No brace characters remain after rendering."""
        result = self.store.render("summarizer", {"num_sentences": "1", "text": "test"})
        # Both checks must hold: a leftover placeholder fragment may contain
        # only one of the two brace characters.
        assert "{" not in result and "}" not in result

    @pytest.mark.parametrize("text,expected_keywords", [
        ("The company reported $1M revenue", ["revenue", "financial"]),
        ("New AI model released", ["AI", "model", "release"]),
    ])
    def test_output_quality(self, text, expected_keywords):
        """The model response mentions at least one expected keyword."""
        prompt = self.store.render("summarizer", {"num_sentences": "1", "text": text})
        # NOTE(review): call_llm is not defined in this snippet - presumably a
        # project helper that sends the prompt to the model; confirm its import.
        response = call_llm(prompt)
        assert any(kw.lower() in response.lower() for kw in expected_keywords)

Prompt Monitoring

Track prompt performance metrics in production to detect quality degradation.

import logging
from dataclasses import dataclass

# Named logger through which PromptMonitor emits its degradation warnings
logger = logging.getLogger("prompt_monitor")

@dataclass
class PromptMetrics:
    """Aggregated production metrics for one prompt name and version."""
    prompt_name: str
    version: str
    request_count: int  # Number of calls recorded for this prompt/version
    avg_latency_ms: float  # Running mean of the recorded latencies
    avg_tokens: float  # Running mean of the recorded token counts
    error_rate: float  # Fraction of recorded calls with success=False
    user_satisfaction: float  # From feedback loop; PromptMonitor.record starts it at 0

class PromptMonitor:
    """Keep running per-version metrics and warn when they degrade."""

    def __init__(self):
        self.metrics: dict[str, PromptMetrics] = {}
        # Satisfaction scores are optional per request, so their running mean
        # needs its own sample count rather than request_count.
        self._satisfaction_counts: dict[str, int] = {}

    @staticmethod
    def _running_mean(previous: float, count: int, value: float) -> float:
        """Return the mean after adding ``value`` as sample number ``count``."""
        return previous + (value - previous) / count

    def record(self, prompt_name: str, version: str, latency_ms: float,
               tokens: int, success: bool, satisfaction: float = None):
        """Fold one request into the metrics stored under ``"{prompt_name}:v{version}"``.

        Args:
            prompt_name: Prompt identifier.
            version: Version label; combined with the name to form the key.
            latency_ms: Request latency in milliseconds.
            tokens: Token count for the request.
            success: ``False`` counts the request as an error.
            satisfaction: Optional feedback score; when given it is averaged
                into ``user_satisfaction``, when ``None`` that value is left
                unchanged.
        """
        key = f"{prompt_name}:v{version}"
        if key not in self.metrics:
            self.metrics[key] = PromptMetrics(
                prompt_name=prompt_name, version=version,
                request_count=0, avg_latency_ms=0, avg_tokens=0,
                error_rate=0, user_satisfaction=0
            )
        m = self.metrics[key]
        m.request_count += 1
        n = m.request_count
        m.avg_latency_ms = self._running_mean(m.avg_latency_ms, n, latency_ms)
        m.avg_tokens = self._running_mean(m.avg_tokens, n, tokens)
        m.error_rate = self._running_mean(m.error_rate, n, 0 if success else 1)

        if satisfaction is not None:
            samples = self._satisfaction_counts.get(key, 0) + 1
            self._satisfaction_counts[key] = samples
            m.user_satisfaction = self._running_mean(
                m.user_satisfaction, samples, satisfaction
            )

    def alert_on_degradation(self, threshold: float = 0.1,
                             latency_threshold_ms: float = 5000):
        """Log a warning for every key whose metrics exceed the limits.

        Args:
            threshold: Error rate above which a warning is logged.
            latency_threshold_ms: Average latency, in milliseconds, above
                which a warning is logged.
        """
        for key, m in self.metrics.items():
            if m.error_rate > threshold:
                logger.warning(f"High error rate for {key}: {m.error_rate:.2%}")
            if m.avg_latency_ms > latency_threshold_ms:
                logger.warning(f"High latency for {key}: {m.avg_latency_ms:.0f}ms")

Prompt Optimization Strategies

Strategy	Description	When to Use
Few-shot examples	Include input/output pairs	Task-specific formatting needed
Chain-of-thought	Ask for reasoning steps	Complex reasoning tasks
Self-consistency	Sample multiple, majority vote	High-stakes decisions
Dynamic prompting	Adapt based on input complexity	Variable query difficulty
Prompt compression	Remove redundant instructions	Token budget constraints

Effective prompt management is essential for maintaining quality as LLM applications scale. Treat prompts as code — version them, test them, and monitor them in production.