LLM Cost Management

The Cost Challenge

LLM costs can escalate rapidly in production. A single GPT-4 request can cost $0.01–$0.10, and at scale this becomes a significant operational expense.

Token Pricing Comparison

| Model | Input $/1M tokens | Output $/1M tokens | Context Window |
|---|---|---|---|
| GPT-4o | $2.50 | $10.00 | 128K |
| GPT-4o-mini | $0.15 | $0.60 | 128K |
| Claude 3.5 Sonnet | $3.00 | $15.00 | 200K |
| Claude 3.5 Haiku | $0.25 | $1.25 | 200K |
| Llama 3.1 70B (API) | $0.90 | $0.90 | 128K |
| Llama 3.1 8B (API) | $0.10 | $0.10 | 128K |

Cost Per Request

The total cost of an LLM request:

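$$C_{total} = \frac{T_{input}}{10^{6}} \cdot P_{input} + \frac{T_{output}}{10^{6}} \cdot P_{output} + C_{overhead}$$

where $T_{input}$ and $T_{output}$ are the request's input and output token counts, and $P_{input}$ and $P_{output}$ are the model's prices per one million tokens.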

Model Routing

Route requests to the cheapest model that can handle the complexity.

from dataclasses import dataclass
from enum import Enum

class QueryComplexity(Enum):
    """Complexity tiers a query can be classified into.

    ``ModelRouter.classify_complexity`` returns one of these members and
    ``ModelRouter.route`` uses it to decide which capabilities the chosen
    model must provide.
    """

    # Short queries that contain a greeting/acknowledgement pattern.
    SIMPLE = "simple"
    # Fallback tier: neither the simple nor the complex rule matched.
    MODERATE = "moderate"
    # Long queries (over 50 words) or ones containing an analysis/code keyword.
    COMPLEX = "complex"

@dataclass
class ModelConfig:
    """Static description of one model that ``ModelRouter`` can route to."""

    # Identifier of the model.
    name: str
    # Price per one million input tokens (the pricing table quotes USD).
    input_cost_per_1m: float
    # Price per one million output tokens (the pricing table quotes USD).
    output_cost_per_1m: float
    # Context window size — presumably in tokens; TODO confirm against callers.
    max_context: int
    # Capability tags matched by ``ModelRouter._get_cheapest``; the router
    # asks for "basic", "general", "reasoning" and "code".
    capabilities: list[str]

class ModelRouter:
    """Route each query to the cheapest configured model that can handle it.

    A query is first classified into a ``QueryComplexity`` tier with cheap
    keyword heuristics; the tier determines which capability tags a model
    must advertise, and the cheapest such model is returned.
    """

    # Greetings/acknowledgements; matched as whole words only.
    _SIMPLE_WORDS = frozenset({"hello", "thanks", "yes", "no", "hi"})
    # Phrases hinting that the query needs a stronger model. These stay
    # substring matches so inflections ("analyzed", "comparing") still hit;
    # a false positive only routes to a more capable model.
    _COMPLEX_KEYWORDS = ("analyze", "compare", "explain why", "step by step", "code")
    # Punctuation stripped from word edges so "hi!" or "thanks," still count.
    _EDGE_PUNCTUATION = ".,!?;:'\"()[]{}"

    def __init__(self, models: list[ModelConfig]):
        """Store the candidate models.

        Args:
            models: Models the router may choose from.
        """
        self.models = models
        self.complexity_classifier = None

    def classify_complexity(self, query: str) -> QueryComplexity:
        """Classify query complexity to route to appropriate model.

        Args:
            query: Raw user query.

        Returns:
            ``SIMPLE`` for queries under 10 words that contain a greeting or
            acknowledgement as a whole word, ``COMPLEX`` for queries over 50
            words or containing an analysis/code keyword, else ``MODERATE``.
        """
        lowered = query.lower()
        words = lowered.split()

        # Compare whole words, not substrings: a substring test treats "this"
        # as containing "hi" and "know" as containing "no", which sent short
        # real questions to the cheapest model.
        tokens = {word.strip(self._EDGE_PUNCTUATION) for word in words}
        if len(words) < 10 and not tokens.isdisjoint(self._SIMPLE_WORDS):
            return QueryComplexity.SIMPLE

        if len(words) > 50 or any(kw in lowered for kw in self._COMPLEX_KEYWORDS):
            return QueryComplexity.COMPLEX

        return QueryComplexity.MODERATE

    def route(self, query: str) -> ModelConfig:
        """Return the cheapest model suited to ``query``.

        Args:
            query: Raw user query.

        Returns:
            The selected ``ModelConfig``.

        Raises:
            ValueError: If no configured model has the required capabilities.
        """
        complexity = self.classify_complexity(query)

        if complexity == QueryComplexity.SIMPLE:
            required = ["basic"]
        elif complexity == QueryComplexity.MODERATE:
            required = ["general"]
        else:
            required = ["reasoning", "code"]
        return self._get_cheapest(capabilities=required)

    def _get_cheapest(self, capabilities: list[str]) -> ModelConfig:
        """Return the cheapest model that offers every requested capability.

        "Cheapest" is the lowest sum of input and output price per 1M tokens.

        Args:
            capabilities: Capability tags the model must all provide.

        Raises:
            ValueError: If no configured model provides all capabilities.
        """
        suitable = [
            m for m in self.models
            if all(c in m.capabilities for c in capabilities)
        ]
        if not suitable:
            # min() on an empty list raises an opaque ValueError; keep the
            # exception type but say what is actually wrong.
            raise ValueError(
                f"No configured model provides capabilities {capabilities!r}"
            )
        return min(suitable, key=lambda m: m.input_cost_per_1m + m.output_cost_per_1m)

Prompt Optimization

Reduce token count while maintaining quality.

class PromptOptimizer:
    """Shrink prompts with cheap, rule-based text rewrites."""

    def __init__(self):
        # Long phrase -> shorter equivalent, applied by ``compress``.
        self.common_abbreviations = {
            "for example": "e.g.",
            "that is": "i.e.",
            "and so on": "etc.",
            "approximately": "approx",
            "information": "info",
            "application": "app",
            "documentation": "docs",
        }

    def compress(self, prompt: str) -> str:
        """Reduce prompt token count without losing meaning.

        Applies the abbreviation table, drops politeness fillers and
        collapses whitespace. Phrases are matched case-insensitively and only
        as whole words, so "that issue" or "pleased" are left untouched.

        Args:
            prompt: Original prompt text.

        Returns:
            The compressed prompt with single spaces and no leading or
            trailing whitespace.
        """
        # Local import: this snippet has no import section of its own.
        import re

        # Normalise whitespace first so multi-word phrases split across
        # newlines or double spaces still match.
        compressed = " ".join(prompt.split())

        # Apply abbreviations
        for full, abbr in self.common_abbreviations.items():
            compressed = re.sub(
                rf"\b{re.escape(full)}\b",
                # Callable replacement: the abbreviation is inserted verbatim.
                lambda _match, repl=abbr: repl,
                compressed,
                flags=re.IGNORECASE,
            )

        # Remove filler phrases (longest first so a longer phrase wins)
        fillers = ["please", "kindly", "I would like you to", "could you"]
        alternatives = "|".join(
            re.escape(filler) for filler in sorted(fillers, key=len, reverse=True)
        )
        compressed = re.sub(rf"\b(?:{alternatives})\b", "", compressed, flags=re.IGNORECASE)

        # Removing fillers leaves gaps; collapse whitespace again at the end.
        return " ".join(compressed.split())

    def count_tokens(self, text: str, model: str = "gpt-4") -> int:
        """Estimate token count.

        Args:
            text: Text to measure.
            model: Accepted for interface compatibility; the estimate does
                not currently depend on it.

        Returns:
            0 for empty text, otherwise at least 1.
        """
        if not text:
            return 0
        # Simple estimation: ~4 chars per token for English. Non-empty text
        # is never reported as zero tokens.
        return max(1, len(text) // 4)

Semantic Caching

Cache similar queries to avoid redundant API calls.

import hashlib
import numpy as np
from typing import Optional

class SemanticCache:
    """Cache LLM responses keyed by query embedding.

    A lookup returns a stored response when the cosine similarity between
    the new query's embedding and a cached query's embedding reaches the
    configured threshold.
    """

    def __init__(self, embedder, similarity_threshold: float = 0.95):
        """Create an empty cache.

        Args:
            embedder: Object whose ``embed(list_of_texts)`` returns one
                embedding vector per text.
            similarity_threshold: Minimum cosine similarity for a cache hit.
        """
        self.embedder = embedder
        self.threshold = similarity_threshold
        self.cache = {}  # tuple(embedding) -> (query, response, cost)
        # Savings only accrue when a cached response is actually reused, so
        # hits and the cost they avoided are tracked separately.
        self.hits = 0
        self.saved_cost = 0.0

    def get(self, query: str) -> Optional[dict]:
        """Look up the most similar cached response.

        Args:
            query: Incoming query text.

        Returns:
            ``{"response", "cached", "similarity"}`` for the best entry at or
            above the threshold, or ``None`` on a miss.
        """
        query_embedding = np.asarray(self.embedder.embed([query])[0])
        query_norm = np.linalg.norm(query_embedding)
        if query_norm == 0:
            # Cosine similarity is undefined for a zero vector.
            return None

        best_match = None
        best_score = 0
        best_cost = 0.0

        for cached_embedding, (_cached_query, response, cost) in self.cache.items():
            cached_norm = np.linalg.norm(cached_embedding)
            if cached_norm == 0:
                continue
            similarity = float(
                np.dot(query_embedding, cached_embedding) / (query_norm * cached_norm)
            )
            if similarity > best_score and similarity >= self.threshold:
                best_score = similarity
                best_cost = cost
                best_match = {"response": response, "cached": True, "similarity": similarity}

        if best_match is not None:
            self.hits += 1
            self.saved_cost += best_cost

        return best_match

    def set(self, query: str, response: str, cost: float):
        """Store a response with the cost that was paid to generate it.

        Args:
            query: Query text that produced ``response``.
            response: Model response to cache.
            cost: Cost of the original API call.
        """
        query_embedding = self.embedder.embed([query])[0]
        # Tuples are hashable, so the embedding itself can be the dict key.
        self.cache[tuple(query_embedding)] = (query, response, cost)

    def savings_report(self) -> dict:
        """Summarise cache size and the cost avoided by cache hits.

        Returns:
            Dict with ``cached_queries`` (entries stored), ``cache_hits``
            (successful lookups) and ``estimated_savings`` (summed original
            cost of every response served from the cache).
        """
        return {
            "cached_queries": len(self.cache),
            "cache_hits": self.hits,
            "estimated_savings": self.saved_cost,
        }

Cost Tracking and Budgeting

from dataclasses import dataclass
from datetime import datetime, timedelta

@dataclass
class CostEntry:
    """One logged LLM request, as recorded by ``CostTracker.log``."""

    # When the request was logged; ``CostTracker.log`` uses ``datetime.utcnow()``.
    timestamp: datetime
    # Model identifier used to look up pricing.
    model: str
    # Number of input tokens billed for the request.
    input_tokens: int
    # Number of output tokens billed for the request.
    output_tokens: int
    # Cost computed by ``CostTracker._calculate_cost`` for this request.
    cost: float
    # Caller-supplied user identifier.
    user_id: str
    # Caller-supplied endpoint name; daily reports group by it.
    endpoint: str

class BudgetExceededException(Exception):
    """Raised by ``CostTracker`` when today's spend exceeds the daily budget."""


class CostTracker:
    """Record per-request LLM costs and enforce a daily spending budget."""

    # Price per one million tokens, mirroring the article's pricing table.
    PRICING = {
        "gpt-4o": {"input": 2.50, "output": 10.00},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "claude-3.5-sonnet": {"input": 3.00, "output": 15.00},
        "claude-3.5-haiku": {"input": 0.25, "output": 1.25},
        "llama-3.1-70b": {"input": 0.90, "output": 0.90},
        "llama-3.1-8b": {"input": 0.10, "output": 0.10},
    }

    def __init__(self, budget_limit: float = 1000.0):
        """Create a tracker.

        Args:
            budget_limit: Overall budget; the daily budget is one thirtieth
                of this value.
        """
        self.entries: list[CostEntry] = []
        self.budget_limit = budget_limit
        self.daily_budget = budget_limit / 30

    def log(self, model: str, input_tokens: int, output_tokens: int,
            user_id: str, endpoint: str):
        """Record one request and check the daily budget.

        Args:
            model: Model identifier used for pricing.
            input_tokens: Input tokens consumed.
            output_tokens: Output tokens produced.
            user_id: Identifier of the requesting user.
            endpoint: Endpoint that issued the request.

        Returns:
            The stored ``CostEntry``.

        Raises:
            BudgetExceededException: If today's total now exceeds the daily
                budget. The entry is recorded before the check runs.
        """
        cost = self._calculate_cost(model, input_tokens, output_tokens)
        entry = CostEntry(
            # NOTE: naive UTC timestamp, kept so existing entries stay comparable.
            timestamp=datetime.utcnow(),
            model=model,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cost=cost,
            user_id=user_id,
            endpoint=endpoint
        )
        self.entries.append(entry)
        self._check_budget()
        return entry

    def _calculate_cost(self, model: str, input_tokens: int, output_tokens: int) -> float:
        """Return the request cost, or 0.0 when the model has no known price."""
        p = self.PRICING.get(model)
        if p is None:
            # Best effort: unknown models are recorded at zero cost.
            return 0.0
        return (input_tokens / 1e6 * p["input"]) + (output_tokens / 1e6 * p["output"])

    def _check_budget(self):
        """Raise ``BudgetExceededException`` if today's spend is over budget."""
        today = datetime.utcnow().date()
        daily_cost = sum(
            e.cost for e in self.entries
            if e.timestamp.date() == today
        )
        if daily_cost > self.daily_budget:
            raise BudgetExceededException(f"Daily budget exceeded: ${daily_cost:.2f}")

    def report(self, period: str = "daily") -> dict:
        """Return a cost summary.

        Args:
            period: ``"daily"`` for today's summary; any other value yields
                the current month's summary.
        """
        if period == "daily":
            return self._daily_report()
        return self._monthly_report()

    def _daily_report(self) -> dict:
        """Summarise entries logged today (UTC)."""
        today = datetime.utcnow().date()
        today_entries = [e for e in self.entries if e.timestamp.date() == today]
        return {
            "date": today.isoformat(),
            "total_cost": sum(e.cost for e in today_entries),
            "total_requests": len(today_entries),
            "by_model": self._group_by_model(today_entries),
            "by_endpoint": self._group_by_endpoint(today_entries)
        }

    def _monthly_report(self) -> dict:
        """Summarise entries logged in the current calendar month (UTC)."""
        today = datetime.utcnow().date()
        month_entries = [
            e for e in self.entries
            if (e.timestamp.year, e.timestamp.month) == (today.year, today.month)
        ]
        return {
            "month": today.strftime("%Y-%m"),
            "total_cost": sum(e.cost for e in month_entries),
            "total_requests": len(month_entries),
            "by_model": self._group_by_model(month_entries),
            "by_endpoint": self._group_by_endpoint(month_entries)
        }

    def _group_by_model(self, entries: "list[CostEntry]") -> dict:
        """Return total cost per model for ``entries``."""
        return self._sum_cost_by(entries, "model")

    def _group_by_endpoint(self, entries: "list[CostEntry]") -> dict:
        """Return total cost per endpoint for ``entries``."""
        return self._sum_cost_by(entries, "endpoint")

    @staticmethod
    def _sum_cost_by(entries, attr: str) -> dict:
        """Sum ``cost`` over ``entries`` grouped by the attribute ``attr``."""
        totals = {}
        for e in entries:
            key = getattr(e, attr)
            totals[key] = totals.get(key, 0.0) + e.cost
        return totals

Cost Optimization Strategies

Strategy	Potential Savings	Implementation Effort
Model routing	50-80%	Medium
Semantic caching	30-60%	Low
Prompt compression	10-30%	Low
Batch processing	20-40%	Medium
Self-hosting	40-70%	High
Output length limits	20-50%	Low

Proactive cost management ensures sustainable LLM operations as usage scales.