The Cost Challenge
LLM costs can escalate rapidly in production. A single GPT-4 request can cost as much as $0.10, and at scale this becomes a significant operational expense.
Token Pricing Comparison
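| Model | Input $/1M tokens | Output $/1M tokens | Context Window |
|---|---|---|---|
| GPT-4o | $2.50 | $10.00 | 128K |
| GPT-4o-mini | $0.15 | $0.60 | 128K |
| Claude 3.5 Sonnet | $3.00 | $15.00 | 200K |
| Claude 3.5 Haiku | — | $1.25 | 200K |
| Llama 3.1 70B (API) | — | $0.90 | 128K |
| Llama 3.1 8B (API) | — | $0.10 | 128K |

<!-- TODO: the input prices marked "—" were lost from this copy of the table; restore them from the providers' pricing pages. -->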
Definition: Cost Per Request
The total cost of an LLM request:
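$$C_{\text{total}} = \frac{T_{\text{input}}}{10^6} \cdot P_{\text{input}} + \frac{T_{\text{output}}}{10^6} \cdot P_{\text{output}} + C_{\text{overhead}}$$

where $T_{\text{input}}$ and $T_{\text{output}}$ are the request's input and output token counts, $P_{\text{input}}$ and $P_{\text{output}}$ are the model's prices per 1M tokens, and $C_{\text{overhead}}$ is any fixed per-request overhead.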
Model Routing
Route requests to the cheapest model that can handle the complexity.
from dataclasses import dataclass
from enum import Enum
class QueryComplexity(Enum):
    """Coarse difficulty tiers a query is sorted into before routing."""

    SIMPLE = "simple"
    MODERATE = "moderate"
    COMPLEX = "complex"


@dataclass
class ModelConfig:
    """Pricing and capability description of one routable model."""

    name: str
    input_cost_per_1m: float  # cost per 1M input tokens
    output_cost_per_1m: float  # cost per 1M output tokens
    max_context: int
    capabilities: list[str]  # capability tags matched by ModelRouter


class ModelRouter:
    """Route each query to the cheapest configured model that can handle it.

    A query is first classified with keyword heuristics, then the cheapest
    model advertising the capabilities required for that tier is returned.
    """

    # Greetings/acknowledgements that mark a short query as trivial.
    _SIMPLE_WORDS = frozenset({"hello", "thanks", "yes", "no", "hi"})
    # Phrases that signal a query needing a stronger model.
    _COMPLEX_KEYWORDS = ("analyze", "compare", "explain why", "step by step", "code")
    # Punctuation stripped from word edges so "hi," and "thanks!" still match.
    _STRIP_CHARS = ".,!?;:'\"()[]{}"
    # Capabilities a model must advertise for each complexity tier.
    _REQUIRED_CAPABILITIES = {
        QueryComplexity.SIMPLE: ["basic"],
        QueryComplexity.MODERATE: ["general"],
        QueryComplexity.COMPLEX: ["reasoning", "code"],
    }

    def __init__(self, models: list[ModelConfig]):
        """Store the candidate models.

        Args:
            models: Models the router may choose from.
        """
        self.models = models
        self.complexity_classifier = None

    def classify_complexity(self, query: str) -> QueryComplexity:
        """Classify query complexity to route to appropriate model.

        Args:
            query: Raw user query.

        Returns:
            COMPLEX for queries longer than 50 words or containing a complex
            keyword; SIMPLE for queries under 10 words containing a greeting
            or acknowledgement word; MODERATE otherwise.
        """
        lowered = query.lower()
        words = lowered.split()

        # Complexity signals win over greetings: "hi, analyze this code" must
        # not be sent to the cheapest model just because it says "hi".
        # Substring matching is kept here on purpose so inflections such as
        # "analyzed" or "compares" still escalate; a false positive only
        # costs money, never answer quality.
        if len(words) > 50 or any(kw in lowered for kw in self._COMPLEX_KEYWORDS):
            return QueryComplexity.COMPLEX

        # Whole-word matching: a substring test would treat "this" (contains
        # "hi") or "know" (contains "no") as a greeting.
        tokens = {word.strip(self._STRIP_CHARS) for word in words}
        if len(words) < 10 and not self._SIMPLE_WORDS.isdisjoint(tokens):
            return QueryComplexity.SIMPLE

        return QueryComplexity.MODERATE

    def route(self, query: str) -> ModelConfig:
        """Return the cheapest model able to answer ``query``.

        Raises:
            ValueError: If no configured model has the required capabilities.
        """
        complexity = self.classify_complexity(query)
        return self._get_cheapest(capabilities=self._REQUIRED_CAPABILITIES[complexity])

    def _get_cheapest(self, capabilities: list[str]) -> ModelConfig:
        """Return the lowest-priced model offering every listed capability.

        Price is ranked by the sum of input and output cost per 1M tokens.

        Raises:
            ValueError: If no model provides all of ``capabilities``.
        """
        suitable = [
            model for model in self.models
            if all(cap in model.capabilities for cap in capabilities)
        ]
        if not suitable:
            # min() on an empty list also raises ValueError, but with a
            # message that says nothing about the routing problem.
            raise ValueError(
                f"No configured model provides all required capabilities: {capabilities!r}"
            )
        return min(suitable, key=lambda m: m.input_cost_per_1m + m.output_cost_per_1m)
Prompt Optimization
Reduce token count while maintaining quality.
class PromptOptimizer:
    """Shrink prompts with phrase-level rewrites and estimate token counts."""

    # Politeness phrases removed outright by compress().
    _FILLERS = ("please", "kindly", "I would like you to", "could you")

    def __init__(self):
        """Set up the default long-form -> abbreviation table."""
        self.common_abbreviations = {
            "for example": "e.g.",
            "that is": "i.e.",
            "and so on": "etc.",
            "approximately": "approx",
            "information": "info",
            "application": "app",
            "documentation": "docs",
        }

    def compress(self, prompt: str) -> str:
        """Reduce prompt token count without losing meaning.

        Whitespace is collapsed, phrases in ``common_abbreviations`` are
        abbreviated, and filler phrases are dropped. Phrases are matched
        case-insensitively and only as whole words/phrases, so "pleased" or
        "that issue" are left untouched.

        Args:
            prompt: Prompt text to compress.

        Returns:
            The compressed prompt with single spaces and no leading or
            trailing whitespace.
        """
        import re

        def whole_phrase(phrase: str):
            # Lookarounds instead of \b so phrases that start or end with
            # punctuation still get sensible boundaries.
            return re.compile(
                r"(?<!\w)" + re.escape(phrase) + r"(?!\w)", re.IGNORECASE
            )

        # Normalize first so multi-word phrases match across irregular spacing.
        compressed = " ".join(prompt.split())

        for full, abbr in self.common_abbreviations.items():
            # A callable replacement inserts abbr literally (no backslash
            # template expansion).
            compressed = whole_phrase(full).sub(lambda _m, abbr=abbr: abbr, compressed)

        for filler in self._FILLERS:
            compressed = whole_phrase(filler).sub("", compressed)

        # Removing fillers leaves gaps; collapse them and trim the ends.
        return " ".join(compressed.split())

    def count_tokens(self, text: str, model: str = "gpt-4") -> int:
        """Estimate token count.

        Uses a flat heuristic of roughly 4 characters per token for English;
        ``model`` is accepted but not used by this estimate.
        """
        return len(text) // 4
Semantic Caching
Cache similar queries to avoid redundant API calls.
import hashlib
import numpy as np
from typing import Optional
class SemanticCache:
    """Cache responses keyed by query embedding and serve near-duplicates.

    A lookup is a hit when the cosine similarity between the query embedding
    and a cached embedding is at least ``similarity_threshold``.
    """

    def __init__(self, embedder, similarity_threshold: float = 0.95):
        """Create an empty cache.

        Args:
            embedder: Object whose ``embed(list_of_texts)`` returns one
                embedding vector per text.
            similarity_threshold: Minimum cosine similarity for a cache hit.
        """
        self.embedder = embedder
        self.threshold = similarity_threshold
        self.cache = {}  # embedding tuple -> (query, response, cost)
        self.hits = 0  # number of lookups answered from the cache
        self.saved_cost = 0.0  # summed cost of the calls those hits avoided

    def get(self, query: str) -> Optional[dict]:
        """Return the best cached response for ``query``, or ``None``.

        Returns:
            ``{"response", "cached": True, "similarity"}`` for the most
            similar cached entry at or above the threshold, else ``None``.
        """
        query_embedding = np.asarray(self.embedder.embed([query])[0], dtype=float)
        # Hoisted: the query norm does not change across cached entries.
        query_norm = np.linalg.norm(query_embedding)
        if query_norm == 0:
            # Cosine similarity is undefined for a zero vector.
            return None

        best_match = None
        best_score = 0
        best_cost = 0.0
        for cached_embedding, (_cached_query, response, cost) in self.cache.items():
            cached_vector = np.asarray(cached_embedding, dtype=float)
            cached_norm = np.linalg.norm(cached_vector)
            if cached_norm == 0:
                continue
            similarity = float(
                np.dot(query_embedding, cached_vector) / (query_norm * cached_norm)
            )
            if similarity > best_score and similarity >= self.threshold:
                best_score = similarity
                best_cost = cost
                best_match = {"response": response, "cached": True, "similarity": similarity}

        if best_match is not None:
            # Savings accrue only when a cached answer replaces a paid call.
            self.hits += 1
            self.saved_cost += best_cost
        return best_match

    def set(self, query: str, response: str, cost: float):
        """Store ``response`` and the cost of producing it under ``query``'s embedding."""
        query_embedding = self.embedder.embed([query])[0]
        self.cache[tuple(query_embedding)] = (query, response, cost)

    def savings_report(self) -> dict:
        """Summarize cache size and the spend avoided by cache hits.

        ``estimated_savings`` is the summed original cost of every entry
        served by ``get`` (once per hit). Entries that were stored but never
        hit contribute nothing, since storing them saved no API call.
        """
        return {
            "cached_queries": len(self.cache),
            "estimated_savings": self.saved_cost,
            "cache_hits": self.hits,
        }
Cost Tracking and Budgeting
from dataclasses import dataclass
from datetime import datetime, timedelta
@dataclass
class CostEntry:
    """Record of a single logged LLM request and its computed cost."""

    # When the request was logged.
    timestamp: datetime
    # Which model served it, and how many tokens went in and out.
    model: str
    input_tokens: int
    output_tokens: int
    # Computed cost of the request.
    cost: float
    # Attribution: who made the request and through which endpoint.
    user_id: str
    endpoint: str
class BudgetExceededException(Exception):
    """Raised when the spend logged for the current UTC day exceeds the daily budget."""


class CostTracker:
    """Log per-request LLM cost, enforce a daily budget, and build reports."""

    # Prices per 1M tokens, keyed by model name. Kept at class level so the
    # table is built once rather than on every cost calculation.
    PRICING = {
        "gpt-4o": {"input": 2.50, "output": 10.00},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "claude-3.5-sonnet": {"input": 3.00, "output": 15.00},
    }

    def __init__(self, budget_limit: float = 1000.0):
        """Create a tracker.

        Args:
            budget_limit: Overall budget; the daily budget is this value
                divided by 30.
        """
        self.entries: list[CostEntry] = []
        self.budget_limit = budget_limit
        self.daily_budget = budget_limit / 30

    @staticmethod
    def _utc_now() -> datetime:
        """Return the current UTC time as a naive datetime.

        Equivalent to the deprecated ``datetime.utcnow()``; the result stays
        naive so it remains comparable with timestamps already logged.
        """
        from datetime import timezone

        return datetime.now(timezone.utc).replace(tzinfo=None)

    def log(self, model: str, input_tokens: int, output_tokens: int,
            user_id: str, endpoint: str):
        """Record one request and enforce the daily budget.

        The entry is appended before the budget check, so a request that
        pushes spend over the limit is still recorded.

        Returns:
            The ``CostEntry`` that was appended.

        Raises:
            BudgetExceededException: If today's total now exceeds the daily budget.
        """
        cost = self._calculate_cost(model, input_tokens, output_tokens)
        entry = CostEntry(
            timestamp=self._utc_now(),
            model=model,
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            cost=cost,
            user_id=user_id,
            endpoint=endpoint,
        )
        self.entries.append(entry)
        self._check_budget()
        return entry

    def _calculate_cost(self, model: str, input_tokens: int, output_tokens: int) -> float:
        """Return the cost of a request from the pricing table.

        Models missing from the table are costed at 0.0, with a warning so
        the untracked spend does not go unnoticed.
        """
        prices = self.PRICING.get(model)
        if prices is None:
            import logging

            logging.getLogger(__name__).warning(
                "No pricing configured for model %r; recording cost as 0.0", model
            )
            return 0.0
        return (input_tokens / 1e6 * prices["input"]) + (output_tokens / 1e6 * prices["output"])

    def _check_budget(self):
        """Raise ``BudgetExceededException`` if today's spend exceeds the daily budget."""
        today = self._utc_now().date()
        daily_cost = sum(
            e.cost for e in self.entries
            if e.timestamp.date() == today
        )
        if daily_cost > self.daily_budget:
            raise BudgetExceededException(f"Daily budget exceeded: ${daily_cost:.2f}")

    def report(self, period: str = "daily") -> dict:
        """Return the daily report for ``"daily"``, otherwise the monthly report."""
        if period == "daily":
            return self._daily_report()
        return self._monthly_report()

    def _daily_report(self) -> dict:
        """Summarize the entries logged on the current UTC date."""
        today = self._utc_now().date()
        today_entries = [e for e in self.entries if e.timestamp.date() == today]
        return {
            "date": today.isoformat(),
            "total_cost": sum(e.cost for e in today_entries),
            "total_requests": len(today_entries),
            "by_model": self._group_by_model(today_entries),
            "by_endpoint": self._group_by_endpoint(today_entries),
        }

    def _monthly_report(self) -> dict:
        """Summarize the entries logged in the current UTC calendar month."""
        now = self._utc_now()
        month_entries = [
            e for e in self.entries
            if (e.timestamp.year, e.timestamp.month) == (now.year, now.month)
        ]
        return {
            "month": now.strftime("%Y-%m"),
            "total_cost": sum(e.cost for e in month_entries),
            "total_requests": len(month_entries),
            "by_model": self._group_by_model(month_entries),
            "by_endpoint": self._group_by_endpoint(month_entries),
        }

    @staticmethod
    def _sum_cost_by(entries, attribute: str) -> dict:
        """Return ``{value of attribute: summed cost}`` over ``entries``."""
        totals: dict = {}
        for entry in entries:
            key = getattr(entry, attribute)
            totals[key] = totals.get(key, 0.0) + entry.cost
        return totals

    def _group_by_model(self, entries) -> dict:
        """Return total cost per model name."""
        return self._sum_cost_by(entries, "model")

    def _group_by_endpoint(self, entries) -> dict:
        """Return total cost per endpoint."""
        return self._sum_cost_by(entries, "endpoint")
Cost Optimization Strategies
| Strategy | Potential Savings | Implementation Effort |
|---|---|---|
| Model routing | 50-80% | Medium |
| Semantic caching | 30-60% | Low |
| Prompt compression | 10-30% | Low |
| Batch processing | 20-40% | Medium |
| Self-hosting | 40-70% | High |
| Output length limits | 20-50% | Low |
Proactive cost management ensures sustainable LLM operations as usage scales.