# LLM Agent Operations: Infrastructure and Tool Calling
LLM agents extend language models with tool use, memory, and planning capabilities. Operating these systems in production requires robust infrastructure for tool execution, state management, and error handling.
## Agent Architecture
## Tool Calling Infrastructure
### 1. Tool Registry and Executor
import json
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional
from enum import Enum
class ToolStatus(Enum):
    """Availability states a registered tool can be in."""

    # Ready to be offered to the model and executed.
    AVAILABLE = "available"
    # Out of service.
    UNAVAILABLE = "unavailable"
    # Temporarily throttled.
    RATE_LIMITED = "rate_limited"
@dataclass
class ToolDefinition:
    """A callable tool together with its metadata and usage counters."""

    name: str
    description: str
    parameters: Dict[str, Any]  # emitted verbatim under "parameters" by to_schema()
    function: Callable
    status: ToolStatus = ToolStatus.AVAILABLE
    call_count: int = 0
    error_count: int = 0
    avg_latency_ms: float = 0.0

    def to_schema(self) -> Dict:
        """Return the function-calling schema entry describing this tool."""
        spec = dict(
            name=self.name,
            description=self.description,
            parameters=self.parameters,
        )
        return dict(type="function", function=spec)
class ToolRegistry:
    """Store tool definitions, execute them by name, and track usage statistics."""

    def __init__(self):
        # Maps tool name -> definition; registering an existing name replaces it.
        self.tools: Dict[str, ToolDefinition] = {}

    def register(self, name: str, description: str, parameters: Dict,
                 function: Callable) -> ToolDefinition:
        """Create a ToolDefinition, store it under ``name``, and return it."""
        tool = ToolDefinition(
            name=name,
            description=description,
            parameters=parameters,
            function=function,
        )
        self.tools[name] = tool
        return tool

    def get_available_tools(self) -> List[Dict]:
        """Return the schemas of all tools whose status is AVAILABLE."""
        return [
            t.to_schema() for t in self.tools.values()
            if t.status == ToolStatus.AVAILABLE
        ]

    def execute(self, tool_name: str, arguments: Dict[str, Any]) -> Dict:
        """Run a registered tool and report its result or error as a dict.

        Args:
            tool_name: Name the tool was registered under.
            arguments: Keyword arguments passed to the tool function; ``None``
                is treated as "no arguments".

        Returns:
            ``{"result": ..., "latency_ms": ...}`` on success, otherwise
            ``{"error": message}``. Exceptions raised by the tool are caught
            and reported; they are never propagated to the caller.
        """
        tool = self.tools.get(tool_name)
        if tool is None:
            return {"error": f"Tool '{tool_name}' not found"}
        if tool.status != ToolStatus.AVAILABLE:
            return {"error": f"Tool '{tool_name}' is {tool.status.value}"}

        call_args = arguments or {}
        # perf_counter is monotonic, so the measured latency cannot go negative
        # or jump when the system clock is adjusted (unlike time.time()).
        started = time.perf_counter()
        try:
            result = tool.function(**call_args)
        except Exception as exc:
            # Boundary catch: any tool failure is counted and turned into an
            # error payload so the agent loop can keep going.
            tool.error_count += 1
            return {"error": str(exc)}

        latency = (time.perf_counter() - started) * 1000
        # call_count only counts successful calls; it is the sample count of
        # the running latency mean updated below.
        tool.call_count += 1
        tool.avg_latency_ms += (latency - tool.avg_latency_ms) / tool.call_count
        return {"result": result, "latency_ms": latency}

    def get_stats(self) -> List[Dict]:
        """Return one stats dict per registered tool.

        ``calls`` counts successful executions and ``errors`` counts failed
        ones; the two counters do not overlap.
        """
        return [
            {
                "name": t.name,
                "calls": t.call_count,
                "errors": t.error_count,
                "avg_latency_ms": round(t.avg_latency_ms, 2),
                "status": t.status.value,
            }
            for t in self.tools.values()
        ]
### 2. Agent Memory Manager
from dataclasses import dataclass, field
from typing import List, Dict, Optional
from collections import deque
import json
@dataclass
class MemoryEntry:
    """One remembered piece of text plus bookkeeping about it."""

    content: str
    memory_type: str  # label such as "short_term" or "long_term"
    timestamp: float
    # default_factory gives every entry its own dict instead of a shared one.
    metadata: Dict = field(default_factory=dict)
    importance: float = 0.5
class AgentMemory:
    """Short-term (bounded) and long-term (unbounded) memory for an agent."""

    def __init__(self, short_term_size: int = 20):
        # deque(maxlen=...) silently drops the oldest entry once it is full.
        self.short_term: deque = deque(maxlen=short_term_size)
        self.long_term: List[MemoryEntry] = []
        self.working: List[MemoryEntry] = []

    def add_short_term(self, content: str, metadata: Optional[Dict] = None,
                       importance: float = 0.5):
        """Append an entry to short-term memory.

        Args:
            content: Text to remember.
            metadata: Optional extra data stored with the entry.
            importance: Score later used by ``consolidate`` to decide whether
                the entry is promoted to long-term memory (threshold 0.7).
        """
        entry = MemoryEntry(
            content=content,
            memory_type="short_term",
            timestamp=time.time(),
            importance=importance,
            metadata=metadata or {},
        )
        self.short_term.append(entry)

    def add_long_term(self, content: str, importance: float = 0.5,
                      metadata: Optional[Dict] = None):
        """Append an entry to long-term memory with the given importance."""
        entry = MemoryEntry(
            content=content,
            memory_type="long_term",
            timestamp=time.time(),
            importance=importance,
            metadata=metadata or {},
        )
        self.long_term.append(entry)

    def get_context(self, max_tokens: int = 2000) -> str:
        """Build a newline-joined context string within a token budget.

        Short-term entries are taken newest-first, then up to five long-term
        entries by descending importance; both share the same budget. Tokens
        are approximated by whitespace-separated word count. The collected
        parts are reversed before joining, so long-term entries come first and
        short-term entries appear oldest-to-newest.
        """
        top_long_term = sorted(
            self.long_term, key=lambda e: e.importance, reverse=True
        )[:5]

        parts = []
        used = 0
        for source in (reversed(self.short_term), top_long_term):
            for entry in source:
                cost = len(entry.content.split())
                if used + cost > max_tokens:
                    # Stop this source only; the next one still gets a chance.
                    break
                parts.append(entry.content)
                used += cost
        return "\n".join(reversed(parts))

    def consolidate(self):
        """Promote important short-term entries to long-term memory.

        Only runs when more than 10 short-term entries are held. Entries with
        importance above 0.7 are appended to ``long_term``; entries already
        promoted are skipped, so calling this repeatedly does not duplicate
        them. Promoted entries are shared objects and keep their original
        ``memory_type`` label.
        """
        if len(self.short_term) <= 10:
            return
        promoted = {id(entry) for entry in self.long_term}
        for entry in self.short_term:
            if entry.importance > 0.7 and id(entry) not in promoted:
                self.long_term.append(entry)
                promoted.add(id(entry))
### 3. ReAct Agent Loop
from typing import List, Dict, Any, Optional
class ReActAgent:
    """Agent loop that alternates LLM calls and tool calls until an answer is given."""

    def __init__(self, tool_registry: "ToolRegistry", memory: "AgentMemory",
                 llm_caller: Any):
        self.tools = tool_registry
        self.memory = memory
        self.llm = llm_caller  # must expose generate(prompt) -> str
        self.max_iterations = 10

    def run(self, query: str) -> str:
        """Answer ``query``, calling tools as requested by the LLM.

        Each iteration prompts the LLM, parses its reply, and either returns
        the final answer or executes the requested tool and feeds the
        (truncated) result back through memory. Returns a fixed fallback
        message when ``max_iterations`` is exhausted.
        """
        self.memory.add_short_term(f"User: {query}")
        context = self.memory.get_context()
        tools_schema = self.tools.get_available_tools()

        for _ in range(self.max_iterations):
            prompt = self._build_prompt(context, tools_schema)
            response = self.llm.generate(prompt)
            action = self._parse_action(response)

            if action["type"] == "final_answer":
                self.memory.add_short_term(f"Agent: {action['answer']}")
                return action["answer"]

            if action["type"] == "tool_call":
                result = self.tools.execute(action["tool"], action["arguments"])
                # default=str keeps the loop alive when a tool returns something
                # JSON cannot encode; the text is only an observation for the LLM.
                observation = json.dumps(result, default=str)
                self.memory.add_short_term(
                    f"Tool {action['tool']}: {observation[:200]}"
                )
                context = self.memory.get_context()

        return "I was unable to complete the task within the allowed iterations."

    def _build_prompt(self, context: str, tools: List[Dict]) -> str:
        """Render the prompt from the memory context and the tool schemas."""
        tools_desc = "\n".join(
            f"- {t['function']['name']}: {t['function']['description']}"
            for t in tools
        )
        return (
            "You are a helpful assistant with access to tools.\n"
            "Context:\n"
            f"{context}\n"
            "Available Tools:\n"
            f"{tools_desc}\n"
            "Think step by step. Use a tool if needed, or provide a final answer.\n"
            "Format: Thought: [reasoning] Action: [tool_name] Input: [arguments]\n"
            "Or: Thought: [reasoning] Final Answer: [answer]"
        )

    def _parse_action(self, response: str) -> Dict[str, Any]:
        """Turn an LLM reply into a ``final_answer`` or ``tool_call`` action.

        A reply containing "Final Answer:" always wins. Otherwise "Action:" and
        "Input:" markers are accepted both on separate lines and inline on one
        line (the form the prompt asks for). A reply with neither marker is
        treated as the final answer itself.
        """
        if "Final Answer:" in response:
            answer = response.split("Final Answer:")[-1].strip()
            return {"type": "final_answer", "answer": answer}
        if "Action:" not in response:
            return {"type": "final_answer", "answer": response}

        tool = None
        arguments = {}
        for raw_line in response.splitlines():
            line = raw_line.strip()
            _, found_action, after_action = line.rpartition("Action:")
            if found_action:
                # "Action: name Input: {...}" may share one line.
                name, found_input, raw_args = after_action.partition("Input:")
                if name.strip():
                    tool = name.strip()
                if found_input:
                    arguments = self._decode_arguments(raw_args)
            elif line.startswith("Input:"):
                arguments = self._decode_arguments(line[len("Input:"):])
        return {"type": "tool_call", "tool": tool, "arguments": arguments}

    @staticmethod
    def _decode_arguments(raw: str) -> Dict[str, Any]:
        """Decode tool input into keyword arguments.

        JSON objects are used as-is; any other JSON value, or text that is not
        valid JSON, is wrapped as ``{"input": value}`` so it can always be
        passed as keyword arguments.
        """
        text = raw.strip()
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            return {"input": text}
        return parsed if isinstance(parsed, dict) else {"input": parsed}
## Key Concepts
| Component | Purpose | Production Concern |
|---|---|---|
| Tool Registry | Manage available tools | Rate limiting, auth |
| Tool Executor | Run tool calls safely | Sandboxing, timeouts |
| Memory Store | Maintain conversation state | Persistence, size limits |
| Planner | Decompose complex tasks | Latency, token cost |
| ReAct Loop | Iterate until complete | Max iterations, cost cap |
## Best Practices
- Sandbox tool execution to prevent code injection
- Set token budgets for agent loops to control costs
- Cache tool results where deterministic
- Implement circuit breakers for unreliable external tools
- Log every tool call for debugging and auditing