LLM Infrastructure: GPU Clusters, Networking, and Storage
LLM infrastructure requires specialized hardware configurations including GPU clusters with high-bandwidth interconnects, distributed storage for training data, and network topologies optimized for collective communication patterns.
Infrastructure Architecture
Cluster Configuration
1. GPU Cluster Manager
from dataclasses import dataclass, field
from typing import List, Dict, Optional
from enum import Enum
class GPUType(Enum):
    """GPU models a cluster node can be equipped with.

    Each member's value is the human-readable model label.
    """

    # A100 family, distinguished by memory size.
    A100_40GB = "A100-40GB"
    A100_80GB = "A100-80GB"

    # H100 family.
    H100_80GB = "H100-80GB"
    H100_NVL = "H100-NVL"
@dataclass
class GPUNode:
    """Description of a single cluster machine and the GPUs installed in it."""

    node_id: str
    gpu_type: GPUType
    gpu_count: int
    # Memory of ONE GPU; total_gpu_memory scales it by gpu_count.
    gpu_memory_gb: float
    cpu_cores: int
    ram_gb: float
    network_bandwidth_gbps: float
    interconnect: str = "NVLink"

    @property
    def total_gpu_memory(self) -> float:
        """Return the combined memory of every GPU on this node.

        The result is in the same unit as ``gpu_memory_gb``.
        """
        per_gpu = self.gpu_memory_gb
        return self.gpu_count * per_gpu
@dataclass
class ClusterConfig:
    """Static description of a cluster: its nodes plus fabric, storage and scheduler labels."""

    name: str
    # default_factory gives every config its own list instead of a shared one.
    nodes: List[GPUNode] = field(default_factory=list)
    network_fabric: str = "InfiniBand HDR"
    storage_type: str = "Lustre"
    scheduler: str = "Slurm"

    @property
    def total_gpus(self) -> int:
        """Return the number of GPUs summed over all nodes (0 for an empty cluster)."""
        gpu_total = 0
        for node in self.nodes:
            gpu_total += node.gpu_count
        return gpu_total

    @property
    def total_memory_tb(self) -> float:
        """Return the aggregate GPU memory of all nodes, converted from GB to TB.

        The conversion divides by 1000 (decimal terabytes).
        """
        memory_gb = 0
        for node in self.nodes:
            memory_gb += node.total_gpu_memory
        return memory_gb / 1000
class ClusterManager:
    """Hand out cluster nodes to jobs and derive utilization and cost figures.

    Attributes:
        config: Cluster description; its ``nodes`` are the allocatable pool.
        allocations: Mapping of job id to the list of node ids that job holds.
    """

    def __init__(self, config: "ClusterConfig"):
        self.config = config
        self.allocations: Dict[str, List[str]] = {}

    def allocate_nodes(self, job_id: str, num_nodes: int) -> List[str]:
        """Reserve ``num_nodes`` free nodes for ``job_id``.

        Nodes are taken in the order they appear in ``config.nodes``. If the
        job already holds nodes, the new ones are added to its existing set.

        Args:
            job_id: Identifier of the requesting job.
            num_nodes: Number of nodes to reserve.

        Returns:
            The ids of the newly reserved nodes, or an empty list when
            ``num_nodes`` is not positive or too few nodes are free (in which
            case nothing is reserved).
        """
        # A negative count would otherwise slice ``available[:-k]`` and grab
        # almost the whole cluster.
        if num_nodes <= 0:
            return []

        # ``allocations`` is keyed by job id, so the busy node ids live in its
        # values, not its keys.
        busy = {
            node_id
            for held in self.allocations.values()
            for node_id in held
        }
        available = [
            n.node_id for n in self.config.nodes if n.node_id not in busy
        ]
        if len(available) < num_nodes:
            return []

        allocated = available[:num_nodes]
        # Extend rather than overwrite so nodes the job already holds are not
        # silently dropped from the books.
        self.allocations.setdefault(job_id, []).extend(allocated)
        return allocated

    def get_cluster_stats(self) -> Dict:
        """Summarize cluster size and current GPU usage.

        Returns:
            A dict with ``total_nodes``, ``total_gpus``, ``allocated_gpus``,
            ``available_gpus`` and ``utilization`` (allocated / total, with
            the denominator clamped to 1 so an empty cluster yields 0).
        """
        nodes = self.config.nodes
        total_gpus = sum(n.gpu_count for n in nodes)

        # Look up each allocated node's own GPU count; nodes need not be
        # homogeneous, and the cluster may be empty.
        gpus_by_node = {n.node_id: n.gpu_count for n in nodes}
        allocated_gpus = sum(
            gpus_by_node.get(node_id, 0)
            for held in self.allocations.values()
            for node_id in held
        )

        return {
            "total_nodes": len(nodes),
            "total_gpus": total_gpus,
            "allocated_gpus": allocated_gpus,
            "available_gpus": total_gpus - allocated_gpus,
            "utilization": allocated_gpus / max(total_gpus, 1),
        }

    def estimate_training_cost(self, model_params_b: float, epochs: int,
                               cost_per_gpu_hour: float = 2.50) -> Dict:
        """Produce a rough GPU-hour, cost and duration estimate for a training run.

        Args:
            model_params_b: Model size; presumably in billions of parameters
                (inferred from the name; confirm with callers).
            epochs: Number of training epochs.
            cost_per_gpu_hour: Price of one GPU-hour in USD.

        Returns:
            A dict with ``gpu_hours``, ``cost_per_gpu_hour``,
            ``total_cost_usd`` and ``estimated_days``. ``estimated_days`` is
            ``inf`` when the cluster has no GPUs.
        """
        # Heuristic: 10 GPU-hours per unit of model size per epoch.
        gpu_hours = model_params_b * epochs * 10
        total_cost = gpu_hours * cost_per_gpu_hour

        total_gpus = self.config.total_gpus
        if total_gpus > 0:
            estimated_days = gpu_hours / (total_gpus * 24)
        else:
            # No GPUs means the run can never finish; avoid dividing by zero.
            estimated_days = float("inf")

        return {
            "gpu_hours": gpu_hours,
            "cost_per_gpu_hour": cost_per_gpu_hour,
            "total_cost_usd": total_cost,
            "estimated_days": estimated_days,
        }
2. Network Topology
@dataclass
class NetworkLink:
    """A single connection between two endpoints of the network."""

    # Endpoint identifiers.
    source: str
    destination: str

    # Link characteristics; units follow the field-name suffixes.
    bandwidth_gbps: float
    latency_us: float
class NetworkTopology:
    """Record network links and provide closed-form communication estimates.

    The estimators use only their own arguments; they do not read the
    recorded ``links``.
    """

    def __init__(self):
        self.links: List[NetworkLink] = []

    def add_link(self, source: str, dest: str, bandwidth: float, latency: float):
        """Append a link from ``source`` to ``dest`` with the given bandwidth and latency."""
        self.links.append(NetworkLink(source, dest, bandwidth, latency))

    def compute_bisection_bandwidth(self, num_nodes: int,
                                    link_bandwidth_gbps: float = 400) -> float:
        """Return ``num_nodes * link_bandwidth_gbps / 2``.

        Args:
            num_nodes: Number of nodes in the fabric.
            link_bandwidth_gbps: Per-node link bandwidth; the result is in the
                same unit.
        """
        return num_nodes * link_bandwidth_gbps / 2

    def estimate_allreduce_time(self, message_size_gb: float,
                                num_gpus: int,
                                bandwidth: float = 400,
                                latency_us: float = 1) -> float:
        """Estimate the duration of a ring all-reduce.

        Computes ``2(n-1)/n * size/bandwidth`` for the transfer plus
        ``(n-1) * latency`` for the hops.

        Args:
            message_size_gb: Message size in GB.
            num_gpus: Number of participating GPUs.
            bandwidth: Link bandwidth. The arithmetic yields milliseconds
                when this is in GB/s.
                NOTE(review): the default of 400 matches the 400 Gbps figure
                used for the fabric elsewhere, which would be 50 GB/s;
                confirm the intended unit.
            latency_us: Per-hop latency in microseconds.

        Returns:
            Estimated time in milliseconds (given the units above); 0.0 when
            fewer than two GPUs participate.
        """
        n = num_gpus
        # With zero or one participant there is nothing to exchange; this also
        # avoids dividing by n == 0.
        if n <= 1:
            return 0.0

        # GB / (GB/s) gives seconds; the factor 1000 converts to milliseconds.
        transfer_ms = message_size_gb * 1000 * (2 * (n - 1)) / (n * bandwidth)
        # Microseconds to milliseconds.
        hop_latency_ms = latency_us * (n - 1) / 1000
        return transfer_ms + hop_latency_ms
Key Formulas
AllReduce Latency (Ring)
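$$T_{\text{allreduce}} = \frac{2(N-1)}{N} \cdot \frac{M}{B} + (N-1)\,\alpha$$
Here,
- $N$ = Number of GPUs
- $M$ = Message size in GB
- $B$ = Network bandwidth in GB/s
- $\alpha$ = Per-hop latency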
Model Parallel Efficiency
$$E = \frac{T_1}{N \cdot T_N}$$
Here,
- $T_1$ = Time on single GPU
- $T_N$ = Time on N GPUs
- $N$ = Number of GPUs used
Hardware Specifications
| Component | Specification | Purpose | Cost Range |
|---|---|---|---|
| NVIDIA H100 | 80GB HBM3, 3.35TB/s | Training/Inference | $25,000-30,000 |
| NVIDIA A100 | 80GB HBM2e, 2TB/s | Training | $10,000-15,000 |
| InfiniBand HDR / NDR | 200Gbps (HDR) or 400Gbps (NDR) per port | Inter-node GPU communication | $500-1,000/port |
| NVLink 4.0 | 900GB/s | Intra-node GPU link | Built into GPU |
| Lustre Storage | 100+ GB/s | Parallel file system | $0.10-0.20/GB/month |
Best Practices
- Use NVLink for intra-node communication (8 GPUs per node)
- InfiniBand HDR for inter-node with fat-tree topology
- Parallel storage (Lustre/GPFS) for checkpoint and data loading
- GPU utilization monitoring to identify idle resources
- Auto-scaling for cloud-based clusters to optimize cost