LLM Cloud Platforms: AWS, GCP, and Azure AI Services
Cloud platforms provide managed infrastructure for LLM training and inference. Understanding the tradeoffs between AWS, GCP, and Azure enables optimal platform selection for specific workload requirements.
Cloud Platform Pipeline
Cloud Provider Implementations
1. AWS SageMaker for LLMs
from dataclasses import dataclass
from typing import Dict, Optional
@dataclass
class AWSEndpointConfig:
    """Settings recorded for a single SageMaker inference endpoint.

    This is a plain value object: it performs no validation and makes no
    AWS calls. All five fields are required and positional, in the order
    declared below.
    """

    # SageMaker instance size, e.g. "ml.g5.12xlarge".
    instance_type: str
    # Number of instances backing the endpoint.
    instance_count: int
    # Location of the model artifacts (callers in this file pass an S3 URI).
    model_data: str
    # URI of the serving container image.
    container: str
    # Environment variables handed to the serving container.
    environment: Dict[str, str]
class AWSSageMakerLLM:
    """In-memory planner for hosting LLMs on AWS SageMaker endpoints.

    The class only records endpoint configurations and computes cost
    estimates from a built-in price table; it never calls AWS.
    """

    # Illustrative on-demand prices in USD per instance-hour. "count" is the
    # number of accelerators on the instance.
    _PRICING: Dict[str, Dict] = {
        "ml.g5.2xlarge": {"hourly": 1.21, "gpu": "A10G-24GB", "count": 1},
        "ml.g5.12xlarge": {"hourly": 7.09, "gpu": "A10G-24GB", "count": 4},
        "ml.p4d.24xlarge": {"hourly": 32.77, "gpu": "A100-40GB", "count": 8},
        "ml.trn1.32xlarge": {"hourly": 21.50, "gpu": "Trainium", "count": 16},
    }

    # NOTE(review): this ECR account hosts the AWS Deep Learning Containers
    # in most commercial regions; some opt-in / China / GovCloud regions use
    # a different account or domain suffix -- confirm before deploying there.
    _DLC_ACCOUNT = "763104351884"
    _TGI_REPOSITORY = "huggingface-pytorch-tgi"

    # Used for instance types missing from the price table; matches the GPU
    # count of the default instance type (ml.g5.12xlarge).
    _DEFAULT_NUM_GPUS = 4

    def __init__(self, region: str = "us-east-1"):
        """Create a planner bound to one AWS region.

        Args:
            region: AWS region used when building the container image URI.
        """
        self.region = region
        self.endpoints: Dict[str, AWSEndpointConfig] = {}

    def _container_uri(self) -> str:
        """Return the TGI image URI for this planner's region."""
        return (
            f"{self._DLC_ACCOUNT}.dkr.ecr.{self.region}.amazonaws.com/"
            f"{self._TGI_REPOSITORY}"
        )

    def _gpu_count(self, instance_type: str) -> int:
        """Return the accelerator count to shard the model across.

        Falls back to ``_DEFAULT_NUM_GPUS`` for instance types that are not
        in the price table.
        """
        spec = self._PRICING.get(instance_type)
        if spec is None:
            return self._DEFAULT_NUM_GPUS
        # NOTE(review): for ml.trn1.* this is a Trainium device count, not a
        # GPU count; the GPU TGI image is presumably not the right container
        # for that instance family anyway -- verify before use.
        return spec["count"]

    def create_endpoint(self, name: str, model_s3_uri: str,
                        instance_type: str = "ml.g5.12xlarge",
                        hf_model_id: str = "meta-llama/Llama-2-70b-chat-hf") -> Dict:
        """Record the configuration for a new single-instance endpoint.

        The container URI is built for ``self.region`` and ``SM_NUM_GPUS``
        is derived from ``instance_type`` instead of being fixed at 4.

        Args:
            name: Endpoint name; replaces any existing entry with that name.
            model_s3_uri: Location of the model artifacts.
            instance_type: SageMaker instance type to host the model.
            hf_model_id: Value for the container's ``HF_MODEL_ID`` variable.

        Returns:
            ``{"status": "creating", "endpoint_name": name}``.
        """
        config = AWSEndpointConfig(
            instance_type=instance_type,
            instance_count=1,
            model_data=model_s3_uri,
            container=self._container_uri(),
            environment={
                "HF_MODEL_ID": hf_model_id,
                "SM_NUM_GPUS": str(self._gpu_count(instance_type)),
            },
        )
        self.endpoints[name] = config
        return {"status": "creating", "endpoint_name": name}

    def get_pricing(self, instance_type: str) -> Dict:
        """Look up the price-table entry for an instance type.

        Args:
            instance_type: SageMaker instance type.

        Returns:
            A fresh dict with ``hourly``, ``gpu`` and ``count`` keys. Unknown
            instance types yield ``hourly`` 0, ``gpu`` "unknown", ``count`` 0,
            so every result has the same keys.
        """
        spec = self._PRICING.get(instance_type)
        if spec is None:
            return {"hourly": 0, "gpu": "unknown", "count": 0}
        # Copy so callers cannot mutate the shared class-level table.
        return dict(spec)

    def estimate_monthly_cost(self, instance_type: str, hours_per_day: float = 8) -> Dict:
        """Estimate the cost of running one instance part-time.

        A month is taken as 30 days and a year as 12 such months. Unknown
        instance types are priced at 0 (see ``get_pricing``).

        Args:
            instance_type: SageMaker instance type.
            hours_per_day: Hours the instance runs each day.

        Returns:
            Dict with ``instance_type``, ``hourly_cost``, ``monthly_cost``
            and ``annual_cost``; the last two are rounded to 2 decimals.
        """
        hourly = self.get_pricing(instance_type)["hourly"]
        monthly = hourly * hours_per_day * 30
        return {
            "instance_type": instance_type,
            "hourly_cost": hourly,
            "monthly_cost": round(monthly, 2),
            "annual_cost": round(monthly * 12, 2),
        }
2. GCP Vertex AI for LLMs
class GCPVertexAILLM:
    """In-memory planner for serving LLMs on GCP Vertex AI.

    The class only records deployment settings and computes cost estimates
    from a built-in price table; it never calls Google Cloud.
    """

    # Illustrative on-demand prices in USD per machine-hour. "count" is the
    # number of attached accelerators.
    _PRICING: Dict[str, Dict] = {
        "a2-highgpu-1g": {"hourly": 3.67, "gpu": "A100-40GB", "count": 1},
        "a2-highgpu-8g": {"hourly": 29.39, "gpu": "A100-40GB", "count": 8},
        "a2-megagpu-16g": {"hourly": 55.74, "gpu": "A100-40GB", "count": 16},
        "g2-standard-8": {"hourly": 1.89, "gpu": "L4-24GB", "count": 1},
    }

    # Maps the price table's GPU label to the accelerator-type name stored
    # in the deployment record.
    _ACCELERATOR_TYPES: Dict[str, str] = {
        "A100-40GB": "NVIDIA_TESLA_A100",
        "L4-24GB": "NVIDIA_L4",
    }

    # Recorded when the machine type is not in the price table, rather than
    # claiming a specific GPU configuration.
    _UNKNOWN_ACCELERATOR = "ACCELERATOR_TYPE_UNSPECIFIED"

    # Multiplier turning an hourly price into a per-1k-request price.
    # NOTE(review): 0.1 presumably assumes ~10,000 requests per machine-hour;
    # confirm against measured throughput.
    _COST_PER_1K_REQUESTS_FACTOR = 0.1

    def __init__(self, project_id: str, region: str = "us-central1"):
        """Create a planner for one GCP project and region.

        Args:
            project_id: GCP project identifier.
            region: GCP region for deployments.
        """
        self.project_id = project_id
        self.region = region
        self.endpoints: Dict[str, Dict] = {}

    def deploy_model(self, model_id: str, machine_type: str = "a2-highgpu-8g",
                     min_replicas: int = 1, max_replicas: int = 4) -> Dict:
        """Record the deployment settings for a model.

        The accelerator type and count are derived from ``machine_type``
        (for example ``g2-standard-8`` records one ``NVIDIA_L4``) instead of
        always recording eight A100s. Machine types missing from the price
        table are recorded as ``ACCELERATOR_TYPE_UNSPECIFIED`` with count 0.

        Args:
            model_id: Model identifier; also the key in ``self.endpoints``,
                so redeploying the same id replaces the earlier record.
            machine_type: Machine type to serve on.
            min_replicas: Lower autoscaling bound (stored, not validated).
            max_replicas: Upper autoscaling bound (stored, not validated).

        Returns:
            ``{"status": "deploying", "endpoint": model_id}``.
        """
        spec = self._PRICING.get(machine_type)
        if spec is None:
            accelerator_type = self._UNKNOWN_ACCELERATOR
            accelerator_count = 0
        else:
            accelerator_type = self._ACCELERATOR_TYPES[spec["gpu"]]
            accelerator_count = spec["count"]

        endpoint_config = {
            "model_id": model_id,
            "machine_type": machine_type,
            "min_replicas": min_replicas,
            "max_replicas": max_replicas,
            "accelerator_type": accelerator_type,
            "accelerator_count": accelerator_count,
        }
        self.endpoints[model_id] = endpoint_config
        return {"status": "deploying", "endpoint": model_id}

    def get_pricing(self, machine_type: str) -> Dict:
        """Look up the price-table entry for a machine type.

        Args:
            machine_type: Machine type name.

        Returns:
            A fresh dict with ``hourly``, ``gpu`` and ``count`` keys. Unknown
            machine types yield ``hourly`` 0, ``gpu`` "unknown", ``count`` 0,
            so every result has the same keys.
        """
        spec = self._PRICING.get(machine_type)
        if spec is None:
            return {"hourly": 0, "gpu": "unknown", "count": 0}
        # Copy so callers cannot mutate the shared class-level table.
        return dict(spec)

    def estimate_cost(self, machine_type: str, usage_hours: float = 240) -> Dict:
        """Estimate monthly and per-request cost for one machine.

        Unknown machine types are priced at 0 (see ``get_pricing``).

        Args:
            machine_type: Machine type name.
            usage_hours: Hours of use in the month (default 240).

        Returns:
            Dict with ``machine_type``, ``monthly_cost`` (2 decimals) and
            ``cost_per_1k_requests`` (4 decimals).
        """
        hourly = self.get_pricing(machine_type)["hourly"]
        return {
            "machine_type": machine_type,
            "monthly_cost": round(hourly * usage_hours, 2),
            "cost_per_1k_requests": round(
                hourly * self._COST_PER_1K_REQUESTS_FACTOR, 4
            ),
        }
3. Azure OpenAI Service
class AzureOpenAILLM:
    """In-memory planner for Azure OpenAI deployments and token costs.

    The class only records deployment settings and computes cost estimates
    from a built-in price table; it never calls Azure.
    """

    # Illustrative prices in USD per 1,000 tokens, keyed by model name.
    _PRICING: Dict[str, Dict[str, float]] = {
        "gpt-4": {"input": 0.03, "output": 0.06},
        "gpt-4-turbo": {"input": 0.01, "output": 0.03},
        "gpt-35-turbo": {"input": 0.0015, "output": 0.002},
        "gpt-4o": {"input": 0.005, "output": 0.015},
    }

    # Price applied to models missing from the table. This is a non-zero
    # placeholder, so estimates for unlisted models are only indicative.
    _FALLBACK_PRICING: Dict[str, float] = {"input": 0.001, "output": 0.002}

    def __init__(self, subscription_id: str, resource_group: str):
        """Create a planner for one subscription and resource group.

        Args:
            subscription_id: Azure subscription identifier.
            resource_group: Azure resource group name.
        """
        self.subscription_id = subscription_id
        self.resource_group = resource_group
        self.deployments: Dict[str, Dict] = {}

    def deploy_model(self, model_name: str, deployment_name: str,
                     capacity: int = 10) -> Dict:
        """Record a model deployment under ``deployment_name``.

        Args:
            model_name: Model to deploy.
            deployment_name: Key in ``self.deployments``; redeploying the
                same name replaces the earlier record.
            capacity: Capacity value, stored both at the top level and
                inside the ``sku`` entry.

        Returns:
            ``{"status": "deploying", "deployment": deployment_name}``.
        """
        deployment = {
            "model_name": model_name,
            "deployment_name": deployment_name,
            "capacity": capacity,
            "sku": {"name": "Standard", "capacity": capacity},
        }
        self.deployments[deployment_name] = deployment
        return {"status": "deploying", "deployment": deployment_name}

    def get_pricing(self, model: str) -> Dict:
        """Return per-1,000-token prices for a model.

        Args:
            model: Model name as keyed in the price table, for example
                ``"gpt-35-turbo"``.

        Returns:
            A fresh dict with ``input`` and ``output`` prices. Models not in
            the table get the fallback 0.001 / 0.002 instead of an error.
        """
        # Copy so callers cannot mutate the shared class-level tables.
        return dict(self._PRICING.get(model, self._FALLBACK_PRICING))

    def estimate_monthly_cost(self, model: str, input_tokens: int,
                              output_tokens: int) -> Dict:
        """Estimate the cost of a given token volume.

        Args:
            model: Model name (see ``get_pricing`` for the fallback rule).
            input_tokens: Number of prompt tokens; must be >= 0.
            output_tokens: Number of completion tokens; must be >= 0.

        Returns:
            Dict with ``model``, ``input_cost_usd``, ``output_cost_usd`` and
            ``total_cost_usd``, each cost rounded to 4 decimals.

        Raises:
            ValueError: If either token count is negative, which would
                otherwise produce a negative cost.
        """
        if input_tokens < 0 or output_tokens < 0:
            raise ValueError(
                "token counts must be non-negative, got "
                f"input_tokens={input_tokens}, output_tokens={output_tokens}"
            )

        pricing = self.get_pricing(model)
        input_cost = (input_tokens / 1000) * pricing["input"]
        output_cost = (output_tokens / 1000) * pricing["output"]
        return {
            "model": model,
            "input_cost_usd": round(input_cost, 4),
            "output_cost_usd": round(output_cost, 4),
            "total_cost_usd": round(input_cost + output_cost, 4),
        }
Key Formulas
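Cloud Cost Optimization
$$C_{\text{total}} = C_{\text{od}} \left[ \left(1 - f_{\text{spot}} - f_{\text{res}}\right) + f_{\text{spot}} \left(1 - d_{\text{spot}}\right) + f_{\text{res}} \left(1 - d_{\text{res}}\right) \right]$$
Here,
- $C_{\text{od}}$ = On-demand cost baseline
- $f_{\text{spot}}$ = Fraction of work on spot instances
- $d_{\text{spot}}$ = Spot discount rate
- $f_{\text{res}}$ = Fraction on reserved instances
- $d_{\text{res}}$ = Reserved discount rate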
Platform Comparison
| Feature | AWS SageMaker | GCP Vertex AI | Azure OpenAI |
|---|---|---|---|
| GPU Options | A10G, A100, Trainium | A100, L4, TPU | A100 (managed) |
| Managed Serving | Yes (TGI, DLC) | Yes (Triton) | Yes (API) |
| Spot Pricing | Up to 70% off | Up to 80% off | Limited |
| Fine-Tuning | Yes | Yes | Yes (API) |
| Cost (A100/hr) | ~$33 (8×A100, ml.p4d.24xlarge) | $3-29 | $0.03-0.06/1K tok |
Best Practices
- Use reserved instances for predictable workloads (30-50% savings)
- Spot instances for training (up to 70% discount)
- Auto-scaling based on queue depth and latency
- Multi-cloud strategy to avoid vendor lock-in
- Cost monitoring with daily budget alerts