LLM Edge Deployment: Mobile, IoT, and Edge Inference
Edge deployment brings LLM capabilities to devices without cloud connectivity. This requires aggressive model compression, efficient runtime engines, and careful memory management for resource-constrained environments.
Edge Deployment Pipeline
Edge Inference Engine
1. Model Optimization for Edge
from dataclasses import dataclass
from typing import List, Dict
from enum import Enum
class TargetPlatform(Enum):
    """Deployment targets known to the edge tooling.

    Each member's value is the lowercase string identifier of the platform.
    """

    # Mobile operating systems
    ANDROID = "android"
    IOS = "ios"

    # Single-board computers and microcontrollers
    RASPBERRY_PI = "raspberry_pi"
    ESP32 = "esp32"

    # In-browser execution
    BROWSER = "browser"
@dataclass
class EdgeProfile:
    """Resource budget and supported numeric precisions for one platform."""

    # Which deployment target this profile describes.
    platform: TargetPlatform

    # Budgets, expressed in megabytes (per the field names).
    max_model_size_mb: float
    max_ram_mb: float

    # Precision flags: True when the platform can run that format.
    supports_fp16: bool
    supports_int8: bool
    supports_int4: bool
class EdgeOptimizer:
    """Build compression plans and memory estimates for edge targets.

    All decisions are driven by the static ``PLATFORM_PROFILES`` table,
    which maps every ``TargetPlatform`` member to an ``EdgeProfile``.
    """

    # Per-platform budgets and supported precisions. Every TargetPlatform
    # member has an entry so that lookups below cannot raise KeyError for a
    # valid enum value.
    PLATFORM_PROFILES = {
        TargetPlatform.ANDROID: EdgeProfile(
            platform=TargetPlatform.ANDROID,
            max_model_size_mb=200, max_ram_mb=512,
            supports_fp16=True, supports_int8=True, supports_int4=True
        ),
        TargetPlatform.IOS: EdgeProfile(
            platform=TargetPlatform.IOS,
            max_model_size_mb=150, max_ram_mb=400,
            supports_fp16=True, supports_int8=True, supports_int4=False
        ),
        TargetPlatform.RASPBERRY_PI: EdgeProfile(
            platform=TargetPlatform.RASPBERRY_PI,
            max_model_size_mb=50, max_ram_mb=256,
            supports_fp16=True, supports_int8=True, supports_int4=True
        ),
        # Values follow the ESP32 row of the platform comparison table in
        # this document: 2 MB model size, 520 KB RAM, INT4 only.
        TargetPlatform.ESP32: EdgeProfile(
            platform=TargetPlatform.ESP32,
            max_model_size_mb=2, max_ram_mb=520 / 1024,
            supports_fp16=False, supports_int8=False, supports_int4=True
        ),
        TargetPlatform.BROWSER: EdgeProfile(
            platform=TargetPlatform.BROWSER,
            max_model_size_mb=50, max_ram_mb=200,
            supports_fp16=True, supports_int8=False, supports_int4=False
        ),
    }

    def get_optimization_plan(self, platform: TargetPlatform,
                              model_size_mb: float) -> Dict:
        """Return an ordered list of optimization steps for ``platform``.

        Args:
            platform: Target whose profile determines the plan.
            model_size_mb: Current model size in megabytes.

        Returns:
            A dict with ``"target"`` (the platform's string value) and
            ``"steps"`` (a list of step dicts, each carrying a ``"type"``
            key plus any step-specific parameters).
        """
        profile = self.PLATFORM_PROFILES[platform]
        steps = []

        if model_size_mb > profile.max_model_size_mb:
            # How many times too large the model is for this platform.
            reduction_needed = model_size_mb / profile.max_model_size_mb
            if profile.supports_int4 and reduction_needed > 4:
                steps.append({"type": "quantize", "bits": 4})
            elif profile.supports_int8:
                steps.append({"type": "quantize", "bits": 8})
            elif profile.supports_int4:
                # INT8 is unavailable; fall back to the only supported
                # quantized format rather than skipping quantization.
                steps.append({"type": "quantize", "bits": 4})
            # An over-budget model is always pruned as well.
            steps.append({"type": "prune", "sparsity": 0.3})

        if profile.supports_fp16:
            steps.append({"type": "convert_fp16"})

        # Graph optimization is applied on every platform.
        steps.append({"type": "graph_optimize"})
        return {"target": platform.value, "steps": steps}

    def estimate_memory(self, params: int, platform: TargetPlatform) -> Dict:
        """Estimate the weight footprint of a model at several precisions.

        Args:
            params: Number of model parameters.
            platform: Target whose RAM budget is used for ``"fits_device"``.

        Returns:
            A dict with the footprint in GiB (rounded to 3 decimals) under
            ``"fp32_gb"``, ``"fp16_gb"``, ``"int8_gb"`` and ``"int4_gb"``,
            plus ``"fits_device"``: whether the unrounded FP16 footprint is
            within the platform's ``max_ram_mb``.
        """
        profile = self.PLATFORM_PROFILES[platform]
        bytes_per_gib = 1024 ** 3

        # Bytes per parameter: 4 (fp32), 2 (fp16), 1 (int8), 0.5 (int4).
        fp32_gb = params * 4 / bytes_per_gib
        fp16_gb = params * 2 / bytes_per_gib
        int8_gb = params * 1 / bytes_per_gib
        int4_gb = params * 0.5 / bytes_per_gib

        return {
            "fp32_gb": round(fp32_gb, 3),
            "fp16_gb": round(fp16_gb, 3),
            "int8_gb": round(int8_gb, 3),
            "int4_gb": round(int4_gb, 3),
            # NOTE: the fit check uses the FP16 footprint regardless of
            # which precisions the platform actually supports.
            "fits_device": fp16_gb <= profile.max_ram_mb / 1024
        }
2. On-Device Inference Manager
import time
from dataclasses import dataclass
from typing import Optional, Dict
@dataclass
class InferenceResult:
    """Outcome of a single generation call, with basic performance figures."""

    # The generated text.
    text: str
    # Count of tokens attributed to the generated text.
    tokens_generated: int

    # Time taken by the call, in milliseconds.
    latency_ms: float
    # Reported memory usage, in megabytes.
    memory_used_mb: float
    # Generation throughput.
    tokens_per_second: float
class EdgeInferenceEngine:
    """Minimal on-device inference wrapper with timing and benchmarking.

    The model loading and inference routines in this class are stubs:
    ``load_model`` only flips a flag and ``_run_inference`` returns a
    canned string built from the prompt.
    """

    def __init__(self, model_path: str, max_tokens: int = 256):
        """Store configuration; the model is not loaded until ``load_model``.

        Args:
            model_path: Location of the model to run.
            max_tokens: Stored on the instance; not read by any method of
                this class.
        """
        self.model_path = model_path
        self.max_tokens = max_tokens
        self.model_loaded = False

    def load_model(self) -> bool:
        """Mark the model as loaded and return ``True``."""
        self.model_loaded = True
        return True

    def generate(self, prompt: str, max_new_tokens: int = 100) -> InferenceResult:
        """Run inference on ``prompt`` and report timing statistics.

        Args:
            prompt: Input text.
            max_new_tokens: Token limit forwarded to ``_run_inference``.

        Returns:
            An ``InferenceResult`` with the generated text, a
            whitespace-split token count, latency in milliseconds, and
            throughput in tokens per second.

        Raises:
            RuntimeError: If ``load_model`` has not been called.
        """
        if not self.model_loaded:
            raise RuntimeError("Model not loaded")

        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot go negative or jump if the system clock changes.
        start = time.perf_counter()
        generated = self._run_inference(prompt, max_new_tokens)
        latency_ms = (time.perf_counter() - start) * 1000

        # Token count is approximated by splitting on whitespace.
        tokens = len(generated.split())
        return InferenceResult(
            text=generated,
            tokens_generated=tokens,
            latency_ms=latency_ms,
            memory_used_mb=50.0,  # fixed placeholder, not a measurement
            # Floor the elapsed time at 1 ms to avoid dividing by ~0.
            tokens_per_second=tokens / max(latency_ms / 1000, 0.001)
        )

    def _run_inference(self, prompt: str, max_tokens: int) -> str:
        """Return a canned response built from the first 50 prompt chars."""
        return f"Generated response for: {prompt[:50]}..."

    def benchmark(self, prompts: List[str], iterations: int = 10) -> Dict:
        """Run every prompt ``iterations`` times and aggregate the timings.

        Args:
            prompts: Prompts to benchmark.
            iterations: Number of runs per prompt.

        Returns:
            A dict with ``"avg_latency_ms"``, ``"p95_latency_ms"`` and
            ``"avg_tps"``.

        Raises:
            ValueError: If no samples were collected (no prompts, or
                ``iterations`` less than 1).
            RuntimeError: If the model is not loaded (from ``generate``).
        """
        latencies = []
        tps_values = []
        for prompt in prompts:
            for _ in range(iterations):
                result = self.generate(prompt)
                latencies.append(result.latency_ms)
                tps_values.append(result.tokens_per_second)

        # Without samples the averages below would divide by zero.
        if not latencies:
            raise ValueError(
                "benchmark requires at least one prompt and iterations >= 1"
            )

        sample_count = len(latencies)
        return {
            "avg_latency_ms": sum(latencies) / sample_count,
            # int(0.95 * n) is always < n, so the index is in range.
            "p95_latency_ms": sorted(latencies)[int(0.95 * sample_count)],
            "avg_tps": sum(tps_values) / sample_count
        }
Key Formulas
Edge Memory Budget
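$$M_{\text{model}} \leq M_{\text{device}} \times (1 - \alpha_{\text{os}} - \beta_{\text{safety}})$$
Here,
- $M_{\text{model}}$ = Model memory requirement
- $M_{\text{device}}$ = Total device RAM
- $\alpha_{\text{os}}$ = OS and system overhead fraction
- $\beta_{\text{safety}}$ = Safety margin fraction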
On-Device Throughput
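$$\text{TPS} = \frac{N_{\text{tokens}}}{T_{\text{inference}}}$$
Here,
- $N_{\text{tokens}}$ = Number of tokens generated
- $T_{\text{inference}}$ = Inference time in seconds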
Platform Comparison
| Platform | Max Model Size | RAM | Quantization | Runtime |
|---|---|---|---|---|
| iPhone 15 | 150MB | 6GB | INT8, FP16 | CoreML |
| Android Flagship | 200MB | 8GB | INT4, INT8, FP16 | TFLite |
| Raspberry Pi 5 | 50MB | 8GB | INT4, INT8 | ONNX Runtime |
| ESP32 | 2MB | 520KB | INT4 | TensorFlow Lite Micro |
| Browser | 50MB | Variable | FP16 | ONNX.js, WebGPU |
Best Practices
- Profile target devices before choosing compression level
- Use INT4 quantization for memory-constrained devices
- Implement progressive loading for large models
- Cache inference results to reduce redundant computation
- Test on actual hardware, not just emulators