LLM Edge Deployment: Mobile, IoT, and Edge Inference

Edge deployment brings LLM capabilities to devices without cloud connectivity. This requires aggressive model compression, efficient runtime engines, and careful memory management for resource-constrained environments.

Edge Deployment Pipeline

Edge Inference Engine

1. Model Optimization for Edge

from dataclasses import dataclass
from typing import List, Dict
from enum import Enum

class TargetPlatform(Enum):
    """Enumeration of edge deployment targets.

    Each member's value is the platform's lowercase string identifier.
    """
    ANDROID = "android"
    IOS = "ios"
    RASPBERRY_PI = "raspberry_pi"
    ESP32 = "esp32"
    BROWSER = "browser"

@dataclass
class EdgeProfile:
    """Resource limits and supported numeric precisions for one platform."""
    platform: TargetPlatform  # platform this profile describes
    max_model_size_mb: float  # largest allowed model size, in megabytes
    max_ram_mb: float  # RAM limit for the model, in megabytes
    supports_fp16: bool  # True when FP16 is available on the platform
    supports_int8: bool  # True when INT8 quantization is available
    supports_int4: bool  # True when INT4 quantization is available

class EdgeOptimizer:
    """Build compression plans and memory estimates for edge targets.

    ``PLATFORM_PROFILES`` maps every ``TargetPlatform`` member to the
    ``EdgeProfile`` holding its size/RAM limits and supported precisions.
    """

    PLATFORM_PROFILES = {
        TargetPlatform.ANDROID: EdgeProfile(
            platform=TargetPlatform.ANDROID,
            max_model_size_mb=200, max_ram_mb=512,
            supports_fp16=True, supports_int8=True, supports_int4=True
        ),
        TargetPlatform.IOS: EdgeProfile(
            platform=TargetPlatform.IOS,
            max_model_size_mb=150, max_ram_mb=400,
            supports_fp16=True, supports_int8=True, supports_int4=False
        ),
        TargetPlatform.RASPBERRY_PI: EdgeProfile(
            platform=TargetPlatform.RASPBERRY_PI,
            max_model_size_mb=50, max_ram_mb=256,
            supports_fp16=True, supports_int8=True, supports_int4=True
        ),
        TargetPlatform.BROWSER: EdgeProfile(
            platform=TargetPlatform.BROWSER,
            max_model_size_mb=50, max_ram_mb=200,
            supports_fp16=True, supports_int8=False, supports_int4=False
        ),
        # ESP32 is a declared TargetPlatform, so it needs a profile to avoid
        # a KeyError. Limits follow the article's platform comparison table:
        # 2 MB model, 520 KB RAM, INT4 only.
        TargetPlatform.ESP32: EdgeProfile(
            platform=TargetPlatform.ESP32,
            max_model_size_mb=2, max_ram_mb=520 / 1024,
            supports_fp16=False, supports_int8=False, supports_int4=True
        ),
    }

    def get_optimization_plan(self, platform: TargetPlatform,
                               model_size_mb: float) -> Dict:
        """Return the ordered optimization steps for a model on a platform.

        Args:
            platform: Target platform; must be a key of PLATFORM_PROFILES.
            model_size_mb: Current model size in megabytes.

        Returns:
            A dict with ``"target"`` (the platform's string value) and
            ``"steps"``, a list of step dicts each carrying a ``"type"`` key.
            Quantize/prune steps are added only when the model exceeds the
            platform's size limit; ``graph_optimize`` is always the last step.

        Raises:
            KeyError: If ``platform`` has no entry in PLATFORM_PROFILES.
        """
        profile = self.PLATFORM_PROFILES[platform]
        steps = []
        plan = {"target": platform.value, "steps": steps}

        if model_size_mb > profile.max_model_size_mb:
            reduction_needed = model_size_mb / profile.max_model_size_mb
            # INT4 is chosen only when more than a 4x reduction is needed;
            # presumably because INT8 yields at most ~4x over FP32
            # (4 bytes vs 1 byte per parameter, as in estimate_memory).
            if profile.supports_int4 and reduction_needed > 4:
                bits = 4
            elif profile.supports_int8:
                bits = 8
            elif profile.supports_int4:
                # INT8 unavailable: fall back to the only supported
                # quantization rather than skipping quantization entirely.
                bits = 4
            else:
                bits = None
            if bits is not None:
                steps.append({"type": "quantize", "bits": bits})
            steps.append({"type": "prune", "sparsity": 0.3})

        if profile.supports_fp16:
            steps.append({"type": "convert_fp16"})
        steps.append({"type": "graph_optimize"})
        return plan

    def estimate_memory(self, params: int, platform: TargetPlatform) -> Dict:
        """Estimate the weight footprint of a model at several precisions.

        Args:
            params: Number of model parameters.
            platform: Target platform; must be a key of PLATFORM_PROFILES.

        Returns:
            A dict with the footprint in GiB (rounded to 3 decimals) under
            ``"fp32_gb"``, ``"fp16_gb"``, ``"int8_gb"`` and ``"int4_gb"``,
            plus ``"fits_device"``: whether the *unrounded FP16* footprint
            is within the platform's ``max_ram_mb``. The check uses FP16
            regardless of which precisions the platform supports.

        Raises:
            KeyError: If ``platform`` has no entry in PLATFORM_PROFILES.
        """
        profile = self.PLATFORM_PROFILES[platform]
        bytes_per_gib = 1024**3
        fp32_gb = params * 4 / bytes_per_gib
        fp16_gb = params * 2 / bytes_per_gib
        int8_gb = params * 1 / bytes_per_gib
        int4_gb = params * 0.5 / bytes_per_gib
        return {
            "fp32_gb": round(fp32_gb, 3),
            "fp16_gb": round(fp16_gb, 3),
            "int8_gb": round(int8_gb, 3),
            "int4_gb": round(int4_gb, 3),
            # max_ram_mb is in MB; divide by 1024 to compare in GB.
            "fits_device": fp16_gb <= profile.max_ram_mb / 1024
        }

2. On-Device Inference Manager

import time
from dataclasses import dataclass
from typing import Optional, Dict

@dataclass
class InferenceResult:
    """Output text and performance metrics from one generation call."""
    text: str  # generated text
    tokens_generated: int  # number of tokens counted in the generated text
    latency_ms: float  # elapsed generation time, in milliseconds
    memory_used_mb: float  # memory used during generation, in megabytes
    tokens_per_second: float  # generation throughput

class EdgeInferenceEngine:
    """On-device inference wrapper with simple latency benchmarking.

    The model must be loaded with ``load_model`` before ``generate`` or
    ``benchmark`` is called. ``_run_inference`` is a stub that returns a
    canned string; real runtimes are expected to replace it.
    """

    def __init__(self, model_path: str, max_tokens: int = 256):
        """Store the model location and token limit; nothing is loaded yet.

        Args:
            model_path: Path of the model to serve.
            max_tokens: Token limit stored on the engine (not consulted by
                the methods in this class).
        """
        self.model_path = model_path
        self.max_tokens = max_tokens
        self.model_loaded = False

    def load_model(self) -> bool:
        """Mark the model as loaded and return True."""
        self.model_loaded = True
        return True

    def generate(self, prompt: str, max_new_tokens: int = 100) -> InferenceResult:
        """Run one generation and report its text and timing metrics.

        Args:
            prompt: Input text.
            max_new_tokens: Generation budget forwarded to the runtime.

        Returns:
            An InferenceResult. Tokens are counted by whitespace splitting
            the generated text, and ``memory_used_mb`` is a fixed 50.0.

        Raises:
            RuntimeError: If ``load_model`` has not been called.
        """
        if not self.model_loaded:
            raise RuntimeError("Model not loaded")
        # perf_counter is monotonic and high-resolution, so short runs are
        # not distorted by system clock adjustments.
        start = time.perf_counter()
        generated = self._run_inference(prompt, max_new_tokens)
        latency = (time.perf_counter() - start) * 1000
        tokens = len(generated.split())
        return InferenceResult(
            text=generated,
            tokens_generated=tokens,
            latency_ms=latency,
            memory_used_mb=50.0,
            # Floor the elapsed time at 1 ms to avoid dividing by ~zero.
            tokens_per_second=tokens / max(latency / 1000, 0.001)
        )

    def _run_inference(self, prompt: str, max_tokens: int) -> str:
        """Return a placeholder response built from the first 50 characters."""
        return f"Generated response for: {prompt[:50]}..."

    def benchmark(self, prompts: List[str], iterations: int = 10) -> Dict:
        """Time ``generate`` over every prompt, ``iterations`` times each.

        Args:
            prompts: Prompts to run; must be non-empty.
            iterations: Runs per prompt; must be at least 1.

        Returns:
            A dict with ``"avg_latency_ms"``, ``"p95_latency_ms"``
            (nearest-rank 95th percentile) and ``"avg_tps"``.

        Raises:
            ValueError: If ``prompts`` is empty or ``iterations`` < 1.
            RuntimeError: If the model has not been loaded.
        """
        # Fail before doing any work instead of hitting a ZeroDivisionError
        # on the averages below.
        if not prompts or iterations < 1:
            raise ValueError(
                "benchmark requires at least one prompt and iterations >= 1"
            )

        latencies = []
        tps_values = []
        for prompt in prompts:
            for _ in range(iterations):
                result = self.generate(prompt)
                latencies.append(result.latency_ms)
                tps_values.append(result.tokens_per_second)

        count = len(latencies)
        # Nearest-rank percentile: rank = ceil(0.95 * count), computed with
        # integer arithmetic to avoid float rounding.
        p95_rank = -(-95 * count // 100)
        return {
            "avg_latency_ms": sum(latencies) / count,
            "p95_latency_ms": sorted(latencies)[p95_rank - 1],
            "avg_tps": sum(tps_values) / len(tps_values)
        }

Key Formulas

Edge Memory Budget

$$M_{model} \leq M_{device} \times (1 - M_{os}) \times (1 - M_{reserved})$$

Here,

$M_{model}$ = Model memory requirement
$M_{device}$ = Total device RAM
$M_{os}$ = OS and system overhead fraction
$M_{reserved}$ = Safety margin fraction

On-Device Throughput

$$\text{TPS} = \frac{N_{tokens}}{T_{inference}}$$

Here,

$N_{tokens}$ = Number of tokens generated
$T_{inference}$ = Inference time in seconds

Platform Comparison

Platform	Max Model Size	RAM	Quantization	Runtime
iPhone 15	150MB	6GB	INT8, FP16	CoreML
Android Flagship	200MB	8GB	INT4, INT8, FP16	TFLite
Raspberry Pi 5	50MB	8GB	INT4, INT8	ONNX Runtime
ESP32	2MB	520KB	INT4	TensorFlow Lite Micro
Browser	50MB	Variable	FP16	ONNX.js, WebGPU

Best Practices

Profile target devices before choosing compression level
Use INT4 quantization for memory-constrained devices
Implement progressive loading for large models
Cache inference results to reduce redundant computation
Test on actual hardware, not just emulators

LLM Edge Deployment: Mobile, IoT, and Edge Inference

LLM Edge Deployment: Mobile, IoT, and Edge Inference

Edge Deployment Pipeline

Edge Inference Engine

1. Model Optimization for Edge

2. On-Device Inference Manager

Key Formulas

Edge Memory Budget

On-Device Throughput

Platform Comparison

Best Practices

Premium Content

Need Expert AI Ops & LLM Ops Help?