Deploying NLP Models

Deploying NLP models to production requires optimization for latency, throughput, and resource efficiency while maintaining model quality.

Deployment Pipeline

Model Export Formats

Format	Framework	Runtime	Speed	Compatibility
ONNX	Any (via export)	ONNX Runtime	Fast	Universal
TorchScript	PyTorch	LibTorch	Fast	PyTorch ecosystem
SavedModel	TensorFlow	TF Serving	Fast	TF ecosystem
TensorRT	NVIDIA	TensorRT	Fastest	NVIDIA GPUs
SafeTensors	Any	Hugging Face	Fast	HF ecosystem

ONNX Export and Optimization

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

def export_to_onnx(model_name, output_path, opset_version=14):
    """Trace a Hugging Face sequence classifier and write it out as ONNX.

    Args:
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        output_path: Destination of the exported ``.onnx`` file.
        opset_version: ONNX opset used for the export.

    The exported graph takes ``input_ids`` and ``attention_mask`` and
    produces ``logits``; batch size is dynamic on all three and sequence
    length is dynamic on the two inputs.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model.eval()  # inference mode before tracing

    # A single padded sentence is enough to drive the tracer.
    encoded = tokenizer(
        "This is a sample input for tracing.",
        truncation=True,
        max_length=128,
        padding="max_length",
        return_tensors="pt",
    )

    input_names = ["input_ids", "attention_mask"]

    # Both inputs share the same dynamic axes; the output only varies in batch.
    dynamic_axes = {
        name: {0: "batch_size", 1: "sequence_length"} for name in input_names
    }
    dynamic_axes["logits"] = {0: "batch_size"}

    torch.onnx.export(
        model,
        tuple(encoded[name] for name in input_names),
        output_path,
        opset_version=opset_version,
        input_names=input_names,
        output_names=["logits"],
        dynamic_axes=dynamic_axes,
    )
    print(f"Model exported to {output_path}")

# Export: writes the ONNX graph for "bert-base-uncased" to model.onnx
# (relative to the current working directory).
export_to_onnx("bert-base-uncased", "model.onnx")

# Optimize with onnxruntime
from onnxruntime import SessionOptions, InferenceSession
from onnxruntime.transformers import optimizer

# Run the transformer graph optimizer on the file exported above.
# NOTE(review): num_heads / hidden_size presumably describe the bert-base
# architecture; confirm and update them if a different checkpoint is exported.
optimized_model = optimizer.optimize_model(
    "model.onnx",
    model_type="bert",
    num_heads=12,
    hidden_size=768,
)
# Save the optimized graph to a separate file; model.onnx is left in place.
optimized_model.save_model_to_file("model_optimized.onnx")

Quantization

Quantization reduces model precision from FP32 to INT8 or INT4 for faster inference and a smaller model size.

Definition: Quantization Error

For a quantization function $Q(x)$ mapping from FP32 to INT8:

$$\text{Quantization Error} = \|x - Q(x)\|_2$$

The scale factor $s$ and zero point $z$ are computed as:

$$s = \frac{x_{\max} - x_{\min}}{2^b - 1}, \quad z = \text{round}\left(\frac{-x_{\min}}{s}\right)$$

where $b$ is the bit width (8 for INT8).

import torch
from torch.quantization import quantize_dynamic, QConfig

def quantize_model_dynamic(model, dtype=torch.qint8):
    """Return a dynamically quantized version of ``model``.

    Only ``torch.nn.Linear`` and ``torch.nn.LSTM`` submodules are targeted;
    every other layer is left as it is.

    Args:
        model: The PyTorch module to quantize.
        dtype: Quantized dtype handed to ``quantize_dynamic``.

    Returns:
        The module returned by ``quantize_dynamic``.
    """
    target_layers = {torch.nn.Linear, torch.nn.LSTM}
    return quantize_dynamic(model, target_layers, dtype=dtype)

def quantize_model_static(model, calibration_data):
    """Apply eager-mode static (post-training) quantization with calibration.

    The model is modified in place: observers are inserted, the calibration
    batches are run through it so the observers can record ranges, and the
    observed modules are then converted to their quantized counterparts.

    Args:
        model: Module to quantize. Eager-mode static quantization expects the
            float/quantized boundaries to be marked in the model (e.g. with
            ``QuantStub`` / ``DeQuantStub``).
        calibration_data: Iterable of dicts; each dict is unpacked as keyword
            arguments into ``model(**batch)``.

    Returns:
        The same ``model`` object, now quantized.
    """
    model.eval()
    # QConfig takes observer factories: a histogram observer for activations
    # and a per-channel observer for weights.
    model.qconfig = QConfig(
        activation=torch.quantization.default_histogram_observer,
        weight=torch.quantization.default_per_channel_weight_observer,
    )

    torch.quantization.prepare(model, inplace=True)

    # Calibration step: forward passes only, so no gradients are needed.
    with torch.no_grad():
        for batch in calibration_data:
            model(**batch)

    torch.quantization.convert(model, inplace=True)
    return model

# Hugging Face optimum quantization
from optimum.onnxruntime import ORTQuantizer, ORTModelForSequenceClassification

def quantize_hf_model(model_name, output_dir, quantization_config=None):
    """Quantize using Hugging Face Optimum.

    The checkpoint is exported to ONNX on load and then quantized with
    ONNX Runtime; the result is written to ``output_dir``.

    Args:
        model_name: Checkpoint name or path of a sequence-classification model.
        output_dir: Directory the quantized model is saved to.
        quantization_config: Optional Optimum quantization config. When
            omitted, a dynamic INT8 configuration targeting AVX512-VNNI CPUs
            is used.
    """
    # Imported here so the snippet is self-contained; the package is the same
    # one that provides ORTQuantizer.
    from optimum.onnxruntime.configuration import AutoQuantizationConfig

    if quantization_config is None:
        # Dynamic quantization needs no calibration dataset.
        quantization_config = AutoQuantizationConfig.avx512_vnni(
            is_static=False,
            per_channel=False,
        )

    model = ORTModelForSequenceClassification.from_pretrained(model_name, export=True)
    quantizer = ORTQuantizer.from_pretrained(model)

    quantizer.quantize(
        save_dir=output_dir,
        quantization_config=quantization_config,
    )

# Performance comparison
import time

def benchmark_inference(model, tokenizer, input_text, num_runs=100):
    """Benchmark inference latency.

    Args:
        model: Callable invoked as ``model(**inputs)``.
        tokenizer: Callable that turns ``input_text`` into the keyword
            arguments for ``model``.
        input_text: Text (or batch of texts) to tokenize once and reuse.
        num_runs: Number of timed forward passes; must be at least 1.

    Returns:
        Dict with ``latency_ms`` (mean milliseconds per call) and
        ``throughput`` (calls per second).

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

    # Warmup and timed runs both execute without autograd so that the warmup
    # exercises the same code path that is being measured.
    with torch.no_grad():
        # Warmup
        for _ in range(10):
            model(**inputs)

        # Benchmark: perf_counter is monotonic and high-resolution, unlike
        # time.time(), which can jump with wall-clock adjustments.
        start = time.perf_counter()
        for _ in range(num_runs):
            model(**inputs)
        elapsed = (time.perf_counter() - start) / num_runs

    return {
        "latency_ms": elapsed * 1000,
        # Guard against a zero reading on coarse clocks.
        "throughput": 1 / elapsed if elapsed > 0 else float("inf"),
    }

# Results (approximate):
# FP32: 45ms latency, 22 req/s
# INT8: 18ms latency, 55 req/s (2.5x speedup)
# INT4: 12ms latency, 83 req/s (3.8x speedup)

TensorRT Optimization

import tensorrt as trt

def build_trt_engine(onnx_path, engine_path, max_batch_size=32, fp16=True):
    """Build a TensorRT engine from ONNX model.

    Args:
        onnx_path: Path of the ONNX file to parse.
        engine_path: Path the serialized engine is written to.
        max_batch_size: Largest batch size covered by the optimization profile.
        fp16: Enable FP16 kernels when the platform reports fast FP16 support.

    Returns:
        The serialized engine as returned by ``build_serialized_network``.
        It must be deserialized (``trt.Runtime.deserialize_cuda_engine``)
        before an execution context can be created from it.

    Raises:
        RuntimeError: If the ONNX model cannot be parsed or the build fails.
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    # Parse ONNX model; parse() reports failure through its return value.
    with open(onnx_path, "rb") as f:
        parsed = parser.parse(f.read())
    if not parsed:
        errors = "\n".join(
            str(parser.get_error(i)) for i in range(parser.num_errors)
        )
        raise RuntimeError(f"Failed to parse ONNX model {onnx_path}:\n{errors}")

    # Configure builder
    config = builder.create_builder_config()
    workspace_bytes = 4 * (1 << 30)  # 4GB
    if hasattr(config, "set_memory_pool_limit"):
        # Newer TensorRT releases replace max_workspace_size with pool limits.
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes)
    else:
        config.max_workspace_size = workspace_bytes

    if fp16 and builder.platform_has_fast_fp16:
        config.set_flag(trt.BuilderFlag.FP16)

    # Dynamic shapes for variable batch size: (min, opt, max) per input.
    profile = builder.create_optimization_profile()
    profile.set_shape("input_ids", (1, 1), (16, 128), (max_batch_size, 512))
    profile.set_shape("attention_mask", (1, 1), (16, 128), (max_batch_size, 512))
    config.add_optimization_profile(profile)

    # Build engine; a failed build is signalled by None rather than an exception.
    engine = builder.build_serialized_network(network, config)
    if engine is None:
        raise RuntimeError(f"TensorRT failed to build an engine from {onnx_path}")

    with open(engine_path, "wb") as f:
        f.write(engine)

    return engine

def infer_trt(engine, input_ids, attention_mask):
    """Run inference with TensorRT engine.

    Args:
        engine: A deserialized TensorRT engine exposing the binding-index API
            (``get_binding_index`` / ``num_bindings``), i.e. not the
            serialized bytes returned by ``build_serialized_network``.
        input_ids: Token ids as a NumPy array or torch tensor.
        attention_mask: Attention mask with the same shape as ``input_ids``.

    Returns:
        Dict mapping each output binding name to a NumPy array with the
        results copied back to host memory.

    Raises:
        RuntimeError: If a binding shape is rejected or execution fails.
    """
    import numpy as np  # local import: the snippet only imports tensorrt

    context = engine.create_execution_context()
    stream = torch.cuda.current_stream()

    inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
    # One device pointer per binding, placed at the binding's own index.
    bindings = [0] * engine.num_bindings
    device_buffers = {}  # keeps the CUDA tensors alive while the engine runs

    # Copy inputs to the GPU in the dtype the engine expects and tell the
    # context the concrete shapes (the engine is built with dynamic shapes).
    for name, value in inputs.items():
        idx = engine.get_binding_index(name)
        dtype = trt.nptype(engine.get_binding_dtype(idx))
        if isinstance(value, torch.Tensor):
            value = value.detach().cpu().numpy()
        host = np.ascontiguousarray(value, dtype=dtype)
        device = torch.from_numpy(host).cuda()
        if not context.set_binding_shape(idx, tuple(device.shape)):
            raise RuntimeError(f"Shape {tuple(device.shape)} rejected for input '{name}'")
        device_buffers[name] = device
        bindings[idx] = device.data_ptr()

    # Allocate device buffers for every output; output shapes are only known
    # once all input shapes have been set on the context.
    output_names = []
    for idx in range(engine.num_bindings):
        if engine.binding_is_input(idx):
            continue
        name = engine.get_binding_name(idx)
        shape = tuple(context.get_binding_shape(idx))
        dtype = trt.nptype(engine.get_binding_dtype(idx))
        device = torch.from_numpy(np.empty(shape, dtype=dtype)).cuda()
        device_buffers[name] = device
        bindings[idx] = device.data_ptr()
        output_names.append(name)

    # Run inference, then wait for the stream before reading results.
    if not context.execute_async_v2(bindings, stream.cuda_stream):
        raise RuntimeError("TensorRT execution failed")
    stream.synchronize()

    return {name: device_buffers[name].cpu().numpy() for name in output_names}

Serving Infrastructure

Solution	Type	Features	Best For
vLLM	LLM serving	PagedAttention, batching	Large LLMs
TGI	LLM serving	Continuous batching	Production LLMs
Triton	Multi-model	Dynamic batching, ensemble	Multi-model
FastAPI	Custom	Flexibility, simplicity	Custom logic
Ray Serve	Distributed	Scaling, composition	Complex pipelines

# vLLM serving example
from vllm import LLM, SamplingParams

def setup_vllm_server(model_name="meta-llama/Llama-2-7b-hf"):
    """Configure vLLM for high-throughput serving.

    Args:
        model_name: Hugging Face model id or local path to load.

    Returns:
        The constructed ``LLM`` engine object.
    """
    llm = LLM(
        model=model_name,
        tensor_parallel_size=1,       # Number of GPUs
        max_model_len=4096,           # Maximum sequence length
        gpu_memory_utilization=0.9,   # GPU memory fraction
        max_num_seqs=8,               # Maximum sequences batched per iteration
        # NOTE(security): this allows code shipped in the model repository to
        # run locally; only use it with model sources you trust.
        trust_remote_code=True,
    )
    return llm

def generate_responses(llm, prompts, max_tokens=256, temperature=0.7):
    """Run a batch of prompts through vLLM and summarize each completion.

    Args:
        llm: Engine object exposing ``generate(prompts, sampling_params)``.
        prompts: Prompts to complete.
        max_tokens: Upper bound on generated tokens per prompt.
        temperature: Sampling temperature.

    Returns:
        A list with one dict per request output, holding the ``prompt``, the
        text of the first completion (``response``) and its token count
        (``tokens``).
    """
    params = SamplingParams(
        temperature=temperature,
        max_tokens=max_tokens,
        frequency_penalty=0.1,
        top_p=0.9,
    )

    def _summarize(request_output):
        # Only the first candidate completion is reported.
        first = request_output.outputs[0]
        return {
            "prompt": request_output.prompt,
            "response": first.text,
            "tokens": len(first.token_ids),
        }

    return [_summarize(item) for item in llm.generate(prompts, params)]

# Build the vLLM engine with the default model, then generate one response
# per prompt using the default sampling settings of generate_responses().
llm = setup_vllm_server()
prompts = [
    "Explain quantum computing in simple terms.",
    "Write a Python function to sort a list.",
    "What are the benefits of exercise?",
]
# results is a list of dicts with "prompt", "response" and "tokens" keys.
results = generate_responses(llm, prompts)

Performance Comparison

Optimization	Latency Reduction	Throughput Gain	Accuracy Loss	Model Size
FP16	2x	2x	Negligible	50%
INT8	3x	3x	< 0.5%	25%
INT4	4x	4x	< 1%	12.5%
Pruning (50%)	2x	2x	< 1%	50%
Distillation	10x	10x	2-5%	10-30%

Key Takeaways

ONNX provides framework-agnostic optimization and broad runtime support
Quantization offers significant speedup with minimal accuracy loss
TensorRT achieves the best performance on NVIDIA hardware
vLLM is the preferred serving solution for large language models
Always benchmark on representative production workloads before deployment

Deploying NLP Models

Deploying NLP Models

Deployment Pipeline

Model Export Formats

ONNX Export and Optimization

Quantization

Definition: Quantization Error

TensorRT Optimization

Serving Infrastructure

Performance Comparison

Key Takeaways

Premium Content

Need Expert NLP Help?