🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services · ℹ️ About · ✉️ Contact · View Pricing Plans from $10

Deploying NLP Models

Production NLP › Model Deployment and Optimization · 🟢 Free Lesson

Advertisement

Deploying NLP Models

Deploying NLP models to production requires optimization for latency, throughput, and resource efficiency while maintaining model quality.

Deployment Pipeline


Model Export Formats

| Format | Framework | Runtime | Speed | Compatibility |
|---|---|---|---|---|
| ONNX | Any (via export) | ONNX Runtime | Fast | Universal |
| TorchScript | PyTorch | LibTorch | Fast | PyTorch ecosystem |
| SavedModel | TensorFlow | TF Serving | Fast | TF ecosystem |
| TensorRT | NVIDIA | TensorRT | Fastest | NVIDIA GPUs |
| SafeTensors | Any | Hugging Face | Fast | HF ecosystem |

ONNX Export and Optimization

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

def export_to_onnx(model_name, output_path, opset_version=14):
    """Trace a Hugging Face sequence-classification model and write it as ONNX.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            model and its tokenizer.
        output_path: Destination passed to ``torch.onnx.export``.
        opset_version: ONNX opset to target (defaults to 14).

    The exported graph takes ``input_ids`` and ``attention_mask`` and produces
    ``logits``. Batch size is marked dynamic on all three; sequence length is
    marked dynamic on the two inputs.
    """
    net = AutoModelForSequenceClassification.from_pretrained(model_name)
    tok = AutoTokenizer.from_pretrained(model_name)
    # Switch to inference mode before tracing.
    net.eval()

    # A fixed-length (128 token) sample used only to drive the trace.
    sample = tok(
        "This is a sample input for tracing.",
        return_tensors="pt",
        padding="max_length",
        max_length=128,
        truncation=True,
    )
    trace_args = (sample["input_ids"], sample["attention_mask"])

    # Both inputs share the same dynamic axes; each gets its own dict copy.
    batch_and_seq = {0: "batch_size", 1: "sequence_length"}
    axes = {
        "input_ids": dict(batch_and_seq),
        "attention_mask": dict(batch_and_seq),
        "logits": {0: "batch_size"},
    }

    torch.onnx.export(
        net,
        trace_args,
        output_path,
        opset_version=opset_version,
        input_names=["input_ids", "attention_mask"],
        output_names=["logits"],
        dynamic_axes=axes,
    )
    print(f"Model exported to {output_path}")

# Export: run export_to_onnx for "bert-base-uncased", writing the graph to the
# relative path "model.onnx".
export_to_onnx("bert-base-uncased", "model.onnx")

# Optimize with onnxruntime
# NOTE(review): SessionOptions and InferenceSession are imported here but are
# not used anywhere in this snippet.
from onnxruntime import SessionOptions, InferenceSession
from onnxruntime.transformers import optimizer

# Run onnxruntime's transformer optimizer over the file exported above.
# num_heads / hidden_size presumably mirror the bert-base-uncased
# architecture — confirm them if a different checkpoint is exported.
optimized_model = optimizer.optimize_model(
    "model.onnx",
    model_type="bert",
    num_heads=12,
    hidden_size=768,
)
# Write the optimized graph next to the original rather than overwriting it.
optimized_model.save_model_to_file("model_optimized.onnx")

Quantization

Quantization reduces model precision from FP32 to INT8 or INT4 for faster inference.

Definition: Quantization Error

For a quantization function $Q(x)$ mapping from FP32 to INT8:

$$\text{Quantization Error} = \|x - Q(x)\|_2$$

The scale factor $s$ and zero point $z$ are computed as:

$$s = \frac{x_{\max} - x_{\min}}{2^b - 1}, \quad z = \text{round}\left(\frac{-x_{\min}}{s}\right)$$

where $b$ is the bit width (8 for INT8).

import torch
from torch.quantization import quantize_dynamic, QConfig

def quantize_model_dynamic(model, dtype=torch.qint8):
    """Return a dynamically quantized version of ``model``.

    Only ``torch.nn.Linear`` and ``torch.nn.LSTM`` submodules are targeted.

    Args:
        model: The PyTorch module to quantize.
        dtype: Quantized dtype handed to ``quantize_dynamic``
            (``torch.qint8`` by default).

    Returns:
        Whatever ``quantize_dynamic`` returns for the given model.
    """
    target_layers = {torch.nn.Linear, torch.nn.LSTM}
    return quantize_dynamic(model, qconfig_spec=target_layers, dtype=dtype)

def quantize_model_static(model, calibration_data):
    """Apply post-training static quantization, calibrating on sample batches.

    The model is modified in place: it is switched to eval mode, given a
    qconfig, prepared with observers, run over ``calibration_data`` so the
    observers can record statistics, and finally converted.

    Args:
        model: The PyTorch module to quantize. It is mutated in place.
            NOTE(review): eager-mode static quantization generally expects
            the model to mark its quantized region with ``QuantStub`` /
            ``DeQuantStub`` — confirm for the model being passed.
        calibration_data: Iterable of dicts; each dict is unpacked as keyword
            arguments to ``model(**batch)``.

    Returns:
        The same ``model`` object, converted.
    """
    model.eval()
    # The original referenced ``default_per_channel_wqconfig`` and
    # ``default_histogramobserver``, neither of which exists in
    # ``torch.quantization``; these are the real observer factories for
    # per-channel weights and histogram-based activations.
    model.qconfig = QConfig(
        activation=torch.quantization.default_histogram_observer,
        weight=torch.quantization.default_per_channel_weight_observer,
    )

    torch.quantization.prepare(model, inplace=True)

    # Calibration step: forward passes only, so gradients are not needed.
    with torch.no_grad():
        for batch in calibration_data:
            model(**batch)

    torch.quantization.convert(model, inplace=True)
    return model

# Hugging Face optimum quantization
from optimum.onnxruntime import ORTQuantizer, ORTModelForSequenceClassification

def quantize_hf_model(model_name, output_dir, quantization_config=None):
    """Quantize a sequence-classification model using Hugging Face Optimum.

    The model is loaded through ``ORTModelForSequenceClassification`` with
    ``export=True`` and then quantized by ``ORTQuantizer``, which writes its
    output under ``output_dir``.

    Args:
        model_name: Identifier passed to ``from_pretrained``.
        output_dir: Directory passed to the quantizer as ``save_dir``.
        quantization_config: Optional Optimum quantization config. When
            omitted, a dynamic (``is_static=False``), per-tensor AVX512-VNNI
            config is built. The original code referenced an undefined
            ``quantize_config`` name here and failed with ``NameError``.
    """
    if quantization_config is None:
        # Imported locally so the block is self-contained; optimum is already
        # a dependency of this file.
        from optimum.onnxruntime.configuration import AutoQuantizationConfig

        quantization_config = AutoQuantizationConfig.avx512_vnni(
            is_static=False,
            per_channel=False,
        )

    model = ORTModelForSequenceClassification.from_pretrained(model_name, export=True)
    quantizer = ORTQuantizer.from_pretrained(model)

    quantizer.quantize(
        save_dir=output_dir,
        quantization_config=quantization_config,
    )

# Performance comparison
import time

def benchmark_inference(model, tokenizer, input_text, num_runs=100, num_warmup=10):
    """Benchmark average wall-clock inference latency for a single input.

    Args:
        model: Callable invoked as ``model(**inputs)``.
        tokenizer: Callable invoked as ``tokenizer(input_text,
            return_tensors="pt", padding=True, truncation=True)``; its result
            is unpacked into the model call.
        input_text: Text handed to the tokenizer once, before timing starts.
        num_runs: Number of timed forward passes; must be at least 1.
        num_warmup: Number of untimed forward passes run first.

    Returns:
        A dict with ``"latency_ms"`` (mean milliseconds per call) and
        ``"throughput"`` (calls per second).

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.

    NOTE(review): only host-side wall-clock time is measured; no device
    synchronization is performed here — confirm that is acceptable for
    models running on an accelerator.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

    # Both warmup and timed runs happen with autograd disabled so the warmup
    # exercises the same code path that is being measured.
    with torch.no_grad():
        for _ in range(num_warmup):
            model(**inputs)

        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        for _ in range(num_runs):
            model(**inputs)
        elapsed = (time.perf_counter() - start) / num_runs

    return {
        "latency_ms": elapsed * 1000,
        # Guard against a zero reading on very coarse clocks.
        "throughput": 1 / elapsed if elapsed > 0 else float("inf"),
    }

# Results (approximate):
# FP32: 45ms latency, 22 req/s
# INT8: 18ms latency, 55 req/s (2.5x speedup)
# INT4: 12ms latency, 83 req/s (3.8x speedup)

TensorRT Optimization

import tensorrt as trt

def build_trt_engine(onnx_path, engine_path, max_batch_size=32, fp16=True):
    """Build a serialized TensorRT engine from an ONNX model and save it.

    Args:
        onnx_path: Path of the ONNX file to parse.
        engine_path: Path the serialized engine is written to.
        max_batch_size: Largest batch dimension in the optimization profile.
        fp16: Enable the FP16 builder flag when the platform reports fast
            FP16 support.

    Returns:
        The serialized engine returned by ``build_serialized_network``. This
        is the serialized form, not an executable engine; it has to be
        deserialized before an execution context can be created from it.

    Raises:
        RuntimeError: If the ONNX model cannot be parsed or the engine build
            fails.
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    # Parse ONNX model; parse() reports failure via its return value, so
    # surface the parser's errors instead of building from a broken network.
    with open(onnx_path, "rb") as f:
        parsed_ok = parser.parse(f.read())
    if not parsed_ok:
        details = "; ".join(
            str(parser.get_error(i)) for i in range(parser.num_errors)
        )
        raise RuntimeError(f"Failed to parse ONNX model {onnx_path}: {details}")

    # Configure builder
    config = builder.create_builder_config()
    workspace_bytes = 4 * (1 << 30)  # 4GB
    if hasattr(config, "set_memory_pool_limit"):
        # Newer TensorRT releases replace max_workspace_size with this call.
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes)
    else:
        config.max_workspace_size = workspace_bytes

    if fp16 and builder.platform_has_fast_fp16:
        config.set_flag(trt.BuilderFlag.FP16)

    # Dynamic shapes for variable batch size. The "optimal" batch must not
    # exceed the maximum, otherwise the profile is invalid for small
    # max_batch_size values.
    opt_batch = min(16, max_batch_size)
    profile = builder.create_optimization_profile()
    for input_name in ("input_ids", "attention_mask"):
        profile.set_shape(
            input_name, (1, 1), (opt_batch, 128), (max_batch_size, 512)
        )
    config.add_optimization_profile(profile)

    # Build engine; a failed build yields None rather than raising.
    engine = builder.build_serialized_network(network, config)
    if engine is None:
        raise RuntimeError(f"TensorRT failed to build an engine from {onnx_path}")

    with open(engine_path, "wb") as f:
        f.write(engine)

    return engine

def infer_trt(engine, input_ids, attention_mask):
    """Run one inference pass with a TensorRT engine.

    The original version allocated uninitialized host buffers shaped like the
    inputs, never copied the input data, never allocated an output buffer,
    passed host pointers as device bindings, and referenced an unimported
    ``np``. This version stages inputs and outputs as CUDA tensors and waits
    for the stream before returning.

    Args:
        engine: A TensorRT engine object exposing ``create_execution_context``
            and the binding-index API. A serialized engine must be
            deserialized first.
        input_ids: Token ids; anything ``torch.as_tensor`` accepts.
        attention_mask: Attention mask; anything ``torch.as_tensor`` accepts.

    Returns:
        Dict mapping each output binding name to a NumPy array holding the
        result copied back to host memory.

    Raises:
        RuntimeError: If TensorRT reports that execution failed.
    """
    context = engine.create_execution_context()

    # TensorRT dtype -> torch dtype, so buffers match what the engine expects.
    torch_dtypes = {
        trt.DataType.FLOAT: torch.float32,
        trt.DataType.HALF: torch.float16,
        trt.DataType.INT32: torch.int32,
        trt.DataType.INT8: torch.int8,
        trt.DataType.BOOL: torch.bool,
    }

    inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
    # Bindings must be ordered by binding index, not by iteration order.
    bindings = [0] * engine.num_bindings
    device_inputs = []  # keeps input tensors alive until execution finishes

    for name, value in inputs.items():
        idx = engine.get_binding_index(name)
        dtype = torch_dtypes[engine.get_binding_dtype(idx)]
        tensor = torch.as_tensor(value).to(device="cuda", dtype=dtype).contiguous()
        # Dynamic-shape engines need the concrete input shape set per call.
        context.set_binding_shape(idx, tuple(tensor.shape))
        bindings[idx] = tensor.data_ptr()
        device_inputs.append(tensor)

    # Allocate device buffers for every non-input binding now that the
    # output shapes can be resolved from the input shapes.
    device_outputs = {}
    for idx in range(engine.num_bindings):
        if engine.binding_is_input(idx):
            continue
        dtype = torch_dtypes[engine.get_binding_dtype(idx)]
        shape = tuple(context.get_binding_shape(idx))
        buffer = torch.empty(shape, dtype=dtype, device="cuda")
        bindings[idx] = buffer.data_ptr()
        device_outputs[engine.get_binding_name(idx)] = buffer

    # Run inference on torch's current stream and wait for it to finish
    # before reading results back.
    stream = torch.cuda.current_stream()
    if not context.execute_async_v2(bindings, stream.cuda_stream):
        raise RuntimeError("TensorRT execution failed")
    stream.synchronize()

    return {name: buf.cpu().numpy() for name, buf in device_outputs.items()}

Serving Infrastructure

| Solution | Type | Features | Best For |
|---|---|---|---|
| vLLM | LLM serving | PagedAttention, batching | Large LLMs |
| TGI | LLM serving | Continuous batching | Production LLMs |
| Triton | Multi-model | Dynamic batching, ensemble | Multi-model |
| FastAPI | Custom | Flexibility, simplicity | Custom logic |
| Ray Serve | Distributed | Scaling, composition | Complex pipelines |
# vLLM serving example
from vllm import LLM, SamplingParams

def setup_vllm_server(model_name="meta-llama/Llama-2-7b-hf"):
    """Create a vLLM ``LLM`` engine configured for high-throughput generation.

    Args:
        model_name: Model identifier passed to ``LLM`` as ``model``.

    Returns:
        The constructed ``LLM`` instance.
    """
    llm = LLM(
        model=model_name,
        tensor_parallel_size=1,       # Number of GPUs
        max_model_len=4096,           # Maximum sequence length
        gpu_memory_utilization=0.9,   # GPU memory fraction
        # vLLM has no ``batch_size`` engine argument; the cap on concurrently
        # scheduled sequences is ``max_num_seqs``.
        max_num_seqs=8,               # Maximum batch size
        # SECURITY NOTE(review): this allows code shipped in the model
        # repository to execute locally — confirm it is required for the
        # models served here and disable it otherwise.
        trust_remote_code=True,
    )
    return llm

def generate_responses(llm, prompts, max_tokens=256, temperature=0.7):
    """Run ``prompts`` through a vLLM engine and summarize each completion.

    Args:
        llm: Object exposing ``generate(prompts, sampling_params)``.
        prompts: Prompts forwarded unchanged to ``llm.generate``.
        max_tokens: Generation length limit given to ``SamplingParams``.
        temperature: Sampling temperature given to ``SamplingParams``.

    Returns:
        A list with one dict per generation output, holding the ``"prompt"``,
        the text of its first candidate under ``"response"``, and that
        candidate's token count under ``"tokens"``.
    """
    params = SamplingParams(
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.9,
        frequency_penalty=0.1,
    )

    # Only the first candidate of each request is reported.
    return [
        {
            "prompt": item.prompt,
            "response": item.outputs[0].text,
            "tokens": len(item.outputs[0].token_ids),
        }
        for item in llm.generate(prompts, params)
    ]

# Build the in-process vLLM engine with the default model. Despite the
# function name, nothing in this snippet starts a network server; generation
# below is a direct call on the returned object.
llm = setup_vllm_server()
# Sample prompts, submitted together in a single generate_responses call.
prompts = [
    "Explain quantum computing in simple terms.",
    "Write a Python function to sort a list.",
    "What are the benefits of exercise?",
]
# One result dict per prompt, using generate_responses' default
# max_tokens and temperature.
results = generate_responses(llm, prompts)

Performance Comparison

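| Optimization | Latency Reduction | Throughput Gain | Accuracy Loss | Model Size |
|---|---|---|---|---|
| FP16 | 2x | 2x | Negligible | 50% |
| INT8 | 3x | 3x | < 0.5% | 25% |
| INT4 | 4x | 4x | < 1% | 12.5% |
| Pruning (50%) | 2x | 2x | < 1% | 50% |
| Distillation | 10x | 10x | 2-5% | 10-30% |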

Key Takeaways

  • ONNX provides framework-agnostic optimization and broad runtime support
  • Quantization offers significant speedup with minimal accuracy loss
  • TensorRT achieves the best performance on NVIDIA hardware
  • vLLM is the preferred serving solution for large language models
  • Always benchmark on representative production workloads before deployment
⭐

Premium Content

Deploying NLP Models

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯 End-to-end Projects
💼 Interview Prep
📜 Certificates
🤝 Community Access

Already a member? Log in

Need Expert NLP Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement