Deploying NLP Models
Deploying NLP models to production requires optimization for latency, throughput, and resource efficiency while maintaining model quality.
Deployment Pipeline
Model Export Formats
| Format | Framework | Runtime | Speed | Compatibility |
|---|---|---|---|---|
| ONNX | Any (via export) | ONNX Runtime | Fast | Universal |
| TorchScript | PyTorch | LibTorch | Fast | PyTorch ecosystem |
| SavedModel | TensorFlow | TF Serving | Fast | TF ecosystem |
| TensorRT | NVIDIA | TensorRT | Fastest | NVIDIA GPUs |
| SafeTensors | Any | None (weights-only storage format; loaded by the framework) | Fast loading | HF ecosystem |
ONNX Export and Optimization
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
def export_to_onnx(model_name, output_path, opset_version=14):
    """Trace a Hugging Face sequence-classification model and write it as ONNX.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the model and its tokenizer.
        output_path: Destination handed to ``torch.onnx.export``.
        opset_version: ONNX opset to target.

    The exported graph takes ``input_ids`` and ``attention_mask`` and produces
    ``logits``. Batch size is dynamic on all three; sequence length is dynamic
    on the two inputs.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model.eval()

    # A single padded sentence is enough to trace the graph; the real shapes
    # are left open through dynamic_axes below.
    encoded = tokenizer(
        "This is a sample input for tracing.",
        return_tensors="pt",
        padding="max_length",
        max_length=128,
        truncation=True,
    )

    input_names = ["input_ids", "attention_mask"]
    trace_args = tuple(encoded[name] for name in input_names)

    dynamic_axes = {
        name: {0: "batch_size", 1: "sequence_length"} for name in input_names
    }
    dynamic_axes["logits"] = {0: "batch_size"}

    torch.onnx.export(
        model,
        trace_args,
        output_path,
        opset_version=opset_version,
        input_names=input_names,
        output_names=["logits"],
        dynamic_axes=dynamic_axes,
    )
    print(f"Model exported to {output_path}")
# Export: call export_to_onnx for the "bert-base-uncased" checkpoint, writing
# the graph to "model.onnx" (a relative path, so it lands in the working directory).
export_to_onnx("bert-base-uncased", "model.onnx")
# Optimize with onnxruntime
# NOTE: SessionOptions and InferenceSession are imported here but not used by
# the statements in this snippet.
from onnxruntime import SessionOptions, InferenceSession
from onnxruntime.transformers import optimizer
# Run the transformer optimizer over the file exported above and keep the
# result in memory; nothing is written until save_model_to_file below.
optimized_model = optimizer.optimize_model(
"model.onnx",
model_type="bert",
# NOTE(review): num_heads / hidden_size presumably have to match the exported
# checkpoint's architecture (12 / 768 for a BERT-base model) — confirm if the
# checkpoint passed to export_to_onnx changes.
num_heads=12,
hidden_size=768,
)
# Persist the optimized graph next to the original rather than overwriting it.
optimized_model.save_model_to_file("model_optimized.onnx")
Quantization
Reducing model precision from FP32 to INT8 or INT4 for faster inference.
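Definition: Quantization Error
For a quantization function $Q$ mapping a value $x$ from FP32 to INT8, with dequantized value $\hat{x}$, the quantization error $\epsilon(x)$ is:
$$Q(x) = \mathrm{clamp}\left(\mathrm{round}\left(\frac{x}{s}\right) + z,\ 0,\ 2^b - 1\right), \qquad \hat{x} = s\,\bigl(Q(x) - z\bigr), \qquad \epsilon(x) = x - \hat{x}$$
The scale factor $s$ and zero point $z$ are computed as:
$$s = \frac{x_{\max} - x_{\min}}{2^b - 1}, \qquad z = \mathrm{round}\left(-\frac{x_{\min}}{s}\right)$$
where $b$ is the bit width (8 for INT8).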
import torch
from torch.quantization import quantize_dynamic, QConfig
def quantize_model_dynamic(model, dtype=torch.qint8, module_types=None):
    """Apply post-training dynamic quantization to a PyTorch model.

    Args:
        model: The floating-point ``torch.nn.Module`` to quantize.
        dtype: Target quantized dtype for the weights (default ``torch.qint8``).
        module_types: Layer types to quantize, passed to ``quantize_dynamic``
            as its ``qconfig_spec``. ``None`` (the default) keeps the original
            behaviour of quantizing ``torch.nn.Linear`` and ``torch.nn.LSTM``.

    Returns:
        The quantized model returned by ``quantize_dynamic``. It is called
        without ``inplace=True``, so ``model`` itself is left untouched.
    """
    if module_types is None:
        # Linear and LSTM layers dominate the weight count of typical NLP
        # models, so they are the default targets.
        module_types = {torch.nn.Linear, torch.nn.LSTM}
    return quantize_dynamic(model, module_types, dtype=dtype)
def quantize_model_static(model, calibration_data):
    """Apply post-training static quantization, calibrating on sample batches.

    Args:
        model: The floating-point ``torch.nn.Module`` to quantize. It is
            switched to eval mode and modified in place.
        calibration_data: Iterable of dict batches; each one is unpacked into
            the model as keyword arguments (``model(**batch)``).

    Returns:
        The same ``model`` object, converted in place.
    """
    model.eval()
    # The observer factories are named default_per_channel_weight_observer and
    # default_histogram_observer; the previous attribute names do not exist in
    # torch.quantization and raised AttributeError.
    model.qconfig = QConfig(
        activation=torch.quantization.default_histogram_observer,
        weight=torch.quantization.default_per_channel_weight_observer,
    )
    torch.quantization.prepare(model, inplace=True)

    # Calibration step: forward passes let the inserted observers record
    # activation ranges. No gradients are needed.
    with torch.no_grad():
        for batch in calibration_data:
            model(**batch)

    torch.quantization.convert(model, inplace=True)
    return model
# Hugging Face optimum quantization
from optimum.onnxruntime import ORTQuantizer, ORTModelForSequenceClassification
def quantize_hf_model(model_name, output_dir, quantization_config=None):
    """Export a Hugging Face model to ONNX and quantize it with Optimum.

    Args:
        model_name: Checkpoint identifier; exported to ONNX on load
            (``export=True``).
        output_dir: Directory passed to the quantizer as ``save_dir``.
        quantization_config: Optional Optimum quantization config. When
            omitted, a dynamic (no calibration data) INT8 configuration is
            built with ``AutoQuantizationConfig.avx512_vnni``.
            NOTE(review): that default assumes an x86 CPU deployment target;
            pass an explicit config for other hardware.
    """
    if quantization_config is None:
        # The original body referenced an undefined name `quantize_config`
        # (NameError); build a concrete default instead. Imported locally so
        # the block does not depend on a new file-level import.
        from optimum.onnxruntime.configuration import AutoQuantizationConfig

        quantization_config = AutoQuantizationConfig.avx512_vnni(
            is_static=False,
            per_channel=False,
        )

    model = ORTModelForSequenceClassification.from_pretrained(model_name, export=True)
    quantizer = ORTQuantizer.from_pretrained(model)
    quantizer.quantize(
        save_dir=output_dir,
        quantization_config=quantization_config,
    )
# Performance comparison
import time
def benchmark_inference(model, tokenizer, input_text, num_runs=100, warmup_runs=10):
    """Benchmark the average latency of ``model`` on a single tokenized input.

    Args:
        model: Callable invoked as ``model(**inputs)``.
        tokenizer: Callable invoked as ``tokenizer(input_text,
            return_tensors="pt", padding=True, truncation=True)``; its result
            is unpacked into the model as keyword arguments.
        input_text: Text (or batch of texts) to tokenize once up front.
        num_runs: Number of timed forward passes; must be positive.
        warmup_runs: Untimed forward passes executed first (default 10).

    Returns:
        Dict with ``"latency_ms"`` (mean milliseconds per call) and
        ``"throughput"`` (calls per second).

    Raises:
        ValueError: If ``num_runs`` is not positive.
    """
    if num_runs <= 0:
        raise ValueError(f"num_runs must be positive, got {num_runs}")

    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)

    def _sync():
        # CUDA kernels run asynchronously; without a sync the timer would only
        # measure launch overhead. Skipped when CUDA is unused so that the
        # benchmark never initializes a CUDA context by itself.
        if torch.cuda.is_available() and torch.cuda.is_initialized():
            torch.cuda.synchronize()

    # Warm-up and timed runs both execute without autograd so they exercise
    # the same code path (the warm-up previously ran with gradients enabled).
    with torch.no_grad():
        for _ in range(warmup_runs):
            model(**inputs)
        _sync()

        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        for _ in range(num_runs):
            model(**inputs)
        _sync()
        elapsed = (time.perf_counter() - start) / num_runs

    return {
        "latency_ms": elapsed * 1000,
        # Guard against a zero reading on very coarse clocks.
        "throughput": 1 / elapsed if elapsed > 0 else float("inf"),
    }
# Results (approximate):
# FP32: 45ms latency, 22 req/s
# INT8: 18ms latency, 55 req/s (2.5x speedup)
# INT4: 12ms latency, 83 req/s (3.8x speedup)
TensorRT Optimization
import tensorrt as trt
def build_trt_engine(onnx_path, engine_path, max_batch_size=32, fp16=True):
    """Build a TensorRT engine from an ONNX model and save it to disk.

    Args:
        onnx_path: Path of the ONNX file to parse.
        engine_path: Path the serialized engine is written to.
        max_batch_size: Upper bound of the dynamic batch dimension in the
            optimization profile; must be at least 1.
        fp16: Enable FP16 kernels when the platform reports fast FP16 support.

    Returns:
        The *serialized* engine as returned by
        ``builder.build_serialized_network``. It must be deserialized (for
        example with ``trt.Runtime(...).deserialize_cuda_engine``) before an
        execution context can be created from it.

    Raises:
        ValueError: If ``max_batch_size`` is smaller than 1.
        RuntimeError: If the ONNX model cannot be parsed or the build fails.
    """
    if max_batch_size < 1:
        raise ValueError(f"max_batch_size must be >= 1, got {max_batch_size}")

    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)

    # Parse ONNX model. parse() signals failure through its return value, so
    # surface the parser's own error list instead of building an empty network.
    with open(onnx_path, "rb") as f:
        parsed = parser.parse(f.read())
    if not parsed:
        details = "\n".join(
            str(parser.get_error(i)) for i in range(parser.num_errors)
        )
        raise RuntimeError(f"Failed to parse ONNX model {onnx_path!r}:\n{details}")

    # Configure builder
    config = builder.create_builder_config()
    workspace_bytes = 4 * (1 << 30)  # 4GB
    if hasattr(config, "set_memory_pool_limit"):
        # Newer TensorRT releases replace max_workspace_size with memory pools.
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace_bytes)
    else:
        config.max_workspace_size = workspace_bytes
    if fp16 and builder.platform_has_fast_fp16:
        config.set_flag(trt.BuilderFlag.FP16)

    # Dynamic shapes for variable batch size. The "opt" batch must not exceed
    # the maximum, so clamp it for callers that pass max_batch_size < 16.
    opt_batch = min(16, max_batch_size)
    profile = builder.create_optimization_profile()
    for input_name in ("input_ids", "attention_mask"):
        profile.set_shape(
            input_name, (1, 1), (opt_batch, 128), (max_batch_size, 512)
        )
    config.add_optimization_profile(profile)

    # Build engine; None means the build failed (details go to the logger).
    engine = builder.build_serialized_network(network, config)
    if engine is None:
        raise RuntimeError(f"TensorRT failed to build an engine from {onnx_path!r}")
    with open(engine_path, "wb") as f:
        f.write(engine)
    return engine
def infer_trt(engine, input_ids, attention_mask):
    """Run one inference pass with a TensorRT engine.

    Uses the binding-index execution API (``get_binding_index`` /
    ``execute_async_v2``) that this function was already written against.
    NOTE(review): newer TensorRT releases replace that API with named
    tensors — confirm the installed version.

    Args:
        engine: A deserialized TensorRT engine, or the serialized engine as
            returned by ``build_trt_engine`` (it is deserialized here).
        input_ids: Token ids as a torch tensor, NumPy array or nested list.
        attention_mask: Attention mask in the same form and shape family.

    Returns:
        Dict mapping each engine *output* binding name to a NumPy array with
        the result copied back to host memory.

    Raises:
        ValueError: If an expected input binding is missing from the engine,
            or the engine has an input that this function does not supply.
        RuntimeError: If deserialization or execution fails.
    """
    # Local import: the original body used `np` without it being imported.
    import numpy as np

    if not hasattr(engine, "create_execution_context"):
        # Serialized engine (e.g. straight from build_trt_engine): deserialize.
        runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
        engine = runtime.deserialize_cuda_engine(engine)
        if engine is None:
            raise RuntimeError("Failed to deserialize TensorRT engine")

    context = engine.create_execution_context()
    stream = torch.cuda.current_stream()

    def binding_torch_dtype(binding_idx):
        # Map TensorRT dtype -> NumPy dtype -> torch dtype.
        np_dtype = trt.nptype(engine.get_binding_dtype(binding_idx))
        return torch.from_numpy(np.empty(0, dtype=np_dtype)).dtype

    inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
    bindings = [0] * engine.num_bindings
    supplied = set()
    device_inputs = []  # keep device buffers alive until execution finishes

    for name, value in inputs.items():
        idx = engine.get_binding_index(name)
        if idx < 0:
            raise ValueError(f"Engine has no input binding named {name!r}")
        # Bindings must be *device* pointers holding the actual data, cast to
        # the dtype the engine expects (which may differ from the caller's,
        # e.g. int64 token ids vs. an int32 engine input).
        tensor = (
            torch.as_tensor(value)
            .to(device="cuda", dtype=binding_torch_dtype(idx))
            .contiguous()
        )
        # Dynamic-shape engines need the concrete input shape set before the
        # output shapes can be queried.
        context.set_binding_shape(idx, tuple(tensor.shape))
        bindings[idx] = tensor.data_ptr()
        supplied.add(idx)
        device_inputs.append(tensor)

    outputs = {}
    for idx in range(engine.num_bindings):
        binding_name = engine.get_binding_name(idx)
        if engine.binding_is_input(idx):
            if idx not in supplied:
                raise ValueError(
                    f"No data supplied for engine input {binding_name!r}"
                )
            continue
        out_shape = tuple(context.get_binding_shape(idx))
        out = torch.empty(out_shape, dtype=binding_torch_dtype(idx), device="cuda")
        bindings[idx] = out.data_ptr()
        outputs[binding_name] = out

    # Run inference on the current torch stream, then wait for completion
    # before reading results back.
    if not context.execute_async_v2(bindings, stream.cuda_stream):
        raise RuntimeError("TensorRT execution failed")
    stream.synchronize()

    return {name: tensor.cpu().numpy() for name, tensor in outputs.items()}
Serving Infrastructure
| Solution | Type | Features | Best For |
|---|---|---|---|
| vLLM | LLM serving | PagedAttention, batching | Large LLMs |
| TGI | LLM serving | Continuous batching | Production LLMs |
| Triton | Multi-model | Dynamic batching, ensemble | Multi-model |
| FastAPI | Custom | Flexibility, simplicity | Custom logic |
| Ray Serve | Distributed | Scaling, composition | Complex pipelines |
# vLLM serving example
from vllm import LLM, SamplingParams
def setup_vllm_server(model_name="meta-llama/Llama-2-7b-hf"):
    """Create a vLLM engine configured for high-throughput generation.

    Despite the name, this builds an in-process ``LLM`` object rather than
    starting a network server.

    Args:
        model_name: Model identifier or path handed to ``LLM(model=...)``.

    Returns:
        The constructed ``LLM`` instance.
    """
    llm = LLM(
        model=model_name,
        tensor_parallel_size=1,  # Number of GPUs
        max_model_len=4096,  # Maximum sequence length
        gpu_memory_utilization=0.9,  # GPU memory fraction
        # vLLM has no `batch_size` engine argument; the cap on sequences
        # batched together per step is `max_num_seqs`.
        max_num_seqs=8,  # Maximum batch size
        # SECURITY NOTE(review): trust_remote_code=True lets the model
        # repository execute its own Python code at load time. Kept to
        # preserve behaviour; disable it unless model_name is trusted and
        # actually requires custom code.
        trust_remote_code=True,
    )
    return llm
def generate_responses(llm, prompts, max_tokens=256, temperature=0.7):
    """Run a batch of prompts through a vLLM engine and summarize each result.

    Args:
        llm: Engine object exposing ``generate(prompts, sampling_params)``.
        prompts: Prompts passed to ``llm.generate`` unchanged.
        max_tokens: Generation length limit given to ``SamplingParams``.
        temperature: Sampling temperature given to ``SamplingParams``.

    Returns:
        A list with one dict per generation result, holding the ``"prompt"``,
        the text of the first candidate as ``"response"``, and that
        candidate's token count as ``"tokens"``.
    """
    params = SamplingParams(
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.9,
        frequency_penalty=0.1,
    )

    def summarize(request_output):
        # Only the first candidate completion is reported.
        best = request_output.outputs[0]
        return {
            "prompt": request_output.prompt,
            "response": best.text,
            "tokens": len(best.token_ids),
        }

    return [summarize(item) for item in llm.generate(prompts, params)]
# Start server: build the vLLM engine with the defaults of setup_vllm_server
# (this runs at import time of the snippet).
llm = setup_vllm_server()
# Example prompts; each one yields one entry in `results`.
prompts = [
"Explain quantum computing in simple terms.",
"Write a Python function to sort a list.",
"What are the benefits of exercise?",
]
# generate_responses returns a list of dicts with "prompt", "response" and
# "tokens" keys, using its default max_tokens and temperature.
results = generate_responses(llm, prompts)
Performance Comparison
| Optimization | Latency Reduction | Throughput Gain | Accuracy Loss | Model Size |
|---|---|---|---|---|
| FP16 | 2x | 2x | Negligible | 50% |
| INT8 | 3x | 3x | < 0.5% | 25% |
| INT4 | 4x | 4x | < 1% | 12.5% |
| Pruning (50%) | 2x | 2x | < 1% | 50% |
| Distillation | 10x | 10x | 2-5% | 10-30% |
Key Takeaways
- ONNX provides framework-agnostic optimization and broad runtime support
- Quantization offers significant speedup with minimal accuracy loss
- TensorRT achieves the best performance on NVIDIA hardware
- vLLM is the preferred serving solution for large language models
- Always benchmark on representative production workloads before deployment