Why Guardrails Matter
LLMs can generate harmful, misleading, or inappropriate content. Guardrails are safety mechanisms that filter inputs and outputs to ensure responsible AI deployment.
Input Guardrails
Content Classification
from dataclasses import dataclass
from enum import Enum
class SafetyVerdict(Enum):
    """Possible outcomes of a safety check.

    Each member's value is its lowercase string form, so a member can be
    looked up from text, e.g. ``SafetyVerdict("unsafe")``.
    """

    # Content passed the check.
    SAFE = "safe"

    # Content failed the check.
    UNSAFE = "unsafe"

    # No definite verdict is available.
    UNCERTAIN = "uncertain"
@dataclass
class GuardrailResult:
    """Outcome of one guardrail check for a single category.

    Fields are positional in this order: verdict, category, confidence,
    details.
    """

    # Verdict reached for this category.
    verdict: SafetyVerdict

    # Name of the category (or pseudo-category) the verdict refers to.
    category: str

    # Confidence attached to the verdict.
    confidence: float

    # Free-form supplementary data about the result.
    details: dict
class ContentClassifier:
    """Classify text against a fixed list of safety categories via a model.

    The wrapped ``model`` only needs a ``generate(prompt)`` method; its return
    value is parsed as JSON.
    """

    def __init__(self, model):
        """Store the model and the list of categories to ask it about."""
        self.model = model
        self.categories = [
            "violence", "hate_speech", "self_harm",
            "sexual_content", "illegal_activity", "misinformation",
        ]

    def classify(self, text: str) -> list[GuardrailResult]:
        """Ask the model to rate ``text`` for every category.

        Args:
            text: Content to classify. It is interpolated into the prompt
                verbatim.

        Returns:
            One ``GuardrailResult`` per item in the model's JSON reply, or a
            single ``UNCERTAIN`` / ``"parse_error"`` result when the reply
            cannot be used.
        """
        # Joined with explicit newlines so the prompt text never depends on
        # how this method happens to be indented in the source file.
        prompt = "\n".join([
            "Classify this text for safety concerns.",
            "For each category, provide: safe/unsafe, confidence (0-1), and explanation.",
            f"Text: {text}",
            "Categories: " + ", ".join(self.categories),
            "Response format: JSON list of {category, verdict, confidence, explanation}",
        ])
        response = self.model.generate(prompt)
        return self._parse_results(response)

    def _parse_results(self, response: str) -> list[GuardrailResult]:
        """Turn the model's JSON reply into ``GuardrailResult`` objects.

        Each item must provide ``verdict`` (a ``SafetyVerdict`` value),
        ``category`` and a numeric ``confidence``; ``explanation`` is optional.
        Any malformed reply yields the single parse-error fallback result
        instead of raising.
        """
        import json

        try:
            data = json.loads(response)
            return [
                GuardrailResult(
                    verdict=SafetyVerdict(item["verdict"]),
                    category=item["category"],
                    # Coerce so later numeric comparisons cannot fail on a
                    # confidence the model returned as a string.
                    confidence=float(item["confidence"]),
                    details={"explanation": item.get("explanation", "")},
                )
                for item in data
            ]
        except (ValueError, KeyError, TypeError, RecursionError):
            # ValueError covers json.JSONDecodeError, unknown verdict strings
            # and non-numeric confidences; KeyError covers missing fields;
            # TypeError covers replies with the wrong shape; RecursionError
            # covers pathologically nested JSON. KeyboardInterrupt and
            # SystemExit are deliberately NOT swallowed.
            return [GuardrailResult(SafetyVerdict.UNCERTAIN, "parse_error", 0.5, {})]
PII Detection and Masking
import re
class PIIDetector:
    """Find and mask common kinds of PII using regular expressions.

    ``self.patterns`` maps a PII type name to a regex source string. The
    mapping is a plain public attribute, and ``detect``/``mask`` read it on
    every call.
    """

    def __init__(self):
        """Set up the built-in pattern table."""
        self.patterns = {
            # The TLD class is [A-Za-z]: inside a character class "|" is a
            # literal pipe, not alternation, so "[A-Z|a-z]" wrongly accepted
            # addresses such as "a@b.co|m".
            "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            "phone": r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
            "ssn": r'\b\d{3}-\d{2}-\d{4}\b',
            "credit_card": r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b',
            "ip_address": r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b',
        }

    def detect(self, text: str) -> dict[str, list[str]]:
        """Return every PII match found in ``text``, grouped by PII type.

        Args:
            text: Text to scan.

        Returns:
            Mapping of PII type to the list of matched substrings. Types with
            no match are omitted, so an empty dict means nothing was found.
        """
        findings = {}
        for pii_type, pattern in self.patterns.items():
            # group(0) always yields the whole match, even if a pattern in
            # the table contains capture groups.
            matches = [m.group(0) for m in re.finditer(pattern, text)]
            if matches:
                findings[pii_type] = matches
        return findings

    def mask(self, text: str) -> str:
        """Return ``text`` with each match replaced by ``[TYPE]``.

        Patterns are applied one after another in table order, each to the
        output of the previous substitution. The placeholder is the
        upper-cased PII type name in square brackets, e.g. ``[EMAIL]``.
        """
        for pii_type, pattern in self.patterns.items():
            text = re.sub(pattern, f"[{pii_type.upper()}]", text)
        return text
class PIIGuardrail:
    """Apply a PII policy to text using a detector.

    ``action`` selects what happens when PII is found: ``"block"`` returns an
    empty string, ``"mask"`` returns the detector's masked text, and any other
    value (``"flag"``) returns the text unchanged while still reporting the
    findings.
    """

    def __init__(self, detector: PIIDetector, action: str = "mask"):
        """Remember the detector and the policy to apply."""
        self.action = action  # "mask", "block", "flag"
        self.detector = detector

    def check(self, text: str) -> tuple[str, dict]:
        """Scan ``text`` and apply the configured action.

        Returns:
            ``(processed_text, report)``. The report always contains
            ``"blocked"`` and ``"pii_found"``; when PII was found it also
            contains ``"findings"`` as returned by the detector.
        """
        found = self.detector.detect(text)
        if not found:
            return text, {"blocked": False, "pii_found": False}

        is_blocking = self.action == "block"
        report = {"blocked": is_blocking, "pii_found": True, "findings": found}

        if is_blocking:
            return "", report
        if self.action == "mask":
            return self.detector.mask(text), report
        # Anything else behaves as "flag": pass the text through untouched.
        return text, report
Prompt Injection Detection
class InjectionDetector:
    """Detect likely prompt-injection attempts.

    Combines a case-insensitive substring scan for known phrases with the
    results of a classifier. The classifier must expose ``classify(prompt)``
    returning a sequence of objects with ``category`` and ``confidence``
    attributes.
    """

    def __init__(self, classifier):
        """Store the classifier and the list of suspicious phrases."""
        self.classifier = classifier
        self.suspicious_patterns = [
            "ignore previous instructions",
            "you are now",
            "forget everything",
            "new instructions:",
            "system prompt:",
            "act as",
            "pretend you are",
            "roleplay as",
        ]

    def detect(self, prompt: str) -> dict:
        """Assess ``prompt`` for prompt injection.

        Args:
            prompt: The text to inspect.

        Returns:
            A dict with:
              * ``"is_injection"``: True if any phrase matched, or the
                classifier reported category ``"injection"`` with confidence
                above 0.7.
              * ``"pattern_matches"``: the phrases found in the prompt.
              * ``"classification"``: the classifier's raw results.
              * ``"risk_score"``: 0.3 per matched phrase plus the mean
                classifier confidence, capped at 1.0.
        """
        # Pattern-based detection (phrases are stored lowercase).
        prompt_lower = prompt.lower()
        pattern_matches = [
            p for p in self.suspicious_patterns
            if p in prompt_lower
        ]

        # ML-based classification
        classification = self.classifier.classify(prompt)
        flagged_by_classifier = any(
            c.category == "injection" and c.confidence > 0.7
            for c in classification
        )

        # An empty classification must not crash the check with a
        # ZeroDivisionError; it simply contributes nothing to the score.
        if classification:
            mean_confidence = (
                sum(c.confidence for c in classification) / len(classification)
            )
        else:
            mean_confidence = 0.0

        # NOTE(review): the mean covers every result regardless of category
        # or verdict, so confident non-injection results also raise the
        # score -- confirm this is intended.
        return {
            "is_injection": bool(pattern_matches) or flagged_by_classifier,
            "pattern_matches": pattern_matches,
            "classification": classification,
            "risk_score": min(1.0, len(pattern_matches) * 0.3 + mean_confidence),
        }
Output Guardrails
Hallucination Detection
class HallucinationDetector:
    """Score how well a response is supported by its source context.

    ``nli_model`` must expose ``predict(context, claim)``; a claim counts as
    supported only when that call returns the string ``"entailment"``.
    """

    def __init__(self, nli_model):
        """Store the inference model used to test each claim."""
        self.nli_model = nli_model  # Natural Language Inference model

    def check_faithfulness(self, context: str, response: str) -> dict:
        """Check every claim in ``response`` against ``context``.

        Args:
            context: Source text the response should be grounded in.
            response: Generated text to verify.

        Returns:
            A dict with ``"faithfulness_score"`` (supported / total claims,
            or 0 when the response yields no claims), ``"total_claims"``,
            ``"supported"`` and ``"unsupported"``.
        """
        # Split response into claims
        claims = self._extract_claims(response)

        supported = sum(
            1 for claim in claims
            if self.nli_model.predict(context, claim) == "entailment"
        )

        return {
            "faithfulness_score": supported / len(claims) if claims else 0.0,
            "total_claims": len(claims),
            "supported": supported,
            "unsupported": len(claims) - supported,
        }

    def _extract_claims(self, text: str) -> list[str]:
        """Split ``text`` into sentence-level claims.

        A sentence ends at ``.``, ``!`` or ``?`` only when the terminator is
        followed by whitespace or the end of the text, so a decimal such as
        "3.5" stays inside its sentence. The terminator itself is dropped,
        and fragments of 10 characters or fewer (after stripping) are
        discarded.
        """
        import re

        sentences = re.split(r"[.!?]+(?:\s+|$)", text)
        return [s.strip() for s in sentences if len(s.strip()) > 10]
Output Format Validation
class OutputValidator:
    """Validate model output against an expected format.

    The formats ``"json"``, ``"email"`` and ``"code"`` are routed to a method
    named ``_validate_<format>``. Only the JSON validator is defined here;
    the other two can be supplied by a subclass.
    """

    # Formats that validate() dispatches to a ``_validate_<format>`` method.
    _DISPATCHED_FORMATS = ("json", "email", "code")

    def __init__(self, schemas: dict):
        """Store ``schemas``; no method of this class reads it."""
        self.schemas = schemas

    def validate(self, output: str, expected_format: str) -> dict:
        """Validate ``output`` as ``expected_format``.

        Args:
            output: The text to validate.
            expected_format: Name of the expected format.

        Returns:
            The validator's result dict, which always has a ``"valid"`` key.
            Formats outside the dispatched set are not checked and yield
            ``{"valid": True}``. A dispatched format whose validator method
            is missing yields ``{"valid": False, "error": ...}`` instead of
            raising ``AttributeError``.
        """
        if expected_format not in self._DISPATCHED_FORMATS:
            return {"valid": True}

        # Looked up dynamically so a missing validator fails closed with a
        # readable error, and a subclass can add one without touching this
        # method.
        validator = getattr(self, f"_validate_{expected_format}", None)
        if validator is None:
            return {
                "valid": False,
                "error": f"No validator implemented for format {expected_format!r}",
            }
        return validator(output)

    def _validate_json(self, output: str) -> dict:
        """Parse ``output`` as JSON.

        Returns:
            ``{"valid": True, "parsed": <value>}`` on success, otherwise
            ``{"valid": False, "error": <decoder message>}``.
        """
        import json

        try:
            data = json.loads(output)
        except json.JSONDecodeError as e:
            return {"valid": False, "error": str(e)}
        return {"valid": True, "parsed": data}
Guardrails Framework
class GuardrailsPipeline:
    """Run text through ordered chains of input and output guards.

    Input guards are called as ``guard.check(text)`` and output guards as
    ``guard.check(text, context)``. Each call must return
    ``(processed_text, result_dict)``.
    """

    def __init__(self):
        """Start with no guards registered."""
        self.input_guards = []
        self.output_guards = []

    def add_input_guard(self, guard):
        """Append ``guard`` to the end of the input chain."""
        self.input_guards.append(guard)

    def add_output_guard(self, guard):
        """Append ``guard`` to the end of the output chain."""
        self.output_guards.append(guard)

    @staticmethod
    def _run_guards(guards, text, *extra_args):
        """Feed ``text`` through ``guards`` in order, collecting results.

        Each guard receives the previous guard's output. The run stops at the
        first result whose ``"blocked"`` entry is truthy, returning an empty
        string together with the results gathered so far.
        """
        reports = []
        current = text
        for guard in guards:
            current, report = guard.check(current, *extra_args)
            reports.append(report)
            if report.get("blocked"):
                return "", reports
        return current, reports

    def check_input(self, text: str) -> tuple[str, list[dict]]:
        """Run the input chain; return ``(processed_text, results)``."""
        return self._run_guards(self.input_guards, text)

    def check_output(self, text: str, context: str = "") -> tuple[str, list[dict]]:
        """Run the output chain, passing ``context`` to every guard."""
        return self._run_guards(self.output_guards, text, context)
Safety Metrics
| Metric | Definition | Target |
|---|---|---|
| Harmful Output Rate | % of unsafe outputs | <0.1% |
| PII Leakage Rate | % of outputs with exposed PII | 0% |
| Injection Success Rate | % of prompt-injection/jailbreak attempts that succeed | <1% |
| False Positive Rate | % of safe content blocked | <5% |
| Latency Overhead | Added latency from guardrails | <100ms |
Effective guardrails balance safety with usability, implementing multiple defense layers without excessive friction for legitimate use cases.