LLM Guardrails

Why Guardrails Matter

LLMs can generate harmful, misleading, or inappropriate content. Guardrails are safety mechanisms that filter inputs and outputs to ensure responsible AI deployment.

Input Guardrails

Content Classification

from dataclasses import dataclass
from enum import Enum

class SafetyVerdict(Enum):
    """Possible outcomes of a safety check.

    Each member's value is the lowercase string form of the verdict, so a
    member can be looked up by value, e.g. ``SafetyVerdict("unsafe")``.
    """

    SAFE = "safe"
    UNSAFE = "unsafe"
    # Also used as the fallback verdict when a classifier response cannot
    # be parsed.
    UNCERTAIN = "uncertain"

@dataclass
class GuardrailResult:
    """Result of one guardrail check for a single category."""

    # Verdict reached for this category.
    verdict: SafetyVerdict
    # Name of the category the verdict applies to (e.g. "violence"), or
    # "parse_error" when the classifier response could not be parsed.
    category: str
    # Confidence attached to the verdict; the classifier prompt asks for 0-1.
    confidence: float
    # Free-form extras; the classifier stores {"explanation": ...} here.
    details: dict

class ContentClassifier:
    """Classify text against a fixed list of safety categories via an LLM.

    ``model`` is any object exposing ``generate(prompt)`` that returns the
    model's reply as a string.
    """

    def __init__(self, model):
        self.model = model
        self.categories = [
            "violence", "hate_speech", "self_harm",
            "sexual_content", "illegal_activity", "misinformation"
        ]

    def classify(self, text: str) -> list[GuardrailResult]:
        """Ask the model to rate ``text`` for every configured category.

        Returns one ``GuardrailResult`` per item in the model's JSON reply,
        or a single ``parse_error`` / ``UNCERTAIN`` result when the reply
        cannot be interpreted.
        """
        # NOTE(review): ``text`` is interpolated verbatim, so the text being
        # classified can itself carry instructions aimed at the classifier
        # model. Consider delimiting or escaping it upstream.
        prompt = f"""Classify this text for safety concerns.
        For each category, provide: safe/unsafe, confidence (0-1), and explanation.

        Text: {text}

        Categories: {', '.join(self.categories)}
        Response format: JSON list of {{category, verdict, confidence, explanation}}"""

        response = self.model.generate(prompt)
        return self._parse_results(response)

    def _parse_results(self, response: str) -> list[GuardrailResult]:
        """Turn the model's JSON reply into ``GuardrailResult`` objects.

        Any malformed reply (invalid JSON, wrong shape, missing keys, unknown
        verdict, non-numeric confidence) collapses to a single
        ``UNCERTAIN`` / ``parse_error`` result instead of raising.
        """
        import json

        try:
            data = json.loads(response)
            return [
                GuardrailResult(
                    verdict=SafetyVerdict(item["verdict"]),
                    category=item["category"],
                    # Coerce so that a reply like "0.9" still compares
                    # numerically downstream.
                    confidence=float(item["confidence"]),
                    details={"explanation": item.get("explanation", "")},
                )
                for item in data
            ]
        # Only failures caused by the reply itself are swallowed:
        #   ValueError     - invalid JSON (JSONDecodeError), unknown verdict,
        #                    non-numeric confidence
        #   KeyError       - required key missing
        #   TypeError      - reply is not a list of objects / not a string
        #   RecursionError - pathologically nested JSON
        # KeyboardInterrupt / SystemExit now propagate, unlike with a bare
        # ``except:``.
        except (ValueError, KeyError, TypeError, RecursionError):
            return [GuardrailResult(SafetyVerdict.UNCERTAIN, "parse_error", 0.5, {})]

PII Detection and Masking

import re

class PIIDetector:
    """Regex-based detector and masker for common PII formats.

    ``patterns`` maps a PII type name to a regular-expression string.
    Insertion order matters for ``mask``: patterns are applied in that order.
    """

    def __init__(self):
        self.patterns = {
            # The TLD class is [A-Za-z]; a "|" inside a character class is a
            # literal pipe, not alternation, and must not be accepted here.
            "email": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            "phone": r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',
            "ssn": r'\b\d{3}-\d{2}-\d{4}\b',
            "credit_card": r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b',
            # Structural match only: octets are not range-checked (0-255).
            "ip_address": r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b',
        }

    def detect(self, text: str) -> dict[str, list[str]]:
        """Return ``{pii_type: [matched strings]}`` for every type found.

        Types with no match are omitted, so an empty dict means no PII was
        detected.
        """
        findings = {}
        for pii_type, pattern in self.patterns.items():
            matches = re.findall(pattern, text)
            if matches:
                findings[pii_type] = matches
        return findings

    def mask(self, text: str) -> str:
        """Replace every match with a ``[TYPE]`` placeholder, e.g. ``[EMAIL]``."""
        for pii_type, pattern in self.patterns.items():
            text = re.sub(pattern, f"[{pii_type.upper()}]", text)
        return text

class PIIGuardrail:
    """Guard that applies a PII policy ("mask", "block" or "flag") to text."""

    def __init__(self, detector: PIIDetector, action: str = "mask"):
        self.detector = detector
        self.action = action  # "mask", "block", "flag"

    def check(self, text: str) -> tuple[str, dict]:
        """Run PII detection on ``text`` and apply the configured action.

        Returns ``(text_out, report)``. ``report`` always carries ``blocked``
        and ``pii_found``; ``findings`` is present only when PII was found.
        ``text_out`` is empty when blocked, masked when masking, and the
        input unchanged otherwise.
        """
        found = self.detector.detect(text)
        if not found:
            return text, {"blocked": False, "pii_found": False}

        report = {"blocked": False, "pii_found": True, "findings": found}

        if self.action == "block":
            report["blocked"] = True
            return "", report

        if self.action == "mask":
            return self.detector.mask(text), report

        # Any other action value is treated as "flag": report, don't modify.
        return text, report

Prompt Injection Detection

class InjectionDetector:
    """Detect likely prompt-injection attempts.

    Combines a case-insensitive substring scan for known phrases with the
    results of ``classifier.classify(prompt)``, which must return an iterable
    of objects exposing ``category`` and ``confidence``.
    """

    def __init__(self, classifier):
        self.classifier = classifier
        self.suspicious_patterns = [
            "ignore previous instructions",
            "you are now",
            "forget everything",
            "new instructions:",
            "system prompt:",
            "act as",
            "pretend you are",
            "roleplay as",
        ]

    def detect(self, prompt: str) -> dict:
        """Score ``prompt`` for injection risk.

        Returns a dict with:
          * ``is_injection``    - True if any phrase matched or the classifier
                                  reported category "injection" with
                                  confidence above 0.7
          * ``pattern_matches`` - the phrases found in the prompt
          * ``classification``  - the raw classifier results
          * ``risk_score``      - 0.3 per matched phrase plus the mean
                                  classifier confidence, capped at 1.0
        """
        # Pattern-based detection (plain substring match, case-insensitive).
        prompt_lower = prompt.lower()
        pattern_matches = [
            p for p in self.suspicious_patterns
            if p in prompt_lower
        ]

        # ML-based classification.
        # NOTE(review): this only fires if the supplied classifier emits an
        # "injection" category - confirm the classifier is configured for it.
        classification = self.classifier.classify(prompt)
        flagged_by_model = any(
            c.category == "injection" and c.confidence > 0.7
            for c in classification
        )

        # An empty classification list must not crash the guard with a
        # ZeroDivisionError; it simply contributes nothing to the score.
        # NOTE(review): the mean covers every result regardless of verdict,
        # so confident "safe" results also raise the score - confirm intent.
        if classification:
            mean_confidence = (
                sum(c.confidence for c in classification) / len(classification)
            )
        else:
            mean_confidence = 0.0

        return {
            "is_injection": bool(pattern_matches) or flagged_by_model,
            "pattern_matches": pattern_matches,
            "classification": classification,
            "risk_score": min(1.0, len(pattern_matches) * 0.3 + mean_confidence),
        }

Output Guardrails

Hallucination Detection

class HallucinationDetector:
    """Estimate how much of a response is supported by a context passage.

    ``nli_model`` is any object exposing ``predict(context, claim)`` that
    returns a label string; only the label "entailment" counts as supported.
    """

    def __init__(self, nli_model):
        self.nli_model = nli_model  # Natural Language Inference model

    def check_faithfulness(self, context: str, response: str) -> dict:
        """Check each claim in ``response`` against ``context``.

        Returns a dict with ``faithfulness_score`` (supported / total, or 0.0
        when the response yields no claims), ``total_claims``, ``supported``
        and ``unsupported``.
        """
        claims = self._extract_claims(response)

        supported = sum(
            1 for claim in claims
            if self.nli_model.predict(context, claim) == "entailment"
        )
        total = len(claims)

        return {
            "faithfulness_score": supported / total if total else 0.0,
            "total_claims": total,
            "supported": supported,
            "unsupported": total - supported,
        }

    def _extract_claims(self, text: str) -> list[str]:
        """Split ``text`` into sentence-like claims longer than 10 characters.

        A sentence ends at ``.``, ``!`` or ``?`` only when that terminator is
        followed by whitespace or the end of the text. A period inside a
        token (for example the decimal point in "3.14") does not split the
        claim. Terminators are dropped from the returned claims.
        """
        import re

        pieces = re.split(r'[.!?]+(?:\s+|$)', text)
        return [p.strip() for p in pieces if len(p.strip()) > 10]

Output Format Validation

class OutputValidator:
    """Validate model output against a named expected format.

    ``schemas`` is stored on the instance; none of the validators defined
    here read it.
    """

    # Formats that require a dedicated ``_validate_<format>`` method.
    _KNOWN_FORMATS = ("json", "email", "code")

    def __init__(self, schemas: dict):
        self.schemas = schemas

    def validate(self, output: str, expected_format: str) -> dict:
        """Validate ``output`` as ``expected_format``.

        Returns a dict that always contains ``valid``. Formats outside
        "json" / "email" / "code" are not checked and report
        ``{"valid": True}``. A recognised format with no
        ``_validate_<format>`` method on this object reports
        ``{"valid": False, "error": ...}`` rather than raising
        ``AttributeError``; subclasses can supply the missing validators.
        """
        if expected_format not in self._KNOWN_FORMATS:
            return {"valid": True}

        handler = getattr(self, f"_validate_{expected_format}", None)
        if handler is None:
            # Fail closed: a format that cannot be checked is not "valid".
            return {
                "valid": False,
                "error": f"no validator implemented for format '{expected_format}'",
            }
        return handler(output)

    def _validate_json(self, output: str) -> dict:
        """Parse ``output`` as JSON; return the parsed value or the error text."""
        import json
        try:
            data = json.loads(output)
            return {"valid": True, "parsed": data}
        except json.JSONDecodeError as e:
            return {"valid": False, "error": str(e)}

Guardrails Framework

class GuardrailsPipeline:
    """Ordered chain of input guards and output guards.

    Every guard exposes ``check(...)`` returning ``(text, result_dict)``.
    Guards run in registration order, each receiving the text produced by
    the previous one. The chain stops at the first result whose ``blocked``
    entry is truthy.
    """

    def __init__(self):
        self.input_guards = []
        self.output_guards = []

    def add_input_guard(self, guard):
        """Register ``guard`` to run on inputs; it is called as ``check(text)``."""
        self.input_guards.append(guard)

    def add_output_guard(self, guard):
        """Register ``guard`` to run on outputs; called as ``check(text, context)``."""
        self.output_guards.append(guard)

    @staticmethod
    def _run_chain(guards, text, *extra_args):
        """Feed ``text`` through ``guards`` and collect each guard's result.

        Returns ``("", results)`` as soon as a guard reports ``blocked``,
        otherwise ``(final_text, results)``.
        """
        collected = []
        current = text
        for guard in guards:
            current, outcome = guard.check(current, *extra_args)
            collected.append(outcome)
            if outcome.get("blocked"):
                return "", collected
        return current, collected

    def check_input(self, text: str) -> tuple[str, list[dict]]:
        """Run all input guards over ``text``."""
        return self._run_chain(self.input_guards, text)

    def check_output(self, text: str, context: str = "") -> tuple[str, list[dict]]:
        """Run all output guards over ``text``, passing ``context`` to each."""
        return self._run_chain(self.output_guards, text, context)

Safety Metrics

Metric	Definition	Target
Harmful Output Rate	% of unsafe outputs	<0.1%
PII Leakage Rate	% of outputs with exposed PII	0%
Injection Success Rate	% of injection or jailbreak attempts that succeed	<1%
False Positive Rate	% of safe content blocked	<5%
Latency Overhead	Added latency from guardrails	<100ms

Effective guardrails balance safety with usability, implementing multiple defense layers without excessive friction for legitimate use cases.