LLM Feedback Loops: Human Feedback and RLHF Pipelines

Feedback loops are essential for aligning LLM behavior with human preferences. Production systems require structured feedback collection, efficient RLHF training pipelines, and continuous model improvement cycles.

Feedback Loop Pipeline

Feedback Collection System

1. Structured Feedback Collector

import uuid
import time
from dataclasses import dataclass, field
from typing import Optional, List, Dict, Any
from enum import Enum

class FeedbackType(Enum):
    """Closed set of feedback kinds that can be attached to a completion.

    The string values are the serialized form of each kind.
    """
    # Binary approval / disapproval of a single completion.
    THUMBS_UP = "thumbs_up"
    THUMBS_DOWN = "thumbs_down"
    # Numeric score for a single completion.
    RATING = "rating"
    # A user-supplied rewrite of the completion.
    TEXT_CORRECTION = "text_correction"
    # A comparison between completions; the payload is expected to be a mapping
    # (it is unpacked with ** when preference pairs are exported).
    PREFERENCE = "preference"

@dataclass
class FeedbackEntry:
    """One piece of feedback tied to a single prompt/completion exchange.

    The first seven fields are required; the remaining three have defaults.
    """
    # Unique identifier of this feedback record.
    feedback_id: str
    # Identifier of the traced request the feedback refers to.
    trace_id: str
    # Name of the model that produced the completion.
    model: str
    # Prompt text and the completion the feedback is about.
    prompt: str
    completion: str
    # Kind of feedback; determines how `value` should be interpreted.
    feedback_type: FeedbackType
    # Payload of the feedback. Untyped here: its shape depends on feedback_type
    # (e.g. a bool for thumbs, an int for ratings, a dict for corrections).
    value: Any
    # Defaults to the wall-clock time (time.time()) at construction.
    timestamp: float = field(default_factory=time.time)
    # Optional identifier of the user who left the feedback.
    user_id: Optional[str] = None
    # Free-form extra data; each instance gets its own empty dict by default.
    metadata: Dict[str, Any] = field(default_factory=dict)

class FeedbackCollector:
    """In-memory collector of user feedback on model completions.

    Every ``record_*`` method builds a :class:`FeedbackEntry` with a fresh
    UUID4 ``feedback_id``, appends it to ``self.entries`` and returns it.
    The ``get_*`` methods export the collected entries as lists of plain
    dicts keyed by ``"prompt"`` / ``"chosen"`` / ``"rejected"``.

    Nothing is persisted or deduplicated; entries live only in ``self.entries``.
    """

    def __init__(self):
        # Entries in the order they were recorded.
        self.entries: List[FeedbackEntry] = []

    def _record(self, trace_id: str, model: str, prompt: str, completion: str,
                feedback_type: FeedbackType, value: Any,
                user_id: Optional[str]) -> FeedbackEntry:
        """Create, store and return an entry; shared by all ``record_*`` methods."""
        entry = FeedbackEntry(
            feedback_id=str(uuid.uuid4()),
            trace_id=trace_id,
            model=model,
            prompt=prompt,
            completion=completion,
            feedback_type=feedback_type,
            value=value,
            user_id=user_id,
        )
        self.entries.append(entry)
        return entry

    def record_thumbs(self, trace_id: str, model: str, prompt: str,
                      completion: str, thumbs_up: bool,
                      user_id: Optional[str] = None) -> FeedbackEntry:
        """Record a thumbs-up or thumbs-down on ``completion``.

        The entry's type is ``THUMBS_UP`` when ``thumbs_up`` is truthy and
        ``THUMBS_DOWN`` otherwise; ``thumbs_up`` itself is stored as the value.
        """
        feedback_type = FeedbackType.THUMBS_UP if thumbs_up else FeedbackType.THUMBS_DOWN
        return self._record(trace_id, model, prompt, completion,
                            feedback_type, thumbs_up, user_id)

    def record_rating(self, trace_id: str, model: str, prompt: str,
                      completion: str, rating: int,
                      user_id: Optional[str] = None) -> FeedbackEntry:
        """Record a numeric rating for ``completion``.

        Out-of-range ratings are clamped into the 1..5 range rather than
        rejected, so the stored value may differ from the ``rating`` argument.
        """
        clamped = max(1, min(5, rating))
        return self._record(trace_id, model, prompt, completion,
                            FeedbackType.RATING, clamped, user_id)

    def record_correction(self, trace_id: str, model: str, prompt: str,
                          original_completion: str, corrected_completion: str,
                          user_id: Optional[str] = None) -> FeedbackEntry:
        """Record a user-provided rewrite of a completion.

        The entry's ``completion`` is the original text; the value is a dict
        with keys ``"original"`` and ``"corrected"``.
        """
        value = {"original": original_completion, "corrected": corrected_completion}
        return self._record(trace_id, model, prompt, original_completion,
                            FeedbackType.TEXT_CORRECTION, value, user_id)

    def record_preference(self, trace_id: str, model: str, prompt: str,
                          chosen: str, rejected: str,
                          user_id: Optional[str] = None) -> FeedbackEntry:
        """Record a pairwise preference between two completions of ``prompt``.

        The entry's ``completion`` is the chosen text; the value is a dict
        with keys ``"chosen"`` and ``"rejected"``, which is the shape
        :meth:`get_preference_pairs` unpacks into its output.
        """
        value = {"chosen": chosen, "rejected": rejected}
        return self._record(trace_id, model, prompt, chosen,
                            FeedbackType.PREFERENCE, value, user_id)

    def get_positive_pairs(self) -> List[Dict]:
        """Return ``{"prompt", "chosen"}`` dicts for thumbs-up entries with a truthy value."""
        return [
            {"prompt": e.prompt, "chosen": e.completion}
            for e in self.entries
            if e.feedback_type == FeedbackType.THUMBS_UP and e.value
        ]

    def get_negative_pairs(self) -> List[Dict]:
        """Return ``{"prompt", "rejected"}`` dicts for every thumbs-down entry."""
        return [
            {"prompt": e.prompt, "rejected": e.completion}
            for e in self.entries
            if e.feedback_type == FeedbackType.THUMBS_DOWN
        ]

    def get_preference_pairs(self) -> List[Dict]:
        """Return one dict per ``PREFERENCE`` entry: its prompt plus its value mapping.

        The value is unpacked after ``"prompt"``, so a ``"prompt"`` key inside
        the value overrides the entry's own prompt. A non-mapping value raises
        ``TypeError``.
        """
        return [
            {"prompt": e.prompt, **e.value}
            for e in self.entries
            if e.feedback_type == FeedbackType.PREFERENCE
        ]

2. RLHF Training Pipeline

import numpy as np
from dataclasses import dataclass
from typing import List, Tuple

@dataclass
class RewardModel:
    """Descriptive record for a reward model; holds metadata only, no weights or inference logic."""
    # Human-readable name of the reward model.
    model_name: str
    # Version number of this reward model.
    version: int
    # Loss and accuracy figures for the model — presumably from its training /
    # evaluation run; how they were measured is not shown here (TODO confirm).
    training_loss: float
    accuracy: float

class RLHFPipeline:
    """Numeric building blocks of an RLHF update: rewards, advantages, PPO loss, KL penalty.

    The class holds a reward-model descriptor and a ``training_history`` list.
    The methods here are pure computations on their arguments; none of them
    writes to ``training_history``.
    """

    def __init__(self, reward_model: "RewardModel"):
        self.reward_model = reward_model
        self.training_history: List[dict] = []

    def compute_reward(self, prompt: str, completion: str) -> float:
        """Return the reward for one prompt/completion pair.

        Placeholder: always returns 0.5 and ignores both arguments and
        ``self.reward_model``. In production, this calls the reward model.
        """
        return 0.5

    def compute_rewards_batch(self, prompts: List[str],
                              completions: List[str]) -> List[float]:
        """Return one reward per (prompt, completion) pair, in input order.

        Raises:
            ValueError: if the two lists differ in length. A plain ``zip``
                would silently drop the unmatched tail.
        """
        if len(prompts) != len(completions):
            raise ValueError(
                f"prompts and completions must have the same length "
                f"(got {len(prompts)} and {len(completions)})"
            )
        return [self.compute_reward(p, c) for p, c in zip(prompts, completions)]

    def compute_advantages(self, rewards: List[float],
                           baseline: float = 0.0) -> List[float]:
        """Return ``reward - baseline`` for each reward."""
        return [r - baseline for r in rewards]

    def ppo_loss(self, old_logprobs: List[float], new_logprobs: List[float],
                 advantages: List[float], clip_ratio: float = 0.2) -> float:
        """Return the negated mean of the PPO clipped surrogate objective.

        For each sample, ``ratio = exp(new_logprob - old_logprob)`` and the
        surrogate is ``min(ratio * adv, clip(ratio, 1 - c, 1 + c) * adv)``
        with ``c = clip_ratio``.

        Returns:
            The loss as a plain ``float``; ``0.0`` for an empty batch, so an
            empty batch does not yield NaN.

        Raises:
            ValueError: if the three inputs do not have the same shape.
        """
        old = np.asarray(old_logprobs, dtype=float)
        new = np.asarray(new_logprobs, dtype=float)
        adv = np.asarray(advantages, dtype=float)
        if not (old.shape == new.shape == adv.shape):
            raise ValueError(
                f"old_logprobs, new_logprobs and advantages must have the same "
                f"shape (got {old.shape}, {new.shape} and {adv.shape})"
            )
        if adv.size == 0:
            return 0.0

        ratios = np.exp(new - old)
        clipped_ratios = np.clip(ratios, 1 - clip_ratio, 1 + clip_ratio)
        # Element-wise pessimistic bound: take the smaller of the two surrogates.
        surrogate = np.minimum(ratios * adv, clipped_ratios * adv)
        return float(-np.mean(surrogate))

    def compute_kl_penalty(self, ref_logprobs: List[float],
                           new_logprobs: List[float],
                           beta: float = 0.1) -> float:
        """Return ``beta * sum(new_logprob - ref_logprob)``.

        NOTE(review): this equals a sample estimate of KL(new || ref) only if
        the log-probs belong to tokens sampled from the new policy — confirm
        against the caller.

        Raises:
            ValueError: if the two lists differ in length.
        """
        if len(ref_logprobs) != len(new_logprobs):
            raise ValueError(
                f"ref_logprobs and new_logprobs must have the same length "
                f"(got {len(ref_logprobs)} and {len(new_logprobs)})"
            )
        kl = sum(new - ref for new, ref in zip(new_logprobs, ref_logprobs))
        return beta * kl

Key Formulas

Reward Model Objective

$$\mathcal{L}_{RM} = -\mathbb{E}_{(x, y_w, y_l) \sim D} \left[ \log \sigma(r_\theta(x, y_w) - r_\theta(x, y_l)) \right]$$

Here,

$x$ = Input prompt
$y_w$ = Preferred (chosen) completion
$y_l$ = Rejected completion
$r_\theta$ = Reward model parameterized by $\theta$

RLHF Objective

$$\max_{\pi_\theta} \mathbb{E}_{x \sim D, y \sim \pi_\theta(y|x)} \left[ r_\phi(x, y) - \beta \cdot D_{KL}(\pi_\theta || \pi_{ref}) \right]$$

Here,

$\pi_\theta$ = Policy being optimized
$\pi_{ref}$ = Reference policy (SFT model)
$r_\phi$ = Reward model
$\beta$ = KL penalty coefficient

KL Divergence

$$D_{KL}(P || Q) = \sum_{x} P(x) \log \frac{P(x)}{Q(x)}$$

Here,

$P$ = Updated policy distribution ($\pi_\theta$ in the RLHF objective)
$Q$ = Reference distribution ($\pi_{ref}$ in the RLHF objective)

Feedback Metrics

Metric	Description	Target
Feedback Rate	% of interactions with feedback	>5%
Positive Ratio	% of positive feedback	>70%
Correction Rate	% of responses needing correction	<10%
Label Consistency	Inter-annotator agreement	>80%
RLHF Improvement	Reward model accuracy gain	>10%

Best Practices

Collect diverse feedback across user demographics and use cases
Balance positive and negative samples for reward model training
Use human annotation for high-stakes domains
Monitor reward hacking where models exploit reward model weaknesses
Retrain reward models periodically to keep pace with distribution shift as the policy's outputs change

LLM Feedback Loops: Human Feedback and RLHF Pipelines

LLM Feedback Loops: Human Feedback and RLHF Pipelines

Feedback Loop Pipeline

Feedback Collection System

1. Structured Feedback Collector

2. RLHF Training Pipeline

Key Formulas

Reward Model Objective

RLHF Objective

KL Divergence

Feedback Metrics

Best Practices

Premium Content

Need Expert AI Ops & LLM Ops Help?