LLM Feedback Loops: Human Feedback and RLHF Pipelines
Feedback loops are essential for aligning LLM behavior with human preferences. Production systems require structured feedback collection, efficient RLHF training pipelines, and continuous model improvement cycles.
Feedback Loop Pipeline
Feedback Collection System
1. Structured Feedback Collector
import uuid
import time
from dataclasses import dataclass, field
from typing import Optional, List, Dict, Any
from enum import Enum
class FeedbackType(Enum):
    """Closed set of feedback kinds that can be attached to a completion.

    The string values are the serialized form of each kind.
    """

    THUMBS_UP = "thumbs_up"
    THUMBS_DOWN = "thumbs_down"
    RATING = "rating"
    TEXT_CORRECTION = "text_correction"
    PREFERENCE = "preference"
@dataclass
class FeedbackEntry:
    """One piece of feedback about a single prompt/completion pair.

    The first seven fields are required and positional; the remaining
    fields have defaults.
    """

    feedback_id: str  # unique identifier of this feedback record
    trace_id: str  # identifier of the interaction the feedback refers to
    model: str  # name of the model that produced the completion
    prompt: str
    completion: str
    feedback_type: FeedbackType
    value: Any  # payload; its shape depends on feedback_type
    # Creation time, taken from time.time() when not supplied.
    timestamp: float = field(default_factory=time.time)
    user_id: Optional[str] = None
    # A fresh dict per instance, so entries never share metadata.
    metadata: Dict[str, Any] = field(default_factory=dict)
class FeedbackCollector:
    """In-memory collector of user feedback on model completions.

    Every ``record_*`` method builds a ``FeedbackEntry`` with a fresh UUID,
    appends it to ``self.entries`` and returns it. The ``get_*`` methods
    project the stored entries into prompt/completion dictionaries.
    """

    def __init__(self):
        self.entries: List[FeedbackEntry] = []

    def _record(self, trace_id: str, model: str, prompt: str,
                completion: str, feedback_type: FeedbackType, value: Any,
                user_id: Optional[str]) -> FeedbackEntry:
        """Create an entry with a new UUID, store it, and return it."""
        entry = FeedbackEntry(
            feedback_id=str(uuid.uuid4()),
            trace_id=trace_id,
            model=model,
            prompt=prompt,
            completion=completion,
            feedback_type=feedback_type,
            value=value,
            user_id=user_id,
        )
        self.entries.append(entry)
        return entry

    def record_thumbs(self, trace_id: str, model: str, prompt: str,
                      completion: str, thumbs_up: bool,
                      user_id: Optional[str] = None) -> FeedbackEntry:
        """Record a binary thumbs-up / thumbs-down vote.

        The entry type is THUMBS_UP or THUMBS_DOWN according to
        ``thumbs_up``; the same flag is stored as the entry's value.
        """
        feedback_type = (
            FeedbackType.THUMBS_UP if thumbs_up else FeedbackType.THUMBS_DOWN
        )
        return self._record(trace_id, model, prompt, completion,
                            feedback_type, thumbs_up, user_id)

    def record_rating(self, trace_id: str, model: str, prompt: str,
                      completion: str, rating: int,
                      user_id: Optional[str] = None) -> FeedbackEntry:
        """Record a numeric rating, clamped into the range 1..5."""
        clamped = max(1, min(5, rating))
        return self._record(trace_id, model, prompt, completion,
                            FeedbackType.RATING, clamped, user_id)

    def record_correction(self, trace_id: str, model: str, prompt: str,
                          original_completion: str, corrected_completion: str,
                          user_id: Optional[str] = None) -> FeedbackEntry:
        """Record a user-supplied rewrite of a completion.

        The entry's ``completion`` is the original text; the value is a dict
        with keys ``"original"`` and ``"corrected"``.
        """
        value = {
            "original": original_completion,
            "corrected": corrected_completion,
        }
        return self._record(trace_id, model, prompt, original_completion,
                            FeedbackType.TEXT_CORRECTION, value, user_id)

    def record_preference(self, trace_id: str, model: str, prompt: str,
                          chosen: str, rejected: str,
                          user_id: Optional[str] = None) -> FeedbackEntry:
        """Record a pairwise preference between two completions.

        The entry's ``completion`` is the chosen text; the value is a dict
        with keys ``"chosen"`` and ``"rejected"``, which is the shape
        ``get_preference_pairs`` merges into its output.
        """
        value = {"chosen": chosen, "rejected": rejected}
        return self._record(trace_id, model, prompt, chosen,
                            FeedbackType.PREFERENCE, value, user_id)

    def get_positive_pairs(self) -> List[Dict]:
        """Return ``{"prompt", "chosen"}`` dicts for truthy THUMBS_UP entries."""
        return [
            {"prompt": e.prompt, "chosen": e.completion}
            for e in self.entries
            if e.feedback_type == FeedbackType.THUMBS_UP and e.value
        ]

    def get_negative_pairs(self) -> List[Dict]:
        """Return ``{"prompt", "rejected"}`` dicts for THUMBS_DOWN entries."""
        return [
            {"prompt": e.prompt, "rejected": e.completion}
            for e in self.entries
            if e.feedback_type == FeedbackType.THUMBS_DOWN
        ]

    def get_preference_pairs(self) -> List[Dict]:
        """Return one dict per PREFERENCE entry.

        Each dict holds the prompt plus every key of the entry's value
        mapping; keys in the value take precedence over ``"prompt"``.
        """
        return [
            {"prompt": e.prompt, **e.value}
            for e in self.entries
            if e.feedback_type == FeedbackType.PREFERENCE
        ]
2. RLHF Training Pipeline
import numpy as np
from dataclasses import dataclass
from typing import List, Tuple
@dataclass
class RewardModel:
    """Descriptive record for a trained reward model.

    Holds identification and summary metrics only; it performs no inference.
    """

    model_name: str
    version: int
    training_loss: float
    accuracy: float
class RLHFPipeline:
    """Numeric building blocks for a PPO-style RLHF training step.

    Provides reward lookup, advantage computation, the clipped PPO
    surrogate loss and a KL penalty against a reference policy.
    """

    def __init__(self, reward_model: "RewardModel"):
        self.reward_model = reward_model
        self.training_history: List[dict] = []

    def compute_reward(self, prompt: str, completion: str) -> float:
        """Return the reward for one prompt/completion pair.

        Currently a stub that always returns 0.5.
        """
        # Placeholder for actual reward model inference
        # In production, this calls the reward model
        return 0.5

    def compute_rewards_batch(self, prompts: List[str],
                              completions: List[str]) -> List[float]:
        """Return one reward per (prompt, completion) pair, in order.

        Raises:
            ValueError: if the two lists differ in length.
        """
        # zip() would silently drop the unmatched tail of the longer list.
        if len(prompts) != len(completions):
            raise ValueError(
                "prompts and completions must have the same length"
            )
        return [self.compute_reward(p, c) for p, c in zip(prompts, completions)]

    def compute_advantages(self, rewards: List[float],
                           baseline: float = 0.0) -> List[float]:
        """Return each reward minus ``baseline``."""
        return [r - baseline for r in rewards]

    def ppo_loss(self, old_logprobs: List[float], new_logprobs: List[float],
                 advantages: List[float], clip_ratio: float = 0.2) -> float:
        """Return the clipped PPO surrogate loss (to be minimized).

        For each sample the probability ratio is ``exp(new - old)``; the
        objective is the smaller of ``ratio * advantage`` and the same
        product with the ratio clipped to
        ``[1 - clip_ratio, 1 + clip_ratio]``. The loss is the negated mean
        of that objective.

        Returns:
            The loss as a Python float; 0.0 for an empty batch.

        Raises:
            ValueError: if the three inputs do not have the same shape.
        """
        old = np.asarray(old_logprobs, dtype=float)
        new = np.asarray(new_logprobs, dtype=float)
        adv = np.asarray(advantages, dtype=float)
        if not (old.shape == new.shape == adv.shape):
            raise ValueError(
                "old_logprobs, new_logprobs and advantages must have the "
                "same shape"
            )
        # The mean of an empty array is nan (with a warning); an empty batch
        # contributes no loss instead.
        if old.size == 0:
            return 0.0

        ratios = np.exp(new - old)
        unclipped = ratios * adv
        clipped = np.clip(ratios, 1 - clip_ratio, 1 + clip_ratio) * adv
        return float(-np.mean(np.minimum(unclipped, clipped)))

    def compute_kl_penalty(self, ref_logprobs: List[float],
                           new_logprobs: List[float],
                           beta: float = 0.1) -> float:
        """Return ``beta`` times the summed log-ratio ``new - ref``.

        Raises:
            ValueError: if the two lists differ in length.
        """
        if len(ref_logprobs) != len(new_logprobs):
            raise ValueError(
                "ref_logprobs and new_logprobs must have the same length"
            )
        kl = sum(new - ref for new, ref in zip(new_logprobs, ref_logprobs))
        return beta * kl
Key Formulas
Reward Model Objective
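$$\mathcal{L}_{RM}(\theta) = -\,\mathbb{E}_{(x,\, y_w,\, y_l) \sim D}\left[\log \sigma\big(r_\theta(x, y_w) - r_\theta(x, y_l)\big)\right]$$
Here,
- $x$ = Input prompt
- $y_w$ = Preferred (chosen) completion
- $y_l$ = Rejected completion
- $r_\theta$ = Reward model parameterized by $\theta$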
RLHF Objective
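$$\max_{\pi}\; \mathbb{E}_{x \sim D,\; y \sim \pi(\cdot \mid x)}\big[r_\theta(x, y)\big] \;-\; \beta\, D_{KL}\big(\pi(\cdot \mid x)\,\|\,\pi_{ref}(\cdot \mid x)\big)$$
Here,
- $\pi$ = Policy being optimized
- $\pi_{ref}$ = Reference policy (SFT model)
- $r_\theta$ = Reward model
- $\beta$ = KL penalty coefficient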
KL Divergence
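$$D_{KL}\big(\pi \,\|\, \pi_{ref}\big) = \sum_{y} \pi(y \mid x)\, \log \frac{\pi(y \mid x)}{\pi_{ref}(y \mid x)}$$
Here,
- $\pi_{ref}$ = Reference distribution
- $\pi$ = Updated policy distribution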
Feedback Metrics
| Metric | Description | Target |
|---|---|---|
| Feedback Rate | % of interactions with feedback | >5% |
| Positive Ratio | % of positive feedback | >70% |
| Correction Rate | % of responses needing correction | <10% |
| Label Consistency | Inter-annotator agreement | >80% |
| RLHF Improvement | Reward model accuracy gain | >10% |
Best Practices
- Collect diverse feedback across user demographics and use cases
- Balance positive and negative samples for reward model training
- Use human annotation for high-stakes domains
- Monitor for reward hacking, where the policy exploits weaknesses in the reward model
- Retrain reward models periodically so they keep pace with distribution shift in prompts and policy outputs