Fine-Tuning in LLMOps
Fine-tuning adapts pre-trained LLMs to specific tasks or domains. It requires specialized infrastructure for handling billion-parameter models and training data management.
Fine-Tuning Methods
Full Fine-Tuning
Updates all model parameters. Requires significant GPU memory and compute.
Definition: Full Fine-Tuning Memory
For a model with P parameters in FP16:
\text{Memory}_{full} = 2P \text{ (weights)} + 2P \text{ (gradients)} + 6P \text{ (Adam states)} = 10P \text{ bytes}
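For a 7B-parameter model, this comes to roughly 70GB of GPU memory before activations are counted. Standard mixed-precision Adam, which additionally keeps FP32 master weights and FP32 moment estimates, needs closer to 16 bytes per parameter (~112GB).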
LoRA (Low-Rank Adaptation)
Freezes original weights and trains low-rank decomposition matrices, reducing trainable parameters dramatically.
W' = W + \Delta W = W + BA
Where B \in \mathbb{R}^{d \times r} and A \in \mathbb{R}^{r \times k}, with rank r \ll \min(d, k).
import torch
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForCausalLM

# Adapter settings: low-rank matrices are attached to the attention
# projections listed in target_modules, while the modules named in
# modules_to_save are trained in full alongside them.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,  # Rank of decomposition
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    bias="none",
    modules_to_save=["lm_head"],  # Train these fully
)

# Base model whose original weights stay frozen; loaded in bfloat16.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Wrap the base model so that only the adapter parameters are trainable.
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()
# Example of the output format (illustrative; the exact counts depend on the
# rank, target_modules and modules_to_save chosen above):
# trainable params: 13,631,488 || all params: 6,744,647,680 || trainable%: 0.20%
QLoRA
Combines quantization with LoRA, enabling fine-tuning on a single GPU.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 4-bit quantized base model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_config,
    device_map="auto",
)

# Quantized models need a preparation pass before adapters are attached:
# PEFT's helper freezes the base weights and applies the adjustments it
# documents for stable k-bit training (e.g. upcasting the normalization
# layers and making the inputs require gradients).
model = prepare_model_for_kbit_training(model)

# LoRA adapters on every attention and MLP projection of the quantized model.
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
# Total memory: ~10GB (vs ~70GB for full fine-tuning)
Distributed Training
Data Parallelism (DDP)
Each GPU holds a complete model copy; gradients are synchronized across GPUs.
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
def setup_ddp(rank, world_size):
    """Join the NCCL process group and select this process's CUDA device.

    Args:
        rank: Index of the calling process within the group. It is also used
            as the CUDA device index.
        world_size: Total number of processes taking part in training.
    """
    # "env://" tells torch.distributed to take the rendezvous details from
    # environment variables rather than from an explicit address.
    group_settings = {
        "backend": "nccl",
        "init_method": "env://",
        "world_size": world_size,
        "rank": rank,
    }
    dist.init_process_group(**group_settings)

    # NOTE(review): using rank as the device index presumably assumes a
    # single node with one process per GPU — confirm for multi-node launches.
    torch.cuda.set_device(rank)
def train_ddp(rank, world_size):
    """Fine-tune the model with DistributedDataParallel, one process per GPU.

    Args:
        rank: Index of the calling process; also used as its CUDA device index.
        world_size: Total number of processes taking part in training.

    Reads the module-level ``dataloader``. Batches are assumed to be mappings
    of tensors that include labels, so that ``outputs.loss`` is populated —
    TODO confirm against the dataloader definition.
    """
    setup_ddp(rank, world_size)
    device = torch.device("cuda", rank)
    try:
        # DDP with device_ids=[rank] expects the wrapped module to already
        # live on that GPU, so move it there before wrapping.
        model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
        model = DDP(model.to(device), device_ids=[rank])
        optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

        # from_pretrained hands the model back in evaluation mode.
        model.train()

        for epoch in range(3):
            # A distributed sampler reshuffles per epoch only when it is told
            # the epoch number; samplers without set_epoch are left alone.
            sampler = getattr(dataloader, "sampler", None)
            if hasattr(sampler, "set_epoch"):
                sampler.set_epoch(epoch)

            for batch in dataloader:
                # Inputs must sit on the same device as this replica.
                batch = {key: value.to(device) for key, value in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                loss.backward()  # Gradients synced by DDP
                optimizer.step()
                optimizer.zero_grad()
    finally:
        # Leave the process group even if training raised.
        dist.destroy_process_group()
DeepSpeed ZeRO
Partitions optimizer states, gradients, and optionally parameters across GPUs.
| ZeRO Stage | Partitions | Memory per GPU (7B, FP16) |
|---|---|---|
| Stage 1 | Optimizer states | ~24GB |
| Stage 2 | + Gradients | ~16GB |
| Stage 3 | + Parameters | ~8GB |
{
"bf16": { "enabled": true },
"zero_optimization": {
"stage": 3,
"offload_optimizer": { "device": "cpu", "pin_memory": true },
"offload_param": { "device": "cpu", "pin_memory": true },
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
Training Data Management
from datasets import Dataset, load_dataset
import hashlib
class TrainingDataManager:
    """Prepare, deduplicate and split text datasets for fine-tuning.

    The methods operate on Hugging Face ``datasets.Dataset`` objects whose
    rows carry the raw text in a single column ("text" by default).
    """

    def __init__(self, storage_path: str):
        """Remember where dataset artifacts are meant to be stored.

        Args:
            storage_path: Location associated with this manager. It is only
                recorded here; none of the methods below read it.
        """
        self.storage_path = storage_path

    def prepare_dataset(
        self,
        raw_data: list[dict],
        tokenizer,
        max_length: int = 2048,
        text_column: str = "text",
    ) -> "Dataset":
        """Tokenize raw records into fixed-length model inputs.

        Args:
            raw_data: Records to load; each must contain ``text_column``.
            tokenizer: Callable tokenizer applied to batches of texts.
            max_length: Length every example is truncated or padded to.
            text_column: Name of the column holding the raw text.

        Returns:
            A dataset holding only the tokenizer's outputs; the original
            columns are removed.
        """
        dataset = Dataset.from_list(raw_data)

        def tokenize_fn(examples):
            return tokenizer(
                examples[text_column],
                truncation=True,
                padding="max_length",
                max_length=max_length,
                return_overflowing_tokens=False,
            )

        return dataset.map(
            tokenize_fn,
            batched=True,
            remove_columns=dataset.column_names,
        )

    def deduplicate(self, dataset: "Dataset", text_column: str = "text") -> "Dataset":
        """Drop rows whose text exactly repeats an earlier row.

        Must run while ``text_column`` is still present, i.e. before
        ``prepare_dataset`` strips the original columns.

        Args:
            dataset: Dataset to filter.
            text_column: Name of the column compared for duplicates.

        Returns:
            The first occurrence of every distinct text, in original order.
        """
        seen_digests = set()
        unique_indices = []
        for index, example in enumerate(dataset):
            # The hash is only a compact fingerprint, so mark it as
            # non-security use; otherwise md5 is refused on FIPS-restricted
            # Python builds.
            digest = hashlib.md5(
                example[text_column].encode("utf-8"),
                usedforsecurity=False,
            ).digest()
            if digest not in seen_digests:
                seen_digests.add(digest)
                unique_indices.append(index)
        return dataset.select(unique_indices)

    def split_dataset(
        self,
        dataset: "Dataset",
        train_ratio: float = 0.9,
        seed: int = 42,
    ) -> "tuple[Dataset, Dataset]":
        """Split a dataset into train and evaluation parts.

        Args:
            dataset: Dataset to split.
            train_ratio: Fraction of rows assigned to the training part;
                must lie strictly between 0 and 1.
            seed: Seed passed to the split so that it is reproducible.

        Returns:
            A ``(train, test)`` pair of datasets.

        Raises:
            ValueError: If ``train_ratio`` is not strictly between 0 and 1.
        """
        if not 0.0 < train_ratio < 1.0:
            raise ValueError(
                f"train_ratio must be strictly between 0 and 1, got {train_ratio!r}"
            )
        split = dataset.train_test_split(test_size=1 - train_ratio, seed=seed)
        return split["train"], split["test"]
Hyperparameter Tuning
import optuna
def objective(trial):
    """Optuna objective: train a LoRA adapter and return its evaluation loss.

    Samples the learning rate, per-device batch size, warmup steps and LoRA
    rank from the trial, trains for a fixed number of steps and reports the
    loss on the validation set.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The ``eval_loss`` reported by the trainer after training.

    Reads the module-level ``base_model``, ``train_dataset`` and
    ``val_dataset``.
    """
    import copy

    lr = trial.suggest_float("lr", 1e-6, 1e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [4, 8, 16])
    warmup_steps = trial.suggest_int("warmup_steps", 100, 1000)
    lora_r = trial.suggest_categorical("lora_r", [8, 16, 32, 64])

    # Configure model
    lora_config = LoraConfig(
        r=lora_r,
        lora_alpha=2 * lora_r,
        task_type="CAUSAL_LM",
        # Add any further LoRA settings (target_modules, lora_dropout, ...) here.
    )
    # get_peft_model injects the adapter layers into the model it receives,
    # so every trial wraps its own copy; otherwise adapters from earlier
    # trials would pile up on the shared base_model. Reloading the base model
    # per trial is the alternative if holding a second copy is too costly.
    model = get_peft_model(copy.deepcopy(base_model), lora_config)

    # Train and evaluate
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            # One directory per trial so runs do not overwrite each other.
            output_dir=f"optuna_trial_{trial.number}",
            learning_rate=lr,
            per_device_train_batch_size=batch_size,
            warmup_steps=warmup_steps,
            max_steps=1000,
            # NOTE(review): newer transformers releases name this argument
            # eval_strategy — confirm against the installed version.
            evaluation_strategy="steps",
            eval_steps=100,
        ),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )
    trainer.train()
    return trainer.evaluate()["eval_loss"]
# Run 20 trials and keep the hyperparameters that minimize the value returned
# by objective.
study = optuna.create_study(direction="minimize")
study.optimize(func=objective, n_trials=20)
Checkpoint Management
| Strategy | Description | When to Use |
|---|---|---|
| Periodic | Save every N steps | Long training runs |
| Best-model | Save based on eval loss | Resource-constrained |
| Last-K | Keep only last K checkpoints | Storage limited |
| Full recovery | Save optimizer + scheduler state | Resumable training |
Effective fine-tuning operations require careful infrastructure planning, distributed training expertise, and systematic experiment tracking.