LLM Fine-Tuning Operations

Fine-Tuning in LLMOps

Fine-tuning adapts pre-trained LLMs to specific tasks or domains. It requires specialized infrastructure for training billion-parameter models and for managing training data.

Fine-Tuning Methods

Full Fine-Tuning

Updates all model parameters. Requires significant GPU memory and compute.

Definition: Full Fine-Tuning Memory

For a model with P parameters in FP16:

\text{Memory}_{full} = 2P \text{ (weights)} + 2P \text{ (gradients)} + 6P \text{ (Adam states)} = 10P \text{ bytes}

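For a 7B-parameter model this is ~70GB of GPU memory, not counting activations. With standard mixed-precision Adam, which keeps FP32 master weights, momentum, and variance (12P bytes of optimizer state), the total rises to 16P bytes, or ~112GB for 7B.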

LoRA (Low-Rank Adaptation)

Freezes original weights and trains low-rank decomposition matrices, reducing trainable parameters dramatically.

W' = W + \Delta W = W + BA

Where B ∈ ℝ^{d×r} and A ∈ ℝ^{r×k}, with rank r ≪ min(d, k).

import torch
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForCausalLM

# LoRA adapter configuration for a causal language model: rank-16 adapters on
# the four attention projections, with lm_head trained in full.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,                    # Rank of decomposition
    lora_alpha=32,           # Scaling factor
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    bias="none",
    modules_to_save=["lm_head"]  # Train these fully
)

# Load the base model in bfloat16; device_map="auto" leaves device placement
# to the library.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# Wrap the base model so that only the adapter weights (and the modules listed
# in modules_to_save) are trainable.
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()
# Prints a line of the form:
#   trainable params: <N> || all params: <M> || trainable%: <p>
# NOTE: because lm_head is listed in modules_to_save, N includes a full
# trainable copy of lm_head (32000 x 4096 = ~131M parameters for this model)
# on top of the ~16.8M LoRA weights (r=16 on 4 projections x 32 layers), so
# the trainable share is roughly 2%. Without modules_to_save it would be
# roughly 0.25%. Exact figures depend on the installed peft version.

QLoRA

Combines 4-bit quantization of the frozen base model with LoRA adapters, enabling fine-tuning of a 7B model on a single GPU.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 4-bit quantized base model: NF4 weights, bfloat16 compute, and double
# quantization of the quantization constants.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_config,
    device_map="auto"
)

# Prepare the quantized model for training before attaching adapters. PEFT's
# helper freezes the base weights, upcasts the normalization layers for
# numerical stability, and enables input gradients / gradient checkpointing
# so that gradients can flow to the adapters.
model = prepare_model_for_kbit_training(model)

# LoRA adapters on every attention and MLP projection (r=64, alpha=16).
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
# Total memory: ~10GB (vs ~70GB for full fine-tuning)

Distributed Training

Data Parallelism (DDP)

Each GPU holds a complete model copy; gradients are synchronized across GPUs.

import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def setup_ddp(rank, world_size):
    """Join the NCCL process group and bind this process to its GPU.

    Args:
        rank: Global rank of this process within the job.
        world_size: Total number of processes in the job.

    The rendezvous address is read from the environment
    (``init_method="env://"``). The CUDA device index is the node-local rank:
    ``LOCAL_RANK`` when the launcher exports it (``torchrun`` does), otherwise
    ``rank``, which is only correct for single-node jobs.
    """
    import os  # local import keeps this snippet self-contained

    dist.init_process_group(
        backend="nccl",
        init_method="env://",
        world_size=world_size,
        rank=rank
    )
    # On multi-node jobs the global rank exceeds the per-node GPU count, so it
    # cannot be used directly as a device index.
    local_rank = int(os.environ.get("LOCAL_RANK", rank))
    torch.cuda.set_device(local_rank)

def train_ddp(rank, world_size):
    """Fine-tune Llama-2-7B under DistributedDataParallel in one worker process.

    Args:
        rank: Global rank of this process; forwarded to ``setup_ddp``.
        world_size: Total number of processes; forwarded to ``setup_ddp``.

    Iterates over a ``dataloader`` that must be defined at module level; it is
    not created here. Each batch must be a mapping that can be unpacked into
    the model call and must include labels so that ``outputs.loss`` is set.
    """
    setup_ddp(rank, world_size)
    try:
        # setup_ddp has already selected this process's GPU; reuse that choice
        # rather than re-deriving the device index from the rank.
        device = torch.cuda.current_device()

        model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
        # DDP with device_ids requires the module's parameters to already live
        # on that device; from_pretrained loads them on the CPU.
        model.to(device)
        model = DDP(model, device_ids=[device])
        # from_pretrained returns the model in eval mode; switch to training.
        model.train()

        optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

        for _epoch in range(3):
            for batch in dataloader:
                # Inputs must be on the same device as the model replica.
                batch = {
                    key: value.to(device) if torch.is_tensor(value) else value
                    for key, value in batch.items()
                }
                outputs = model(**batch)
                loss = outputs.loss
                loss.backward()  # Gradients synced by DDP
                optimizer.step()
                optimizer.zero_grad()
    finally:
        # Release NCCL resources even if training raises.
        dist.destroy_process_group()

DeepSpeed ZeRO

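Partitions optimizer states, gradients, and optionally parameters across GPUs, so per-GPU memory shrinks as more GPUs are added. The per-GPU figures below are approximate and depend on the number of GPUs.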

ZeRO Stage	Partitions	Memory per GPU (7B, FP16)
Stage 1	Optimizer states	~24GB
Stage 2	+ Gradients	~16GB
Stage 3	+ Parameters	~8GB

{
    "bf16": { "enabled": true },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": { "device": "cpu", "pin_memory": true },
        "offload_param": { "device": "cpu", "pin_memory": true },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 100,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}

Training Data Management

from datasets import Dataset, load_dataset
import hashlib

class TrainingDataManager:
    """Prepare text datasets for fine-tuning: tokenize, deduplicate, and split.

    Deduplication reads the raw text column, so it must run before
    ``prepare_dataset``, which drops the original columns.
    """

    def __init__(self, storage_path: str):
        """Store the storage location.

        Args:
            storage_path: Path kept on the instance; none of the methods
                defined here read it.
        """
        self.storage_path = storage_path

    def prepare_dataset(
        self,
        raw_data: list[dict],
        tokenizer,
        text_column: str = "text",
        max_length: int = 2048,
    ) -> Dataset:
        """Tokenize raw records into a fixed-length dataset.

        Args:
            raw_data: Records, each containing ``text_column``.
            tokenizer: Callable accepting a batch of strings plus the
                ``truncation``/``padding``/``max_length`` keywords
                (presumably a Hugging Face tokenizer; verify against caller).
            text_column: Name of the field holding the text to tokenize.
            max_length: Length every example is truncated or padded to.

        Returns:
            A dataset containing only the tokenizer's output columns; all
            original columns are removed.
        """
        dataset = Dataset.from_list(raw_data)

        def tokenize_fn(examples):
            return tokenizer(
                examples[text_column],
                truncation=True,
                padding="max_length",
                max_length=max_length,
                return_overflowing_tokens=False,
            )

        return dataset.map(
            tokenize_fn,
            batched=True,
            remove_columns=dataset.column_names,
        )

    def deduplicate(self, dataset: Dataset, text_column: str = "text") -> Dataset:
        """Drop exact-duplicate texts, keeping the first occurrence of each.

        Args:
            dataset: Dataset whose rows contain ``text_column``.
            text_column: Name of the field compared for equality.

        Returns:
            ``dataset.select(...)`` over the indices of the first occurrence
            of every distinct text, in original order.
        """
        seen_hashes: set[bytes] = set()
        unique_indices: list[int] = []
        for i, example in enumerate(dataset):
            # md5 is only a compact fingerprint here, not a security measure;
            # flagging it as such keeps it usable on FIPS-restricted builds.
            digest = hashlib.md5(
                example[text_column].encode(), usedforsecurity=False
            ).digest()
            if digest not in seen_hashes:
                seen_hashes.add(digest)
                unique_indices.append(i)
        return dataset.select(unique_indices)

    def split_dataset(
        self, dataset: Dataset, train_ratio: float = 0.9
    ) -> tuple[Dataset, Dataset]:
        """Split into train and held-out subsets with a fixed seed (42).

        Args:
            dataset: Dataset to split.
            train_ratio: Fraction of rows assigned to the training split;
                must lie strictly between 0 and 1.

        Returns:
            ``(train, test)`` datasets.

        Raises:
            ValueError: If ``train_ratio`` is not strictly between 0 and 1.
        """
        if not 0.0 < train_ratio < 1.0:
            raise ValueError(
                f"train_ratio must be strictly between 0 and 1, got {train_ratio!r}"
            )
        split = dataset.train_test_split(test_size=1 - train_ratio, seed=42)
        return split["train"], split["test"]

Hyperparameter Tuning

import optuna

def objective(trial):
    """Optuna objective: train a LoRA adapter briefly and return its eval loss.

    Samples the learning rate, per-device batch size, warmup steps, and LoRA
    rank from the trial, trains for 1000 steps, and returns ``eval_loss``.

    Args:
        trial: The Optuna trial supplying the hyperparameter suggestions.

    Returns:
        The ``eval_loss`` value reported by ``trainer.evaluate()``.

    Relies on ``base_model``, ``train_dataset`` and ``val_dataset`` being
    defined at module level; they are not created here.
    """
    # Local import keeps this snippet self-contained.
    from transformers import Trainer, TrainingArguments

    lr = trial.suggest_float("lr", 1e-6, 1e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [4, 8, 16])
    warmup_steps = trial.suggest_int("warmup_steps", 100, 1000)
    lora_r = trial.suggest_categorical("lora_r", [8, 16, 32, 64])

    # Configure model. Only the rank (and alpha = 2 * rank) is tuned; the
    # remaining settings mirror the LoRA example configuration.
    lora_config = LoraConfig(
        r=lora_r,
        lora_alpha=2 * lora_r,
        lora_dropout=0.1,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )
    # NOTE(review): get_peft_model appears to inject adapters into base_model
    # in place; confirm base_model is reloaded (or the adapters removed)
    # between trials so that trials do not stack adapters.
    model = get_peft_model(base_model, lora_config)

    # Train and evaluate
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            # One directory per trial so checkpoints do not overwrite each other.
            output_dir=f"optuna_trials/trial_{trial.number}",
            learning_rate=lr,
            per_device_train_batch_size=batch_size,
            warmup_steps=warmup_steps,
            max_steps=1000,
            # NOTE(review): newer transformers releases rename this argument
            # to eval_strategy; confirm against the installed version.
            evaluation_strategy="steps",
            eval_steps=100,
        ),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )
    trainer.train()
    return trainer.evaluate()["eval_loss"]

# Run 20 trials, minimizing the eval loss returned by objective().
study = optuna.create_study(
    direction="minimize",
)
study.optimize(
    objective,
    n_trials=20,
)

Checkpoint Management

Strategy	Description	When to Use
Periodic	Save every N steps	Long training runs
Best-model	Save based on eval loss	Resource-constrained
Last-K	Keep only last K checkpoints	Storage limited
Full recovery	Save optimizer + scheduler state	Resumable training

Effective fine-tuning operations require careful infrastructure planning, distributed training expertise, and systematic experiment tracking.