Advanced Computer Vision

Computer vision powers autonomous vehicles, medical imaging, and quality inspection. Learn modern detection, segmentation, and transformer-based approaches.

Object Detection Pipeline

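$$\text{IoU} = \frac{|B_{\text{pred}} \cap B_{\text{gt}}|}{|B_{\text{pred}} \cup B_{\text{gt}}|}$$

Here $B_{\text{pred}}$ is the predicted bounding box and $B_{\text{gt}}$ is the ground-truth box; IoU is the area of their overlap divided by the area of their union.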

YOLO-style Detection

import torch
import torch.nn as nn

class ConvBlock(nn.Module):
    """Convolution -> batch normalisation -> SiLU activation.

    The convolution is built without a bias term; the batch-norm layer that
    follows it has its own learnable shift.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the convolution.
        kernel_size: Convolution kernel size.
        stride: Convolution stride.
        padding: Zero padding applied by the convolution.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            bias=False,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = nn.SiLU()

    def forward(self, x):
        """Return ``act(bn(conv(x)))``."""
        convolved = self.conv(x)
        normalised = self.bn(convolved)
        return self.act(normalised)

class DetectionHead(nn.Module):
    """Per-scale prediction head producing one vector per anchor and grid cell.

    Two ``ConvBlock`` layers (256 then 512 channels) are followed by a 1x1
    convolution that emits ``num_anchors * (5 + num_classes)`` channels.
    The leading 5 values are presumably 4 box coordinates plus 1 confidence
    score -- TODO confirm against the loss/decoding code.

    Args:
        in_channels: Channels of the incoming feature map.
        num_classes: Number of object classes.
        num_anchors: Number of anchors predicted per grid cell.
    """

    def __init__(self, in_channels, num_classes, num_anchors=3):
        super().__init__()
        self.num_anchors = num_anchors
        self.num_classes = num_classes

        prediction_channels = num_anchors * (5 + num_classes)
        self.conv = nn.Sequential(
            ConvBlock(in_channels, 256),
            ConvBlock(256, 512),
            nn.Conv2d(512, prediction_channels, 1),
        )

    def forward(self, x):
        """Return predictions shaped (batch, num_anchors, grid_h, grid_w, 5 + num_classes)."""
        batch_size, _channels, grid_h, grid_w = x.size()
        values_per_anchor = 5 + self.num_classes

        raw = self.conv(x)

        # Split the channel axis into (anchor, value) and then move the value
        # axis last so each grid cell's prediction vector is contiguous.
        per_anchor = raw.view(
            batch_size, self.num_anchors, values_per_anchor, grid_h, grid_w
        )
        return per_anchor.permute(0, 1, 3, 4, 2).contiguous()

class SimpleYOLO(nn.Module):
    """Minimal YOLO-style detector with three detection heads.

    The backbone is a stem ``ConvBlock`` followed by four stages; every stage
    starts with a stride-2 ``ConvBlock``, so the stage outputs have 64, 128,
    256 and 512 channels at 1/2, 1/4, 1/8 and 1/16 of the input resolution.

    Each head is fed the backbone feature map whose channel count matches the
    head's ``in_channels``:

    * ``head_small``  <- 512-channel map (coarsest grid)
    * ``head_medium`` <- 256-channel map
    * ``head_large``  <- 128-channel map (finest grid)

    Args:
        num_classes: Number of object classes predicted by every head.
    """

    def __init__(self, num_classes=80):
        super().__init__()
        self.num_classes = num_classes

        # Backbone (simplified CSPDarknet)
        self.backbone = nn.Sequential(
            ConvBlock(3, 32, 3, 1, 1),
            self._make_stage(32, 64, 2),
            self._make_stage(64, 128, 2),
            self._make_stage(128, 256, 2),
            self._make_stage(256, 512, 2),
        )

        # Detection heads at different scales
        self.head_small = DetectionHead(512, num_classes)
        self.head_medium = DetectionHead(256, num_classes)
        self.head_large = DetectionHead(128, num_classes)

    def _make_stage(self, in_ch, out_ch, num_blocks):
        """Build a stage: one stride-2 ``ConvBlock`` then ``num_blocks`` stride-1 blocks."""
        layers = [ConvBlock(in_ch, out_ch, 3, 2, 1)]
        layers.extend(ConvBlock(out_ch, out_ch) for _ in range(num_blocks))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Run the backbone and return ``[pred_small, pred_medium, pred_large]``.

        The backbone is stepped stage by stage so that the intermediate
        128- and 256-channel maps can be tapped. Passing only the final
        512-channel map to every head (as a plain ``self.backbone(x)`` call
        would) fails with a channel mismatch in ``head_medium`` and
        ``head_large``.
        """
        stem, stage1, stage2, stage3, stage4 = self.backbone

        feat_128 = stage2(stage1(stem(x)))  # 1/4 resolution
        feat_256 = stage3(feat_128)         # 1/8 resolution
        feat_512 = stage4(feat_256)         # 1/16 resolution

        # Multi-scale predictions
        pred_small = self.head_small(feat_512)
        pred_medium = self.head_medium(feat_256)
        pred_large = self.head_large(feat_128)

        return [pred_small, pred_medium, pred_large]

# Non-maximum suppression for post-processing
def non_max_suppression(predictions, conf_threshold=0.5, iou_threshold=0.45):
    """Apply greedy NMS to remove duplicate detections.

    Args:
        predictions: 2-D tensor with one detection per row. Columns 0-3 are
            read as ``(center_x, center_y, width, height)`` and column 4 as
            the confidence score; any further columns are ignored.
        conf_threshold: Detections whose confidence is not strictly greater
            than this value are discarded before suppression.
        iou_threshold: A detection is suppressed when its IoU with an
            already-kept, higher-scoring detection exceeds this value.

    Returns:
        A list of Python ints: row indices into ``predictions`` of the
        detections that survive, ordered by decreasing confidence. The list
        is empty when nothing passes ``conf_threshold``.
    """
    # Remember which rows of `predictions` survive the confidence filter so
    # the returned indices are meaningful to the caller.
    candidates = torch.nonzero(predictions[:, 4] > conf_threshold).flatten()
    if candidates.numel() == 0:
        return []

    boxes = predictions[candidates]

    # Convert (cx, cy, w, h) to corner coordinates.
    half_w = boxes[:, 2] / 2
    half_h = boxes[:, 3] / 2
    x1 = boxes[:, 0] - half_w
    y1 = boxes[:, 1] - half_h
    x2 = boxes[:, 0] + half_w
    y2 = boxes[:, 1] + half_h

    areas = (x2 - x1) * (y2 - y1)

    # Tensors do not support negative-step slicing, so ask argsort for
    # descending order directly instead of reversing with [::-1].
    order = boxes[:, 4].argsort(descending=True)
    keep = []

    while order.numel() > 0:
        best, rest = order[0], order[1:]
        keep.append(int(candidates[best]))
        if rest.numel() == 0:
            break

        inter_w = torch.clamp(
            torch.minimum(x2[best], x2[rest]) - torch.maximum(x1[best], x1[rest]),
            min=0,
        )
        inter_h = torch.clamp(
            torch.minimum(y2[best], y2[rest]) - torch.maximum(y1[best], y1[rest]),
            min=0,
        )
        intersection = inter_w * inter_h

        # Clamp the union so zero-area boxes give IoU 0 instead of NaN.
        union = torch.clamp(areas[best] + areas[rest] - intersection, min=1e-12)
        iou = intersection / union

        # Keep boxes with IoU less than or equal to the threshold
        order = rest[iou <= iou_threshold]

    return keep

Image Segmentation

import torch
import torch.nn as nn
import torch.nn.functional as F

class UNet(nn.Module):
    """U-Net for semantic segmentation.

    Four encoder blocks (64, 128, 256, 512 channels) separated by 2x2 max
    pooling lead into a 1024-channel bottleneck. Four decoder steps then
    upsample with a stride-2 transposed convolution, concatenate the matching
    encoder output along the channel axis, and refine with a conv block.
    A final 1x1 convolution maps the 64 channels to ``num_classes``.

    Args:
        in_channels: Channels of the input image.
        num_classes: Number of output channels (one per class).
    """

    def __init__(self, in_channels=3, num_classes=21):
        super().__init__()

        # Encoder
        self.enc1 = self._conv_block(in_channels, 64)
        self.enc2 = self._conv_block(64, 128)
        self.enc3 = self._conv_block(128, 256)
        self.enc4 = self._conv_block(256, 512)

        self.pool = nn.MaxPool2d(2)

        # Bottleneck
        self.bottleneck = self._conv_block(512, 1024)

        # Decoder: each conv block takes twice the upsampled channel count
        # because the encoder skip tensor is concatenated before it.
        self.up4 = nn.ConvTranspose2d(1024, 512, kernel_size=2, stride=2)
        self.dec4 = self._conv_block(1024, 512)

        self.up3 = nn.ConvTranspose2d(512, 256, kernel_size=2, stride=2)
        self.dec3 = self._conv_block(512, 256)

        self.up2 = nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2)
        self.dec2 = self._conv_block(256, 128)

        self.up1 = nn.ConvTranspose2d(128, 64, kernel_size=2, stride=2)
        self.dec1 = self._conv_block(128, 64)

        # Output
        self.output = nn.Conv2d(64, num_classes, kernel_size=1)

    def _conv_block(self, in_ch, out_ch):
        """Return two (3x3 conv -> batch norm -> ReLU) units as one ``nn.Sequential``."""
        layers = []
        for source_ch in (in_ch, out_ch):
            layers.append(nn.Conv2d(source_ch, out_ch, 3, padding=1))
            layers.append(nn.BatchNorm2d(out_ch))
            layers.append(nn.ReLU(inplace=True))
        return nn.Sequential(*layers)

    def _pad_to_match(self, x, target):
        """Pad the last two dimensions of ``x`` to the size of ``target``'s.

        The size difference is split between both sides; when it is odd the
        extra row/column goes to the bottom/right.
        """
        extra_rows = target.size(2) - x.size(2)
        extra_cols = target.size(3) - x.size(3)

        left = extra_cols // 2
        right = extra_cols - left
        top = extra_rows // 2
        bottom = extra_rows - top

        # F.pad lists padding for the last dimension first.
        return F.pad(x, [left, right, top, bottom])

    def forward(self, x):
        """Return per-pixel class scores with ``num_classes`` channels."""
        # Encoder: remember each block's output for the skip connections and
        # pool before moving one level deeper.
        skips = []
        current = x
        for encoder in (self.enc1, self.enc2, self.enc3, self.enc4):
            current = encoder(current)
            skips.append(current)
            current = self.pool(current)

        # Bottleneck
        current = self.bottleneck(current)

        # Decoder with skip connections, deepest level first.
        decoder_steps = (
            (self.up4, self.dec4),
            (self.up3, self.dec3),
            (self.up2, self.dec2),
            (self.up1, self.dec1),
        )
        for (upsample, refine), skip in zip(decoder_steps, reversed(skips)):
            current = upsample(current)
            current = self._pad_to_match(current, skip)
            current = refine(torch.cat([current, skip], dim=1))

        return self.output(current)

# Segmentation metrics
def dice_coefficient(pred, target, smooth=1e-5):
    """Return the Dice coefficient between ``pred`` and ``target``.

    Both tensors are flattened and compared element-wise, so they must hold
    the same number of elements. The result is
    ``(2 * sum(pred * target) + smooth) / (sum(pred) + sum(target) + smooth)``.

    Args:
        pred: Prediction tensor (e.g. a binary mask or probabilities).
        target: Ground-truth tensor with the same number of elements.
        smooth: Added to numerator and denominator so the score is defined
            (and equals 1) when both tensors are all zeros.

    Returns:
        A 0-dimensional tensor holding the Dice score.
    """
    # reshape() instead of view(): view() raises on non-contiguous inputs
    # such as a channel slice `probs[:, 1]` or a permuted tensor.
    pred_flat = pred.reshape(-1)
    target_flat = target.reshape(-1)

    intersection = (pred_flat * target_flat).sum()
    return (2. * intersection + smooth) / (pred_flat.sum() + target_flat.sum() + smooth)

def iou_score(pred, target, smooth=1e-5):
    """Return the intersection-over-union (Jaccard) score of ``pred`` and ``target``.

    Both tensors are flattened and compared element-wise, so they must hold
    the same number of elements. With ``I = sum(pred * target)`` and
    ``U = sum(pred) + sum(target) - I`` the result is
    ``(I + smooth) / (U + smooth)``.

    Args:
        pred: Prediction tensor (e.g. a binary mask or probabilities).
        target: Ground-truth tensor with the same number of elements.
        smooth: Added to numerator and denominator so the score is defined
            (and equals 1) when both tensors are all zeros.

    Returns:
        A 0-dimensional tensor holding the IoU score.
    """
    # reshape() instead of view(): view() raises on non-contiguous inputs
    # such as a channel slice `probs[:, 1]` or a permuted tensor.
    pred_flat = pred.reshape(-1)
    target_flat = target.reshape(-1)

    intersection = (pred_flat * target_flat).sum()
    union = pred_flat.sum() + target_flat.sum() - intersection

    return (intersection + smooth) / (union + smooth)

Data Augmentation

import torchvision.transforms as T
import numpy as np
from PIL import Image

class AdvancedAugmentation:
    """Bundle of torchvision pipelines for training and validation images.

    The training pipeline applies a random resized crop, horizontal flip,
    colour jitter, random grayscale and Gaussian blur, converts to a tensor,
    normalises, and finally applies random erasing. The validation pipeline
    resizes to ``image_size + 32``, centre-crops to ``image_size``, converts
    to a tensor and normalises with the same mean/std.

    Args:
        image_size: Output size passed to the crop transforms.
    """

    def __init__(self, image_size=640):
        self.image_size = image_size

        train_steps = [
            T.RandomResizedCrop(image_size, scale=(0.5, 1.0)),
            T.RandomHorizontalFlip(p=0.5),
            T.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),
            T.RandomGrayscale(p=0.2),
            T.GaussianBlur(kernel_size=3, sigma=(0.1, 2.0)),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            # Random erasing runs last because it operates on the tensor.
            T.RandomErasing(p=0.25),
        ]
        self.train_transform = T.Compose(train_steps)

        val_steps = [
            T.Resize(image_size + 32),
            T.CenterCrop(image_size),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
        self.val_transform = T.Compose(val_steps)

    def __call__(self, image, is_training=True):
        """Transform ``image`` with the training or the validation pipeline."""
        pipeline = self.train_transform if is_training else self.val_transform
        return pipeline(image)

# CutMix and MixUp
class Mixup:
    """Blend every image in a batch with a randomly chosen partner image.

    Args:
        alpha: Both shape parameters of the Beta distribution from which the
            mixing weight is drawn.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def __call__(self, images, targets):
        """Mix a batch.

        Returns:
            ``(mixed_images, targets, partner_targets, lam)`` where
            ``mixed_images = lam * images + (1 - lam) * images[perm]`` and
            ``partner_targets = targets[perm]`` for one random permutation
            ``perm`` of the batch.
        """
        lam = np.random.beta(self.alpha, self.alpha)
        partner = torch.randperm(images.size(0))

        partner_images = images[partner]
        partner_targets = targets[partner]
        blended = lam * images + (1 - lam) * partner_images

        return blended, targets, partner_targets, lam

class CutMix:
    """Paste a random rectangle from a partner image into every batch image.

    Args:
        alpha: Both shape parameters of the Beta distribution from which the
            initial mixing weight is drawn.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def _rand_bbox(self, size, lam):
        """Pick a random rectangle covering roughly ``1 - lam`` of the image.

        ``size[2]`` and ``size[3]`` are the extents of the two spatial
        dimensions. The rectangle is centred on a uniformly drawn point and
        clipped to the image, so it may cover less than ``1 - lam``.

        Returns:
            ``(lo2, lo3, hi2, hi3)``: start/stop bounds along dimension 2 and
            dimension 3 respectively.
        """
        extent2, extent3 = size[2], size[3]
        side_ratio = np.sqrt(1. - lam)
        half2 = int(extent2 * side_ratio) // 2
        half3 = int(extent3 * side_ratio) // 2

        centre2 = np.random.randint(extent2)
        centre3 = np.random.randint(extent3)

        lo2 = np.clip(centre2 - half2, 0, extent2)
        lo3 = np.clip(centre3 - half3, 0, extent3)
        hi2 = np.clip(centre2 + half2, 0, extent2)
        hi3 = np.clip(centre3 + half3, 0, extent3)

        return lo2, lo3, hi2, hi3

    def __call__(self, images, targets):
        """Apply CutMix to a batch, modifying ``images`` in place.

        Returns:
            ``(images, targets, partner_targets, lam)`` where ``lam`` is the
            fraction of each image left untouched, recomputed from the
            clipped rectangle that was actually pasted.
        """
        lam = np.random.beta(self.alpha, self.alpha)
        partner = torch.randperm(images.size(0))

        lo2, lo3, hi2, hi3 = self._rand_bbox(images.size(), lam)
        images[:, :, lo2:hi2, lo3:hi3] = images[partner, :, lo2:hi2, lo3:hi3]

        # Clipping can shrink the rectangle, so derive lam from its real area.
        pasted_area = (hi2 - lo2) * (hi3 - lo3)
        image_area = images.size(-1) * images.size(-2)
        lam = 1 - (pasted_area / image_area)

        return images, targets, targets[partner], lam

Vision Transformers (ViT)

import torch
import torch.nn as nn

class PatchEmbedding(nn.Module):
    """Split an image into patches, embed them, and prepend a CLS token.

    A convolution whose kernel size and stride both equal ``patch_size``
    projects every non-overlapping patch to ``embed_dim`` channels. A learned
    CLS token is prepended and a learned position embedding of length
    ``num_patches + 1`` is added.

    Args:
        img_size: Side length of the (square) input image used to compute
            ``num_patches``.
        patch_size: Side length of each square patch.
        in_channels: Channels of the input image.
        embed_dim: Size of each token embedding.

    Attributes:
        img_size, patch_size, embed_dim: The constructor arguments, kept so
            that wrapping models can size additional tokens and heads.
        num_patches: ``(img_size // patch_size) ** 2``.
    """

    def __init__(self, img_size=224, patch_size=16, in_channels=3, embed_dim=768):
        super().__init__()
        # Plain attributes (not parameters/buffers), so state_dict is unchanged.
        self.img_size = img_size
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.num_patches = (img_size // patch_size) ** 2

        self.proj = nn.Conv2d(in_channels, embed_dim,
                              kernel_size=patch_size, stride=patch_size)
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        self.position_embedding = nn.Parameter(torch.randn(1, self.num_patches + 1, embed_dim))

    def forward(self, x):
        """Return token embeddings of shape (B, num_patches + 1, embed_dim).

        The input must produce exactly ``num_patches`` patches; otherwise the
        addition of the position embedding fails to broadcast.
        """
        B = x.size(0)

        x = self.proj(x)  # (B, embed_dim, H/P, W/P)
        x = x.flatten(2).transpose(1, 2)  # (B, num_patches, embed_dim)

        # Share the single learned CLS token across the batch.
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat([cls_tokens, x], dim=1)

        x = x + self.position_embedding

        return x

class VisionTransformer(nn.Module):
    """Vision Transformer classifier built on ``nn.TransformerEncoder``.

    The image is tokenised by ``PatchEmbedding``, passed through ``depth``
    encoder layers, layer-normalised, and the first (CLS) token is mapped to
    class logits by a linear head.

    Args:
        img_size: Side length of the input image.
        patch_size: Side length of each patch.
        in_channels: Channels of the input image.
        num_classes: Size of the classification output.
        embed_dim: Token embedding size (``d_model``).
        depth: Number of encoder layers.
        num_heads: Attention heads per encoder layer.

    Attributes:
        num_classes, embed_dim: The constructor arguments, kept so that
            subclasses can size extra tokens and heads.
    """

    def __init__(self, img_size=224, patch_size=16, in_channels=3,
                 num_classes=1000, embed_dim=768, depth=12, num_heads=12):
        super().__init__()

        # Stored for subclasses (e.g. a distillation variant) that need the
        # model dimensions after construction.
        self.num_classes = num_classes
        self.embed_dim = embed_dim

        self.patch_embedding = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim * 4,
            dropout=0.1,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=depth)

        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return class logits of shape (B, num_classes)."""
        x = self.patch_embedding(x)
        x = self.transformer(x)
        x = self.norm(x)

        # Use CLS token for classification
        cls_token = x[:, 0]
        return self.head(cls_token)

# Data-efficient ViT (DeiT) with distillation
class DeiT(VisionTransformer):
    """Vision Transformer with an extra distillation token and head.

    Accepts the same constructor arguments as ``VisionTransformer``. A learned
    distillation token is inserted right after the CLS token, and a second
    linear head produces logits from it.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Derive the sizes from the classification head built by the parent:
        # neither the parent nor the patch embedding is guaranteed to expose
        # `num_classes` / `embed_dim` attributes, whereas nn.Linear always
        # records its in/out feature counts.
        embed_dim = self.head.in_features
        num_classes = self.head.out_features

        self.dist_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        self.head_dist = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return ``(cls_logits, dist_logits)``, each of shape (B, num_classes)."""
        x = self.patch_embedding(x)

        B = x.size(0)
        dist_tokens = self.dist_token.expand(B, -1, -1)
        # Sequence becomes [CLS, DIST, patch tokens...]. The distillation
        # token is inserted after the position embedding has been added, so
        # it carries no positional term of its own.
        x = torch.cat([x[:, :1], dist_tokens, x[:, 1:]], dim=1)

        x = self.transformer(x)
        x = self.norm(x)

        cls_logits = self.head(x[:, 0])
        dist_logits = self.head_dist(x[:, 1])

        return cls_logits, dist_logits

Best Practices

Use pre-trained backbones for transfer learning
Multi-scale training improves robustness to object sizes
Data augmentation is critical – mixup, cutmix, mosaic
Test-time augmentation (TTA) improves accuracy at the cost of slower inference
Tuning the NMS IoU threshold shifts the precision-recall trade-off