Advanced Computer Vision
Computer vision powers autonomous vehicles, medical imaging, and quality inspection. Learn modern detection, segmentation, and transformer-based approaches.
Object Detection Pipeline
YOLO-style Detection
import torch
import torch.nn as nn
class ConvBlock(nn.Module):
    """Conv2d -> BatchNorm2d -> SiLU building block.

    Args:
        in_channels: channels of the incoming feature map.
        out_channels: channels produced by the convolution.
        kernel_size: convolution kernel size (default 3).
        stride: convolution stride (default 1).
        padding: zero padding applied on each side (default 1).
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super().__init__()
        # The convolution is created without a bias term; it is always
        # followed by the batch-norm layer below.
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            bias=False,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = nn.SiLU()

    def forward(self, x):
        """Run convolution, normalisation and activation in sequence."""
        convolved = self.conv(x)
        normalised = self.bn(convolved)
        return self.act(normalised)
class DetectionHead(nn.Module):
    """Per-scale prediction head producing one vector per anchor and grid cell.

    Args:
        in_channels: channels of the feature map fed to this head.
        num_classes: number of object classes.
        num_anchors: anchors predicted per grid cell (default 3).

    Each anchor gets ``5 + num_classes`` output values (presumably four box
    terms plus one confidence score, followed by class scores -- confirm
    against the loss / decoding code).
    """

    def __init__(self, in_channels, num_classes, num_anchors=3):
        super().__init__()
        self.num_anchors = num_anchors
        self.num_classes = num_classes

        values_per_anchor = 5 + num_classes
        widen = ConvBlock(in_channels, 256)
        refine = ConvBlock(256, 512)
        # 1x1 convolution that emits every anchor's values for each cell.
        predict = nn.Conv2d(512, num_anchors * values_per_anchor, 1)
        self.conv = nn.Sequential(widen, refine, predict)

    def forward(self, x):
        """Return predictions shaped (batch, num_anchors, grid_h, grid_w, 5 + num_classes)."""
        batch_size, _channels, grid_h, grid_w = x.shape
        raw = self.conv(x)
        # Split the channel axis into (anchor, value) ...
        per_anchor = raw.view(
            batch_size, self.num_anchors, 5 + self.num_classes, grid_h, grid_w
        )
        # ... then move the per-anchor values to the last axis.
        return per_anchor.permute(0, 1, 3, 4, 2).contiguous()
class SimpleYOLO(nn.Module):
    """Minimal YOLO-style detector with a convolutional backbone and three heads.

    The backbone is a stem followed by four stride-2 stages producing
    64, 128, 256 and 512 channels. Each detection head is attached to the
    stage whose channel count matches the head's ``in_channels``:

    * ``head_large``  <- 128-channel stage (input downsampled by 8)
    * ``head_medium`` <- 256-channel stage (input downsampled by 16)
    * ``head_small``  <- 512-channel stage (input downsampled by 32)

    Args:
        num_classes: number of object classes (default 80).
    """

    def __init__(self, num_classes=80):
        super().__init__()
        self.num_classes = num_classes
        # Backbone (simplified CSPDarknet)
        self.backbone = nn.Sequential(
            ConvBlock(3, 32, 3, 1, 1),
            self._make_stage(32, 64, 2),
            self._make_stage(64, 128, 2),
            self._make_stage(128, 256, 2),
            self._make_stage(256, 512, 2),
        )
        # Detection heads at different scales
        self.head_small = DetectionHead(512, num_classes)
        self.head_medium = DetectionHead(256, num_classes)
        self.head_large = DetectionHead(128, num_classes)

    def _make_stage(self, in_ch, out_ch, num_blocks):
        """Build one stage: a stride-2 ConvBlock followed by ``num_blocks`` stride-1 ConvBlocks."""
        layers = [ConvBlock(in_ch, out_ch, 3, 2, 1)]
        layers.extend(ConvBlock(out_ch, out_ch) for _ in range(num_blocks))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return ``[pred_small, pred_medium, pred_large]``.

        The backbone is run stage by stage so the intermediate 128- and
        256-channel feature maps can be tapped. Feeding the final
        512-channel map to every head would fail, because ``head_medium``
        and ``head_large`` are built for 256 and 128 input channels.
        """
        stem, stage_64, stage_128, stage_256, stage_512 = self.backbone
        shallow = stage_64(stem(x))
        feat_128 = stage_128(shallow)
        feat_256 = stage_256(feat_128)
        feat_512 = stage_512(feat_256)

        # Multi-scale predictions, in the same order as before.
        pred_small = self.head_small(feat_512)
        pred_medium = self.head_medium(feat_256)
        pred_large = self.head_large(feat_128)
        return [pred_small, pred_medium, pred_large]
# Non-maximum suppression for post-processing
def non_max_suppression(predictions, conf_threshold=0.5, iou_threshold=0.45):
    """Apply greedy NMS to remove duplicate detections.

    Args:
        predictions: 2-D tensor with one detection per row. Columns 0-3 are
            read as (center_x, center_y, width, height) and column 4 as the
            confidence score; any further columns are ignored.
        conf_threshold: detections whose confidence is not strictly greater
            than this value are discarded before suppression.
        iou_threshold: a detection is suppressed when its IoU with an
            already-kept, higher-scoring detection exceeds this value.

    Returns:
        A list of int row indices into ``predictions`` for the detections
        that survive, ordered by descending confidence. Returns ``[]`` when
        nothing passes ``conf_threshold``.
    """
    # Remember which rows of ``predictions`` pass the confidence filter so
    # the result can be expressed in the caller's indexing.
    candidates = torch.where(predictions[:, 4] > conf_threshold)[0]
    if candidates.numel() == 0:
        return []
    boxes = predictions[candidates]

    # Convert center/size boxes to corner coordinates.
    x1 = boxes[:, 0] - boxes[:, 2] / 2
    y1 = boxes[:, 1] - boxes[:, 3] / 2
    x2 = boxes[:, 0] + boxes[:, 2] / 2
    y2 = boxes[:, 1] + boxes[:, 3] / 2
    areas = (x2 - x1) * (y2 - y1)
    scores = boxes[:, 4]

    # Sort by confidence, highest first. Tensors do not support negative-step
    # slicing, so ``argsort()[::-1]`` cannot be used here.
    order = scores.argsort(descending=True)

    keep = []
    while order.numel() > 0:
        best, rest = order[0], order[1:]
        keep.append(int(candidates[best]))
        if rest.numel() == 0:
            break

        # Overlap of the best box with every remaining box.
        xx1 = torch.max(x1[best], x1[rest])
        yy1 = torch.max(y1[best], y1[rest])
        xx2 = torch.min(x2[best], x2[rest])
        yy2 = torch.min(y2[best], y2[rest])
        w = torch.clamp(xx2 - xx1, min=0)
        h = torch.clamp(yy2 - yy1, min=0)
        intersection = w * h

        # Clamp the union so zero-area boxes give IoU 0 rather than NaN.
        union = areas[best] + areas[rest] - intersection
        iou = intersection / union.clamp(min=1e-9)

        # Keep boxes with IoU less than or equal to the threshold.
        order = rest[iou <= iou_threshold]
    return keep
Image Segmentation
import torch
import torch.nn as nn
import torch.nn.functional as F
class UNet(nn.Module):
    """U-Net for semantic segmentation.

    Four encoder levels (64/128/256/512 channels) separated by 2x max
    pooling, a 1024-channel bottleneck, and four decoder levels that
    upsample with transposed convolutions and concatenate the matching
    encoder output.

    Args:
        in_channels: channels of the input image (default 3).
        num_classes: channels of the output map (default 21).
    """

    def __init__(self, in_channels=3, num_classes=21):
        super().__init__()
        make_block = self._conv_block

        # Encoder
        self.enc1 = make_block(in_channels, 64)
        self.enc2 = make_block(64, 128)
        self.enc3 = make_block(128, 256)
        self.enc4 = make_block(256, 512)
        self.pool = nn.MaxPool2d(2)

        # Bottleneck
        self.bottleneck = make_block(512, 1024)

        # Decoder: every level halves the channels while upsampling, then
        # fuses the concatenated (upsampled + skip) tensor.
        self.up4 = nn.ConvTranspose2d(1024, 512, kernel_size=2, stride=2)
        self.dec4 = make_block(1024, 512)
        self.up3 = nn.ConvTranspose2d(512, 256, kernel_size=2, stride=2)
        self.dec3 = make_block(512, 256)
        self.up2 = nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2)
        self.dec2 = make_block(256, 128)
        self.up1 = nn.ConvTranspose2d(128, 64, kernel_size=2, stride=2)
        self.dec1 = make_block(128, 64)

        # Output
        self.output = nn.Conv2d(64, num_classes, 1)

    def _conv_block(self, in_ch, out_ch):
        """Return two successive 3x3 conv -> batch norm -> ReLU layers."""
        layers = [
            nn.Conv2d(in_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        ]
        return nn.Sequential(*layers)

    def _pad_to_match(self, x, target):
        """Pad ``x`` along its last two dims to the spatial size of ``target``.

        The size difference is split between both sides, with the extra
        element (for odd differences) going to the right/bottom.
        """
        rows_missing = target.size(2) - x.size(2)
        cols_missing = target.size(3) - x.size(3)
        left = cols_missing // 2
        top = rows_missing // 2
        # F.pad takes padding for the last dim first: [left, right, top, bottom].
        return F.pad(x, [left, cols_missing - left, top, rows_missing - top])

    def forward(self, x):
        """Return the ``num_classes``-channel output map for ``x``."""
        # Encoder: pool before every level except the first and remember
        # each level's output for the skip connections.
        skips = []
        current = x
        for level, encoder in enumerate((self.enc1, self.enc2, self.enc3, self.enc4)):
            if level > 0:
                current = self.pool(current)
            current = encoder(current)
            skips.append(current)

        # Bottleneck
        current = self.bottleneck(self.pool(current))

        # Decoder with skip connections, deepest level first.
        upsamplers = (self.up4, self.up3, self.up2, self.up1)
        decoders = (self.dec4, self.dec3, self.dec2, self.dec1)
        for upsample, decoder, skip in zip(upsamplers, decoders, reversed(skips)):
            current = self._pad_to_match(upsample(current), skip)
            current = decoder(torch.cat([current, skip], dim=1))

        return self.output(current)
# Segmentation metrics
def dice_coefficient(pred, target, smooth=1e-5):
    """Return the Dice coefficient between two tensors with equal element counts.

    Computed as ``(2 * sum(pred * target) + smooth) /
    (sum(pred) + sum(target) + smooth)`` over all elements.

    Args:
        pred: predicted mask or probabilities.
        target: reference mask with the same number of elements as ``pred``.
        smooth: constant added to numerator and denominator so that two
            all-zero inputs score 1 instead of dividing by zero.

    Returns:
        A 0-dim tensor holding the score.
    """
    # reshape (not view) so non-contiguous inputs, e.g. permuted or
    # transposed masks, are flattened instead of raising.
    pred_flat = pred.reshape(-1)
    target_flat = target.reshape(-1)
    intersection = (pred_flat * target_flat).sum()
    return (2. * intersection + smooth) / (pred_flat.sum() + target_flat.sum() + smooth)
def iou_score(pred, target, smooth=1e-5):
    """Return the intersection-over-union between two tensors with equal element counts.

    Computed as ``(intersection + smooth) / (union + smooth)`` where
    ``intersection = sum(pred * target)`` and
    ``union = sum(pred) + sum(target) - intersection``.

    Args:
        pred: predicted mask or probabilities.
        target: reference mask with the same number of elements as ``pred``.
        smooth: constant added to numerator and denominator so that two
            all-zero inputs score 1 instead of dividing by zero.

    Returns:
        A 0-dim tensor holding the score.
    """
    # reshape (not view) so non-contiguous inputs, e.g. permuted or
    # transposed masks, are flattened instead of raising.
    pred_flat = pred.reshape(-1)
    target_flat = target.reshape(-1)
    intersection = (pred_flat * target_flat).sum()
    union = pred_flat.sum() + target_flat.sum() - intersection
    return (intersection + smooth) / (union + smooth)
Data Augmentation
import torchvision.transforms as T
import numpy as np
from PIL import Image
class AdvancedAugmentation:
    """Bundle of a training and a validation torchvision transform pipeline.

    Args:
        image_size: side length passed to the crop transforms (default 640).
    """

    def __init__(self, image_size=640):
        self.image_size = image_size

        # Channel statistics shared by both pipelines.
        norm_mean = [0.485, 0.456, 0.406]
        norm_std = [0.229, 0.224, 0.225]

        # Training: random geometric and photometric changes on the image,
        # then tensor conversion, normalisation and random erasing.
        train_steps = [
            T.RandomResizedCrop(image_size, scale=(0.5, 1.0)),
            T.RandomHorizontalFlip(p=0.5),
            T.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),
            T.RandomGrayscale(p=0.2),
            T.GaussianBlur(kernel_size=3, sigma=(0.1, 2.0)),
            T.ToTensor(),
            T.Normalize(mean=norm_mean, std=norm_std),
            T.RandomErasing(p=0.25),
        ]
        self.train_transform = T.Compose(train_steps)

        # Validation: resize slightly larger than the target, centre crop,
        # then tensor conversion and normalisation.
        val_steps = [
            T.Resize(image_size + 32),
            T.CenterCrop(image_size),
            T.ToTensor(),
            T.Normalize(mean=norm_mean, std=norm_std),
        ]
        self.val_transform = T.Compose(val_steps)

    def __call__(self, image, is_training=True):
        """Apply the training pipeline if ``is_training`` is truthy, else the validation one."""
        pipeline = self.train_transform if is_training else self.val_transform
        return pipeline(image)
# CutMix and MixUp
class Mixup:
    """Blend every sample in a batch with a randomly chosen partner.

    Args:
        alpha: both shape parameters of the Beta distribution the mixing
            weight is drawn from (default 1.0).
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def __call__(self, images, targets):
        """Return ``(mixed_images, targets_a, targets_b, lam)``.

        ``mixed_images`` is ``lam * images + (1 - lam) * images[perm]`` for a
        random permutation of the batch; ``targets_a`` is ``targets``
        unchanged and ``targets_b`` is ``targets[perm]``.
        """
        lam = np.random.beta(self.alpha, self.alpha)
        permutation = torch.randperm(images.size(0))
        partner_images = images[permutation]
        mixed_images = lam * images + (1 - lam) * partner_images
        return mixed_images, targets, targets[permutation], lam
class CutMix:
    """Paste a random rectangle from a shuffled copy of the batch into each image.

    Args:
        alpha: both shape parameters of the Beta distribution the initial
            mixing weight is drawn from (default 1.0).
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def _rand_bbox(self, size, lam):
        """Sample a rectangle covering roughly ``1 - lam`` of the image area.

        ``size`` is the 4-element size of the batch; only ``size[2]`` and
        ``size[3]`` are read. Returns ``(x1, y1, x2, y2)`` where the ``x``
        pair bounds dim 2 and the ``y`` pair bounds dim 3, each clipped to
        the valid range.
        """
        extent_2, extent_3 = size[2], size[3]
        side_ratio = np.sqrt(1. - lam)
        span_2 = int(extent_2 * side_ratio)
        span_3 = int(extent_3 * side_ratio)

        # Box centre: dim 2 is drawn first, then dim 3.
        centre_2 = np.random.randint(extent_2)
        centre_3 = np.random.randint(extent_3)

        half_2 = span_2 // 2
        half_3 = span_3 // 2
        low_2 = np.clip(centre_2 - half_2, 0, extent_2)
        low_3 = np.clip(centre_3 - half_3, 0, extent_3)
        high_2 = np.clip(centre_2 + half_2, 0, extent_2)
        high_3 = np.clip(centre_3 + half_3, 0, extent_3)
        return low_2, low_3, high_2, high_3

    def __call__(self, images, targets):
        """Return ``(images, targets, targets[perm], lam)``.

        ``images`` is modified in place and returned. ``lam`` is recomputed
        from the clipped rectangle so it equals the fraction of each image
        that was left untouched.
        """
        lam = np.random.beta(self.alpha, self.alpha)
        permutation = torch.randperm(images.size(0))
        x1, y1, x2, y2 = self._rand_bbox(images.size(), lam)

        # Overwrite the rectangle with the same region of the partner image.
        patch = images[permutation, :, x1:x2, y1:y2]
        images[:, :, x1:x2, y1:y2] = patch

        patch_area = (x2 - x1) * (y2 - y1)
        image_area = images.size(-1) * images.size(-2)
        lam = 1 - (patch_area / image_area)
        return images, targets, targets[permutation], lam
Vision Transformers (ViT)
import torch
import torch.nn as nn
class PatchEmbedding(nn.Module):
    """Split an image into patches, embed them and prepend a class token.

    Args:
        img_size: side length of the (square) input image (default 224).
        patch_size: side length of each square patch (default 16).
        in_channels: channels of the input image (default 3).
        embed_dim: size of each token embedding (default 768).

    Attributes:
        img_size, patch_size, embed_dim: the constructor arguments, stored so
            that models built on top of this module can size further layers.
        num_patches: ``(img_size // patch_size) ** 2``.
    """

    def __init__(self, img_size=224, patch_size=16, in_channels=3, embed_dim=768):
        super().__init__()
        # Expose the configuration; DeiT reads ``embed_dim`` from here.
        self.img_size = img_size
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.num_patches = (img_size // patch_size) ** 2

        # A convolution whose kernel and stride both equal the patch size
        # projects every non-overlapping patch to one embedding vector.
        self.proj = nn.Conv2d(in_channels, embed_dim,
                              kernel_size=patch_size, stride=patch_size)
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        # One position embedding per patch plus one for the class token.
        self.position_embedding = nn.Parameter(torch.randn(1, self.num_patches + 1, embed_dim))

    def forward(self, x):
        """Return tokens of shape (B, num_patches + 1, embed_dim).

        The input must yield exactly ``num_patches`` patches; otherwise the
        position embedding cannot be added.
        """
        B = x.size(0)
        x = self.proj(x)  # (B, embed_dim, H/P, W/P)
        x = x.flatten(2).transpose(1, 2)  # (B, num_patches, embed_dim)
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat([cls_tokens, x], dim=1)
        x = x + self.position_embedding
        return x
class VisionTransformer(nn.Module):
    """Vision Transformer classifier built on ``nn.TransformerEncoder``.

    Args:
        img_size: side length of the (square) input image (default 224).
        patch_size: side length of each square patch (default 16).
        in_channels: channels of the input image (default 3).
        num_classes: number of output logits (default 1000).
        embed_dim: token embedding size (default 768).
        depth: number of encoder layers (default 12).
        num_heads: attention heads per layer (default 12).

    Attributes:
        num_classes, embed_dim: the constructor arguments, stored so that
            subclasses (e.g. DeiT) can size additional tokens and heads.
    """

    def __init__(self, img_size=224, patch_size=16, in_channels=3,
                 num_classes=1000, embed_dim=768, depth=12, num_heads=12):
        super().__init__()
        # Expose the configuration; DeiT reads ``num_classes`` from here.
        self.num_classes = num_classes
        self.embed_dim = embed_dim

        self.patch_embedding = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim * 4,
            dropout=0.1,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=depth)
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return class logits of shape (B, num_classes)."""
        x = self.patch_embedding(x)
        x = self.transformer(x)
        x = self.norm(x)
        # Use CLS token for classification
        cls_token = x[:, 0]
        return self.head(cls_token)
# Data-efficient ViT (DeiT) with distillation
class DeiT(VisionTransformer):
    """Vision Transformer with an extra distillation token and head.

    Accepts the same constructor arguments as ``VisionTransformer``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Take the sizes from the classification head the parent just built.
        # Neither ``self.patch_embedding.embed_dim`` nor ``self.num_classes``
        # is guaranteed to exist, whereas ``self.head`` always does.
        embed_dim = self.head.in_features
        num_classes = self.head.out_features
        self.dist_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        self.head_dist = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return ``(cls_logits, dist_logits)``, each of shape (B, num_classes)."""
        x = self.patch_embedding(x)
        B = x.size(0)
        dist_tokens = self.dist_token.expand(B, -1, -1)
        # Insert the distillation token right after the class token.
        # NOTE: this happens after patch_embedding has added its position
        # embedding, so the distillation token gets none of its own.
        x = torch.cat([x[:, :1], dist_tokens, x[:, 1:]], dim=1)
        x = self.transformer(x)
        x = self.norm(x)
        cls_logits = self.head(x[:, 0])
        dist_logits = self.head_dist(x[:, 1])
        return cls_logits, dist_logits
Best Practices
- Use pre-trained backbones for transfer learning
- Multi-scale training improves robustness to object sizes
- Data augmentation is critical — mixup, cutmix, mosaic
- Test-time augmentation (TTA) improves accuracy at the cost of slower inference
- NMS threshold tuning affects precision-recall tradeoff