---
name: deep-learning
description: PyTorch, TensorFlow, neural networks, CNNs, transformers, and deep learning for production
sasmp_version: "1.3.0"
bonded_agent: 06-ml-ai-engineer
bond_type: PRIMARY_BOND
skill_version: "2.0.0"
last_updated: "2025-01"
complexity: advanced
estimated_mastery_hours: 200
prerequisites: [python-programming, machine-learning]
unlocks: [llms-generative-ai, mlops]
---

# Deep Learning

Production-grade deep learning with PyTorch, neural network architectures, and modern training practices.

## Quick Start

```python
# PyTorch Production Training Loop
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import wandb  # optional: experiment tracking (not used in this minimal loop)

class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size: int, d_model: int = 256,
                 n_heads: int = 8, n_classes: int = 2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Learned positional encoding for sequences up to 512 tokens
        self.pos_encoding = nn.Parameter(torch.randn(1, 512, d_model))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model, n_heads, dim_feedforward=1024, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=6)
        self.classifier = nn.Linear(d_model, n_classes)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, mask=None):
        x = self.embedding(x) + self.pos_encoding[:, :x.size(1), :]
        x = self.dropout(x)
        x = self.transformer(x, src_key_padding_mask=mask)
        x = x.mean(dim=1)  # Global average pooling
        return self.classifier(x)

# Training configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerClassifier(vocab_size=30000).to(device)
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
scheduler = CosineAnnealingLR(optimizer, T_max=10)
criterion = nn.CrossEntropyLoss()

# Training loop with mixed precision
# (train_loader is assumed to be a DataLoader yielding dicts with "input_ids" and "labels")
scaler = torch.cuda.amp.GradScaler()
for epoch in range(10):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            logits = model(batch["input_ids"].to(device))
            loss = criterion(logits, batch["labels"].to(device))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    scheduler.step()  # step once per epoch to match T_max=10
```

## Core Concepts

### 1. Modern Neural Network Architectures

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ResidualBlock(nn.Module):
    """Residual block with skip connection."""

    def __init__(self, channels: int):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        residual = x
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.bn2(self.conv2(x))
        return F.relu(x + residual)

class AttentionBlock(nn.Module):
    """Multi-head self-attention with a feed-forward sub-layer."""

    def __init__(self, d_model: int, n_heads: int = 8):
        super().__init__()
        self.attention = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.norm = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        attn_out, _ = self.attention(x, x, x, attn_mask=mask)
        x = self.norm(x + attn_out)
        return self.norm2(x + self.ffn(x))
```
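A quick sanity check before wiring these blocks into a larger model is to run them on dummy tensors and confirm that shapes are preserved. The sketch below assumes the two classes above are already defined in the same session; the batch and sequence sizes are illustrative.

```python
# Illustrative shape check for the blocks above (sizes are arbitrary)
import torch

res_block = ResidualBlock(channels=64)
feature_map = torch.randn(8, 64, 32, 32)                   # (batch, channels, H, W)
assert res_block(feature_map).shape == feature_map.shape   # skip connection preserves shape

attn_block = AttentionBlock(d_model=256, n_heads=8)
tokens = torch.randn(8, 128, 256)                          # (batch, seq_len, d_model)
assert attn_block(tokens).shape == tokens.shape            # attention + FFN preserve shape
```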
### 2. Training Best Practices

```python
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import OneCycleLR

# Gradient clipping and accumulation
def train_epoch(model, loader, optimizer, scaler, accumulation_steps=4):
    model.train()
    optimizer.zero_grad()
    for i, batch in enumerate(loader):
        with torch.cuda.amp.autocast():
            # assumes model(batch) returns the loss directly
            loss = model(batch) / accumulation_steps
        scaler.scale(loss).backward()
        if (i + 1) % accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

# Early stopping
class EarlyStopping:
    def __init__(self, patience: int = 5, min_delta: float = 0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')

    def __call__(self, val_loss: float) -> bool:
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
        return self.counter >= self.patience

# Learning rate finder
# (train_step is assumed to run one optimization step and return the loss)
def find_lr(model, loader, optimizer, start_lr=1e-7, end_lr=10, num_iter=100):
    lrs, losses = [], []
    lr_mult = (end_lr / start_lr) ** (1 / num_iter)
    for i, batch in enumerate(loader):
        if i >= num_iter:
            break
        lr = start_lr * (lr_mult ** i)
        for pg in optimizer.param_groups:
            pg['lr'] = lr
        loss = train_step(model, batch, optimizer)
        lrs.append(lr)
        losses.append(loss)
    return lrs, losses
```

### 3. Model Deployment

```python
import torch.onnx
import onnxruntime as ort

# Export to ONNX
def export_to_onnx(model, sample_input, path="model.onnx"):
    model.eval()
    torch.onnx.export(
        model,
        sample_input,
        path,
        export_params=True,
        opset_version=17,
        do_constant_folding=True,
        input_names=['input'],
        output_names=['output'],
        dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
    )

# ONNX Runtime inference
class ONNXPredictor:
    def __init__(self, model_path: str):
        self.session = ort.InferenceSession(
            model_path,
            providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
        )

    def predict(self, input_data):
        return self.session.run(None, {'input': input_data})[0]

# TorchScript for production
scripted_model = torch.jit.script(model)
scripted_model.save("model_scripted.pt")
```

## Tools & Technologies

| Tool | Purpose | Version (2025) |
|------|---------|----------------|
| **PyTorch** | Deep learning framework | 2.2+ |
| **PyTorch Lightning** | Training framework | 2.2+ |
| **Hugging Face** | Transformers, datasets | 4.38+ |
| **ONNX Runtime** | Model inference | 1.17+ |
| **TensorRT** | GPU optimization | 8.6+ |
| **Weights & Biases** | Experiment tracking | Latest |
| **Ray** | Distributed training | 2.9+ |

## Troubleshooting Guide

| Issue | Symptoms | Root Cause | Fix |
|-------|----------|------------|-----|
| **Vanishing Gradient** | Loss not decreasing | Deep network, wrong activation | Use ReLU/GELU, residual connections |
| **Exploding Gradient** | NaN loss | Learning rate too high | Gradient clipping, lower LR |
| **Overfitting** | Train >> Val accuracy | Model too complex | Dropout, regularization, data aug |
| **OOM Error** | CUDA out of memory | Batch too large | Reduce batch, gradient accumulation |
| **Slow Training** | Low GPU utilization | Data loading bottleneck | More workers, prefetch (see sketch below) |
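For the data-loading bottleneck in the last row, a minimal sketch of a tuned `DataLoader` follows. The worker and prefetch values are illustrative assumptions, and `train_dataset` stands in for an existing Dataset; the right numbers depend on CPU cores and storage speed.

```python
from torch.utils.data import DataLoader

# Illustrative DataLoader settings to keep the GPU fed; tune per machine
train_loader = DataLoader(
    train_dataset,            # assumed: an existing torch Dataset
    batch_size=64,
    shuffle=True,
    num_workers=8,            # parallel data-loading processes
    pin_memory=True,          # faster host-to-GPU transfers
    prefetch_factor=4,        # batches prefetched per worker
    persistent_workers=True,  # avoid re-spawning workers each epoch
)
```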
### Debug Commands

```python
# Check GPU memory
print(torch.cuda.memory_summary())

# Profile training
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA]
) as prof:
    train_step(model, batch, optimizer)
print(prof.key_averages().table(sort_by="cuda_time_total"))

# Gradient flow check
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: grad_mean={param.grad.mean():.6f}")
```

## Best Practices

```python
# ✅ DO: Use mixed precision training
with torch.cuda.amp.autocast():
    output = model(input)

# ✅ DO: Initialize weights properly
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)

# ✅ DO: Use gradient checkpointing for large models
from torch.utils.checkpoint import checkpoint
x = checkpoint(self.layer, x)

# ✅ DO: Freeze base model for fine-tuning
for param in model.base.parameters():
    param.requires_grad = False

# ❌ DON'T: Run inference in train mode; dropout stays active unless you call model.eval()
model.eval()

# ❌ DON'T: Forget to move data to the same device as the model
```

## Resources

- [PyTorch Tutorials](https://pytorch.org/tutorials/)
- [Hugging Face Course](https://huggingface.co/learn)
- [Fast.ai](https://course.fast.ai/)
- "Deep Learning" by Goodfellow et al.

---

**Skill Certification Checklist:**

- [ ] Can build and train neural networks in PyTorch
- [ ] Can implement attention mechanisms and transformers
- [ ] Can use mixed precision and gradient accumulation
- [ ] Can export models to ONNX/TorchScript
- [ ] Can debug training issues (gradients, memory)