---
name: deep-learning
description: PyTorch, TensorFlow, neural networks, CNNs, transformers, and deep learning for production
sasmp_version: "1.3.0"
bonded_agent: 06-ml-ai-engineer
bond_type: PRIMARY_BOND
skill_version: "2.0.0"
last_updated: "2025-01"
complexity: advanced
estimated_mastery_hours: 200
prerequisites: [python-programming, machine-learning]
unlocks: [llms-generative-ai, mlops]
---

# Deep Learning

Production-grade deep learning with PyTorch, neural network architectures, and modern training practices.

## Quick Start

```python
# PyTorch Production Training Loop
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import wandb  # optional: experiment tracking (not used in this minimal loop)

class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size: int, d_model: int = 256,
                 n_heads: int = 8, n_classes: int = 2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Learned positional encoding for sequences up to 512 tokens
        self.pos_encoding = nn.Parameter(torch.randn(1, 512, d_model))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model, n_heads, dim_feedforward=1024, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=6)
        self.classifier = nn.Linear(d_model, n_classes)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x, mask=None):
        x = self.embedding(x) + self.pos_encoding[:, :x.size(1), :]
        x = self.dropout(x)
        x = self.transformer(x, src_key_padding_mask=mask)
        x = x.mean(dim=1)  # Global average pooling
        return self.classifier(x)

# Training configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerClassifier(vocab_size=30000).to(device)
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
scheduler = CosineAnnealingLR(optimizer, T_max=10)
criterion = nn.CrossEntropyLoss()

# Training loop with mixed precision
# (train_loader is assumed to be a DataLoader yielding dicts with "input_ids" and "labels")
scaler = torch.cuda.amp.GradScaler()
for epoch in range(10):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            logits = model(batch["input_ids"].to(device))
            loss = criterion(logits, batch["labels"].to(device))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
    scheduler.step()  # step once per epoch to match T_max=10
```

## Core Concepts

### 1. Modern Neural Network Architectures

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class ResidualBlock(nn.Module):
    """Residual block with skip connection."""

    def __init__(self, channels: int):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        residual = x
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.bn2(self.conv2(x))
        return F.relu(x + residual)

class AttentionBlock(nn.Module):
    """Multi-head self-attention with a feed-forward sub-layer."""

    def __init__(self, d_model: int, n_heads: int = 8):
        super().__init__()
        self.attention = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.norm = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        attn_out, _ = self.attention(x, x, x, attn_mask=mask)
        x = self.norm(x + attn_out)
        return self.norm2(x + self.ffn(x))
```
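A quick sanity check before wiring these blocks into a larger model is to run them on dummy tensors and confirm that shapes are preserved. The sketch below assumes the two classes above are already defined in the same session; the batch and sequence sizes are illustrative.

```python
# Illustrative shape check for the blocks above (sizes are arbitrary)
import torch

res_block = ResidualBlock(channels=64)
feature_map = torch.randn(8, 64, 32, 32)                   # (batch, channels, H, W)
assert res_block(feature_map).shape == feature_map.shape   # skip connection preserves shape

attn_block = AttentionBlock(d_model=256, n_heads=8)
tokens = torch.randn(8, 128, 256)                          # (batch, seq_len, d_model)
assert attn_block(tokens).shape == tokens.shape            # attention + FFN preserve shape
```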
### 2. Training Best Practices

```python
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import OneCycleLR

# Gradient clipping and accumulation
def train_epoch(model, loader, optimizer, scaler, accumulation_steps=4):
    model.train()
    optimizer.zero_grad()
    for i, batch in enumerate(loader):
        with torch.cuda.amp.autocast():
            # assumes model(batch) returns the loss directly
            loss = model(batch) / accumulation_steps
        scaler.scale(loss).backward()
        if (i + 1) % accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

# Early stopping
class EarlyStopping:
    def __init__(self, patience: int = 5, min_delta: float = 0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = float('inf')

    def __call__(self, val_loss: float) -> bool:
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
        return self.counter >= self.patience

# Learning rate finder
# (train_step is assumed to run one optimization step and return the loss)
def find_lr(model, loader, optimizer, start_lr=1e-7, end_lr=10, num_iter=100):
    lrs, losses = [], []
    lr_mult = (end_lr / start_lr) ** (1 / num_iter)
    for i, batch in enumerate(loader):
        if i >= num_iter:
            break
        lr = start_lr * (lr_mult ** i)
        for pg in optimizer.param_groups:
            pg['lr'] = lr
        loss = train_step(model, batch, optimizer)
        lrs.append(lr)
        losses.append(loss)
    return lrs, losses
```

### 3. Model Deployment

```python
import torch.onnx
import onnxruntime as ort

# Export to ONNX
def export_to_onnx(model, sample_input, path="model.onnx"):
    model.eval()
    torch.onnx.export(
        model,
        sample_input,
        path,
        export_params=True,
        opset_version=17,
        do_constant_folding=True,
        input_names=['input'],
        output_names=['output'],
        dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}
    )

# ONNX Runtime inference
class ONNXPredictor:
    def __init__(self, model_path: str):
        self.session = ort.InferenceSession(
            model_path,
            providers=['CUDAExecutionProvider', 'CPUExecutionProvider']
        )

    def predict(self, input_data):
        return self.session.run(None, {'input': input_data})[0]

# TorchScript for production
scripted_model = torch.jit.script(model)
scripted_model.save("model_scripted.pt")
```

## Tools & Technologies

| Tool | Purpose | Version (2025) |
|------|---------|----------------|
| **PyTorch** | Deep learning framework | 2.2+ |
| **PyTorch Lightning** | Training framework | 2.2+ |
| **Hugging Face** | Transformers, datasets | 4.38+ |
| **ONNX Runtime** | Model inference | 1.17+ |
| **TensorRT** | GPU optimization | 8.6+ |
| **Weights & Biases** | Experiment tracking | Latest |
| **Ray** | Distributed training | 2.9+ |

## Troubleshooting Guide

| Issue | Symptoms | Root Cause | Fix |
|-------|----------|------------|-----|
| **Vanishing Gradient** | Loss not decreasing | Deep network, wrong activation | Use ReLU/GELU, residual connections |
| **Exploding Gradient** | NaN loss | Learning rate too high | Gradient clipping, lower LR |
| **Overfitting** | Train >> Val accuracy | Model too complex | Dropout, regularization, data aug |
| **OOM Error** | CUDA out of memory | Batch too large | Reduce batch, gradient accumulation |
| **Slow Training** | Low GPU utilization | Data loading bottleneck | More workers, prefetch (see sketch below) |
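For the data-loading bottleneck in the last row, a minimal sketch of a tuned `DataLoader` follows. The worker and prefetch values are illustrative assumptions, and `train_dataset` stands in for an existing Dataset; the right numbers depend on CPU cores and storage speed.

```python
from torch.utils.data import DataLoader

# Illustrative DataLoader settings to keep the GPU fed; tune per machine
train_loader = DataLoader(
    train_dataset,            # assumed: an existing torch Dataset
    batch_size=64,
    shuffle=True,
    num_workers=8,            # parallel data-loading processes
    pin_memory=True,          # faster host-to-GPU transfers
    prefetch_factor=4,        # batches prefetched per worker
    persistent_workers=True,  # avoid re-spawning workers each epoch
)
```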
### Debug Commands

```python
# Check GPU memory
print(torch.cuda.memory_summary())

# Profile training
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA]
) as prof:
    train_step(model, batch, optimizer)
print(prof.key_averages().table(sort_by="cuda_time_total"))

# Gradient flow check
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"{name}: grad_mean={param.grad.mean():.6f}")
```

## Best Practices

```python
# ✅ DO: Use mixed precision training
with torch.cuda.amp.autocast():
    output = model(input)

# ✅ DO: Initialize weights properly
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)

# ✅ DO: Use gradient checkpointing for large models
from torch.utils.checkpoint import checkpoint
x = checkpoint(self.layer, x)

# ✅ DO: Freeze base model for fine-tuning
for param in model.base.parameters():
    param.requires_grad = False

# ❌ DON'T: Run inference in train mode; dropout stays active unless you call model.eval()
model.eval()

# ❌ DON'T: Forget to move data to the same device as the model
```

## Resources

- [PyTorch Tutorials](https://pytorch.org/tutorials/)
- [Hugging Face Course](https://huggingface.co/learn)
- [Fast.ai](https://course.fast.ai/)
- "Deep Learning" by Goodfellow et al.

---

**Skill Certification Checklist:**

- [ ] Can build and train neural networks in PyTorch
- [ ] Can implement attention mechanisms and transformers
- [ ] Can use mixed precision and gradient accumulation
- [ ] Can export models to ONNX/TorchScript
- [ ] Can debug training issues (gradients, memory)