---
name: pytorch
description: Building and training neural networks with PyTorch. Use when implementing deep learning models, training loops, data pipelines, model optimization with torch.compile, distributed training, or deploying PyTorch models.
---

# Using PyTorch

PyTorch is a deep learning framework with dynamic computation graphs, strong GPU acceleration, and Pythonic design. This skill covers practical patterns for building production-quality neural networks.

## Table of Contents

- [Core Concepts](#core-concepts)
- [Model Architecture](#model-architecture)
- [Training Loop](#training-loop)
- [Data Loading](#data-loading)
- [Performance Optimization](#performance-optimization)
- [Distributed Training](#distributed-training)
- [Saving and Loading](#saving-and-loading)

## Core Concepts

### Tensors

```python
import torch

# Create tensors
x = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)
x = torch.zeros(3, 4)
x = torch.randn(3, 4)  # Normal distribution

# Device management
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
x = x.to(device)

# Operations (all return new tensors)
y = x + 1
y = x @ x.T       # Matrix multiplication
y = x.view(2, 6)  # Reshape
```

### Autograd

```python
# Enable gradient tracking
x = torch.randn(3, requires_grad=True)
y = x ** 2
loss = y.sum()

# Compute gradients
loss.backward()
print(x.grad)  # dy/dx

# Disable gradients for inference
with torch.no_grad():
    pred = model(x)

# Or use inference mode (more efficient)
with torch.inference_mode():
    pred = model(x)
```

## Model Architecture

### nn.Module Pattern

```python
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)
```

### Common Layers

```python
# Convolution
nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)

# Normalization
nn.BatchNorm2d(num_features)
nn.LayerNorm(normalized_shape)

# Attention
nn.MultiheadAttention(embed_dim, num_heads)

# Recurrent
nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
```

### Weight Initialization

```python
def init_weights(module):
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, std=0.02)

model.apply(init_weights)
```

## Training Loop

### Standard Pattern

```python
model = Model(input_dim, hidden_dim, output_dim).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()

        # Optional: gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

    # Step the scheduler once per epoch (T_max is measured in epochs)
    scheduler.step()

    # Validation
    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            # ... validation logic
```

### Mixed Precision Training

```python
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

for batch in train_loader:
    inputs, targets = batch
    inputs, targets = inputs.to(device), targets.to(device)

    optimizer.zero_grad()
    with autocast():
        outputs = model(inputs)
        loss = criterion(outputs, targets)

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
```

### Gradient Accumulation

```python
# Requires setup from Mixed Precision Training above:
# scaler = GradScaler(), model, criterion, optimizer, device
accumulation_steps = 4
optimizer.zero_grad()  # Start from clean gradients

for i, batch in enumerate(train_loader):
    inputs, targets = batch
    inputs, targets = inputs.to(device), targets.to(device)

    with autocast():
        outputs = model(inputs)
        loss = criterion(outputs, targets) / accumulation_steps

    scaler.scale(loss).backward()

    if (i + 1) % accumulation_steps == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
```

## Data Loading

### Dataset and DataLoader

```python
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, data, labels, transform=None):
        self.data = data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx]
        if self.transform:
            x = self.transform(x)
        return x, self.labels[idx]

train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
    pin_memory=True,  # Faster GPU transfer
    drop_last=True,   # Consistent batch sizes
)
```

### Collate Functions

```python
def collate_fn(batch):
    """Custom batching for variable-length sequences."""
    inputs, targets = zip(*batch)
    inputs = nn.utils.rnn.pad_sequence(inputs, batch_first=True)
    targets = torch.stack(targets)
    return inputs, targets

loader = DataLoader(dataset, collate_fn=collate_fn)
```

## Performance Optimization

### torch.compile (PyTorch 2.0+)

```python
# Basic compilation
model = torch.compile(model)

# With options
model = torch.compile(
    model,
    mode="reduce-overhead",  # Options: default, reduce-overhead, max-autotune
    fullgraph=True,          # Enforce no graph breaks
)

# Compile specific functions
@torch.compile
def train_step(model, inputs, targets):
    outputs = model(inputs)
    return criterion(outputs, targets)
```

**Compilation modes:**

- `default`: Good balance of compile time and speedup
- `reduce-overhead`: Minimizes framework overhead, good for small models
- `max-autotune`: Maximum performance, longer compile time

### Memory Optimization

```python
# Activation checkpointing (trade compute for memory)
from torch.utils.checkpoint import checkpoint

class Model(nn.Module):
    def forward(self, x):
        # Recompute activations during backward
        x = checkpoint(self.expensive_layer, x, use_reentrant=False)
        return self.output_layer(x)

# Clear cache
torch.cuda.empty_cache()

# Monitor memory
print(torch.cuda.memory_allocated() / 1e9, "GB")
print(torch.cuda.max_memory_allocated() / 1e9, "GB")
```

## Distributed Training

### DistributedDataParallel (DDP)

```python
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler

def setup(rank, world_size):
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def cleanup():
    dist.destroy_process_group()

def train(rank, world_size):
    setup(rank, world_size)

    model = Model().to(rank)
    model = DDP(model, device_ids=[rank])

    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    loader = DataLoader(dataset, sampler=sampler)

    for epoch in range(num_epochs):
        sampler.set_epoch(epoch)  # Important for shuffling
        # ... training loop

    cleanup()

# Launch with: torchrun --nproc_per_node=4 train.py
```
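The `train(rank, world_size)` function above still needs an entry point. Below is a minimal launch sketch, not part of a fixed API: it assumes the `setup`/`train` functions and imports from the block above, and the `mp.spawn` fallback with its `MASTER_ADDR`/`MASTER_PORT` defaults is an illustrative assumption.

```python
import os
import torch
import torch.multiprocessing as mp

def main():
    if "RANK" in os.environ:
        # Launched via: torchrun --nproc_per_node=4 train.py
        # torchrun starts one process per GPU and sets RANK / WORLD_SIZE.
        train(int(os.environ["RANK"]), int(os.environ["WORLD_SIZE"]))
    else:
        # Fallback: spawn one worker per visible GPU from this process.
        # MASTER_ADDR / MASTER_PORT are placeholder rendezvous settings.
        os.environ.setdefault("MASTER_ADDR", "localhost")
        os.environ.setdefault("MASTER_PORT", "29500")
        world_size = torch.cuda.device_count()
        mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)

if __name__ == "__main__":
    main()
```

With `torchrun`, the rendezvous variables are provided automatically, so the explicit `MASTER_ADDR`/`MASTER_PORT` defaults only matter on the `mp.spawn` path.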
### FullyShardedDataParallel (FSDP)

```python
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import MixedPrecision

mp_policy = MixedPrecision(
    param_dtype=torch.bfloat16,
    reduce_dtype=torch.bfloat16,
    buffer_dtype=torch.bfloat16,
)

model = FSDP(
    model,
    mixed_precision=mp_policy,
    use_orig_params=True,  # Required for torch.compile compatibility
)
```

## Saving and Loading

### Checkpoints

```python
# Save
torch.save({
    "epoch": epoch,
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "loss": loss,
}, "checkpoint.pt")

# Load
checkpoint = torch.load("checkpoint.pt", map_location=device)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
```

### Export for Deployment

```python
# TorchScript
scripted = torch.jit.script(model)
scripted.save("model.pt")

# ONNX
torch.onnx.export(
    model,
    dummy_input,
    "model.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},
)
```

## Best Practices

1. **Always set model mode**: Use `model.train()` and `model.eval()` appropriately
2. **Use inference_mode over no_grad**: More efficient for inference
3. **Pin memory for GPU training**: Set `pin_memory=True` in DataLoader
4. **Profile before optimizing**: Use `torch.profiler` to find bottlenecks (see the sketch after this list)
5. **Prefer bfloat16 over float16**: Better numerical stability on modern GPUs
6. **Use torch.compile**: Significant speedups with minimal code changes
7. **Set deterministic mode for reproducibility**:
   ```python
   torch.manual_seed(42)
   torch.backends.cudnn.deterministic = True
   torch.backends.cudnn.benchmark = False
   ```
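As suggested in practice 4, profile a few steps before reaching for optimizations. The following is a minimal `torch.profiler` sketch rather than a prescribed workflow: it assumes the `model`, `criterion`, `optimizer`, `train_loader`, and `device` from the training loop section, and the step count and sort key are arbitrary illustrative choices.

```python
import torch
from torch.profiler import ProfilerActivity, profile, schedule

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=schedule(wait=1, warmup=1, active=3, repeat=1),
    profile_memory=True,
) as prof:
    for step, (inputs, targets) in enumerate(train_loader):
        if step >= 5:  # A handful of steps is enough for a first look
            break
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()
        prof.step()  # Advance the wait/warmup/active schedule

# Print the most expensive operators
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
```

For longer runs, passing `on_trace_ready=torch.profiler.tensorboard_trace_handler(...)` exports traces for visual inspection instead of printing a table.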
## References

See `reference/` for detailed documentation:

- `training-patterns.md` - Advanced training techniques
- `debugging.md` - Debugging and profiling tools