import marimo

__generated_with = "0.23.8"
app = marimo.App()


@app.cell
def _():
    import marimo as mo
    import torch

    # Use the best available device: CUDA (NVIDIA) > MPS (Apple Silicon) > CPU.
    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps"
        if torch.backends.mps.is_available()
        else "cpu"
    )
    return device, mo, torch


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    # 6. Performance & hardware

    A practical guide to running Py-Feat fast: choosing the right device, batching, and a few defaults that matter more than they look.
    """
    )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ## 6.1 Pick a device

    Detectors run on CPU by default. Pass `device=...` to use a GPU. The portable pattern selects **CUDA** (NVIDIA) → **MPS** (Apple Silicon) → **CPU**, so the same notebook runs anywhere:

    ```python
    import torch

    device = (
        "cuda" if torch.cuda.is_available()
        else "mps" if torch.backends.mps.is_available()
        else "cpu"
    )
    detector = Detectorv2(device=device)
    ```
    """
    )
    return


@app.cell
def _(device):
    # Show which backend the selection in the first cell resolved to.
    print(f"Selected device: {device!r}")
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ## 6.2 Time a detection

    We warm up once (first call loads weights / compiles kernels) and time the second call:
    """
    )
    return


@app.cell
def _(device):
    import os
    import time

    from feat import Detectorv2
    from feat.utils.io import get_test_data_path

    # Build the detector once; the later benchmark cells reuse this instance.
    detector = Detectorv2(device=device, identity_model="arcface")
    img_path = os.path.join(get_test_data_path(), "single_face.jpg")

    # The first call is a warmup, so only the second call is timed.
    detector.detect(img_path, data_type="image")  # warmup
    _t0 = time.perf_counter()
    detector.detect(img_path, data_type="image")
    # Stop the clock before formatting so the printed figure covers detect() only.
    _dt = time.perf_counter() - _t0
    print(f"single-image detect: {_dt:.3f}s on {device}")
    return detector, get_test_data_path, os, time


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ## 6.3 Batch images and video

    Batching is the single biggest lever on GPU throughput. Processing inputs one at a time leaves the GPU idle between calls; passing `batch_size > 1` stacks inputs into one tensor so the network runs them in parallel. The sweep below shows throughput (images/second) climbing with batch size — **pick the largest batch that fits in VRAM** (drop it back down if you hit an out-of-memory error).

    - **Images:** `detector.detect(img_list, batch_size=8)`. All images in a batch must share dimensions; pass `output_size=(H, W)` to pad/resize mismatched images so they stack.
    - **Video:** `detector.detect(video, data_type="video", batch_size=8)`. Add `skip_frames=N` to process every *N*-th frame when you don't need every frame — see the *Detecting Videos* tutorial for a full example.
    """
    )
    return


@app.cell
def _(detector, get_test_data_path, os, time):
    multi = os.path.join(get_test_data_path(), "multi_face.jpg")
    img_list = [multi] * 8
    # Derive the count from the list so the report stays right if its size changes.
    _n_images = len(img_list)

    # Sweep batch sizes; each setting gets one untimed warmup, then one timed run.
    for _bs in (1, 2, 4, 8):
        detector.detect(img_list, batch_size=_bs, data_type="image")  # warmup
        _t0 = time.perf_counter()
        detector.detect(img_list, batch_size=_bs, data_type="image")
        _dt = time.perf_counter() - _t0
        print(
            f"{_n_images} images, batch_size={_bs}: {_dt:.3f}s "
            f"({_n_images / _dt:.1f} img/s)"
        )
    return (img_list,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ## 6.4 Pin memory for faster CUDA transfers

    Every batch is copied from host (CPU) RAM into GPU memory before the network runs. On **CUDA**, passing `pin_memory=True` allocates that batch in *page-locked* host memory, which lets the copy overlap with computation — Py-Feat already issues the host→device transfer with `non_blocking=True`, so the two halves pair up for a small, free win on GPU-bound batches.

    `pin_memory` has no effect on **MPS** or **CPU** (there's no pinned-memory fast path), so only set it when `device="cuda"`.
    """
    )
    return


@app.cell
def _(detector, device, img_list, time):
    # The comparison only runs on CUDA; other devices print a skip message.
    if device == "cuda":
        # One value feeds both the call and the message so they cannot disagree.
        _bs = 8
        for _pin in (False, True):
            detector.detect(
                img_list, batch_size=_bs, pin_memory=_pin, data_type="image"
            )  # warmup
            _t0 = time.perf_counter()
            detector.detect(
                img_list, batch_size=_bs, pin_memory=_pin, data_type="image"
            )
            _dt = time.perf_counter() - _t0
            print(
                f"{len(img_list)} images, batch_size={_bs}, "
                f"pin_memory={_pin}: {_dt:.3f}s"
            )
    else:
        print(f"pin_memory only affects CUDA; current device is {device!r} — skipping.")
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ## 6.5 Leave `num_workers=0`

    `detect()` accepts a DataLoader `num_workers` argument. **Keep the default `num_workers=0`.** On Apple Silicon + Python 3.13 with Py-Feat's `OMP_NUM_THREADS=1` default, `num_workers > 0` is consistently *slower* (worst case ~33× for image batches) because worker processes contend for the same single-threaded BLAS/OMP pool. If a DataLoader feels slow, this is usually why.

    ## 6.6 Large datasets

    - Pass `save="out.csv"` to write results incrementally instead of holding every frame in memory.
    - Use `skip_frames` on long videos.
    - Reuse one detector instance across many files — model weights load once.
    """
    )
    return


if __name__ == "__main__":
    app.run()