{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Tracking memory leaks / memory fragmentation\n", "\n", "This notebook is for testing that the peak memory consumption is efficient and doesn't necessarily require more GPU RAM than needed.\n", "\n", "The detection comes from reading the output of [IPyExperimentsPytorch](https://github.com/stas00/ipyexperiments/) per-cell reports and `fastai.callbacks.mem.PeakMemMetric` metric." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from fastai.vision import *\n", "from fastai.utils.mem import *\n", "from fastai.callbacks.mem import *" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#! 
pip install ipyexperiments\n", "from ipyexperiments import IPyExperimentsPytorch\n", "from ipyexperiments.utils.mem import *" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", "assert str(device) == 'cuda:0', f\"we want GPU, got {device}\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from IPython.display import Markdown, display\n", "# display a bold, colored alert message inline (color: any CSS color name)\n", "def alert(string, color='red'):\n", " display(Markdown(f\"<span style='color:{color}'>**{string}**</span>\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Prep dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "*** Experiment started with the Pytorch backend\n", "Device: ID 0, GeForce GTX 1070 Ti (8119 RAM)\n", "\n", "\n", "*** Current state:\n", "RAM: Used Free Total Util\n", "CPU: 2275 18518 31588 MB 7.20% \n", "GPU: 503 7616 8119 MB 6.19% \n", "\n", "\n", "・ RAM: △Consumed △Peaked Used Total | Exec time 0:00:00.000\n", "・ CPU: 0 0 2275 MB |\n", "・ GPU: 0 0 503 MB |\n" ] } ], "source": [ "exp1 = IPyExperimentsPytorch()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "・ RAM: △Consumed △Peaked Used Total | Exec time 0:00:00.003\n", "・ CPU: 0 1 2277 MB |\n", "・ GPU: 0 0 503 MB |\n" ] } ], "source": [ "path = untar_data(URLs.MNIST)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ImageDataBunch;\n", "\n", "Train: LabelList\n", "y: CategoryList (60000 items)\n", "[Category 4, Category 4, Category 4, Category 4, Category 4]...\n", "Path: /home/stas/.fastai/data/mnist_png\n", "x: ImageItemList (60000 items)\n", "[Image (1, 28, 28), Image (1, 28, 28), Image (1, 28, 28), Image (1, 28, 28), Image (1, 28, 28)]...\n", "Path: 
/home/stas/.fastai/data/mnist_png;\n", "\n", "Valid: LabelList\n", "y: CategoryList (10000 items)\n", "[Category 4, Category 4, Category 4, Category 4, Category 4]...\n", "Path: /home/stas/.fastai/data/mnist_png\n", "x: ImageItemList (10000 items)\n", "[Image (1, 28, 28), Image (1, 28, 28), Image (1, 28, 28), Image (1, 28, 28), Image (1, 28, 28)]...\n", "Path: /home/stas/.fastai/data/mnist_png;\n", "\n", "Test: None" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "・ RAM: △Consumed △Peaked Used Total | Exec time 0:00:00.837\n", "・ CPU: 34 3 2359 MB |\n", "・ GPU: 0 0 503 MB |\n" ] } ], "source": [ "# setup\n", "defaults.cmap='binary'\n", "\n", "tfms = ([*rand_pad(padding=3, size=28, mode='zeros')], [])\n", "num_workers=0\n", "#bs=512\n", "bs=128\n", "data = (ImageItemList.from_folder(path, convert_mode='L')\n", " .split_by_folder(train='training', valid='testing')\n", " .label_from_folder()\n", " .transform(tfms)\n", " .databunch(bs=bs, num_workers=num_workers)\n", " .normalize(imagenet_stats)\n", " ) \n", "data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Train and Validate\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "・ RAM: △Consumed △Peaked Used Total | Exec time 0:00:00.000\n", "・ CPU: 0 0 2359 MB |\n", "・ GPU: 0 0 503 MB |\n" ] } ], "source": [ "#arch=\"resnet34\"\n", "arch=\"resnet50\"\n", "model = getattr(models, arch) # models.resnetXX" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "・ RAM: △Consumed △Peaked Used Total | Exec time 0:00:01.360\n", "・ CPU: 0 0 2518 MB |\n", "・ GPU: 106 0 609 MB |\n" ] } ], "source": [ "learn = create_cnn(data, model, metrics=[accuracy], callback_fns=PeakMemMetric)\n", "#learn.opt_func\n", "#learn.opt_func = partial(optim.SGD, 
momentum=0.9)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "・ RAM: △Consumed △Peaked Used Total | Exec time 0:00:02.562\n", "・ CPU: 0 0 2519 MB |\n", "・ GPU: 7210 0 7819 MB |\n" ] } ], "source": [ "# must leave at least the size of the 2nd epoch peak\n", "# with resnet50\n", "# - with bs=128 it's about 300MB\n", "# - with bs=512 it's about 900MB\n", "x=gpu_mem_leave_free_mbs(300)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Total time: 01:32
| epoch | \n", "train_loss | \n", "valid_loss | \n", "accuracy | \n", "cpu used | \n", "peak | \n", "gpu used | \n", "peak | \n", "
|---|---|---|---|---|---|---|---|
| 1 | \n", "0.128634 | \n", "0.064438 | \n", "0.981300 | \n", "17 | \n", "17 | \n", "46 | \n", "294 | \n", "
| 2 | \n", "0.047700 | \n", "0.023343 | \n", "0.991700 | \n", "5 | \n", "6 | \n", "0 | \n", "226 | \n", "