{ "cells": [
 { "cell_type": "markdown", "metadata": {}, "source": [ "# fastai OOM memory recovery\n" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from fastai import *\n", "from fastai.vision import *\n", "from ipyexperiments import IPyExperimentsPytorch" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", "assert str(device) == 'cuda:0', f\"we want GPU, got {device}\"" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([1.], device='cuda:0')" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "torch.ones(1).cuda() # preload" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "consuming 6614MB to bring free mem to 1000MBs\n" ] } ], "source": [
  "import gc\n",
  "import pynvml\n",
  "import torch\n",
  "\n",
  "pynvml.nvmlInit()\n",
  "# NOTE: `id` shadows the builtin of the same name; it is left unrenamed in case\n",
  "# cells outside this section refer to it.\n",
  "id = torch.cuda.current_device()\n",
  "\n",
  "def mem_free():\n",
  "    \"\"\"Return the free memory of GPU `id`, in whole MBs, as reported by NVML.\n",
  "\n",
  "    Runs the garbage collector and empties pytorch's CUDA cache before querying.\n",
  "    \"\"\"\n",
  "    gc.collect()\n",
  "    torch.cuda.empty_cache()\n",
  "    handle = pynvml.nvmlDeviceGetHandleByIndex(id)\n",
  "    info = pynvml.nvmlDeviceGetMemoryInfo(handle)\n",
  "    return int(info.free / 2**20)\n",
  "\n",
  "def mem_report():\n",
  "    \"\"\"Print the free GPU memory (in MBs) returned by mem_free().\"\"\"\n",
  "    print(f\"free mem={mem_free()}\")\n",
  "\n",
  "def mem_allocate_mbs(n, fatal=False):\n",
  "    \"\"\"Allocate n MBs of GPU memory.\n",
  "\n",
  "    Returns the tensor holding the memory on success. On failure returns None,\n",
  "    unless `fatal` is true, in which case the error is re-raised.\n",
  "    Requests for less than 6MB are not attempted and return None.\n",
  "    \"\"\"\n",
  "    if n < 6: return None # don't try to allocate less than 6MB\n",
  "    try:\n",
  "        # assuming torch's default float32 dtype, a d x d tensor takes 4*d*d bytes,\n",
  "        # i.e. ~n MBs for d = 2**9 * sqrt(n)\n",
  "        d = int(2**9 * n**0.5)\n",
  "        # create the tensor directly on the GPU, so that no same-sized staging\n",
  "        # copy has to be built in CPU RAM first\n",
  "        return torch.ones((d, d), device='cuda')\n",
  "    except Exception:\n",
  "        if not fatal: return None\n",
  "        raise # bare raise keeps the original traceback\n",
  "\n",
  "def leave_free_mbs(n):\n",
  "    \"\"\"Consume whatever memory is needed so that n MBs are left free.\n",
  "\n",
  "    Returns what mem_allocate_mbs() returns: the tensor holding the consumed\n",
  "    memory, or None when less than 6MB had to be consumed.\n",
  "    Fails an assertion if n MBs or less are free to begin with.\n",
  "    \"\"\"\n",
  "    avail = mem_free()\n",
  "    assert avail > n, f\"already have less available mem than desired {n}MBs\"\n",
  "    consume = avail - n\n",
  "    print(f\"consuming {consume}MB to bring free mem to {n}MBs\")\n",
  "    return mem_allocate_mbs(consume, fatal=True)\n",
  "\n",
  "buf = leave_free_mbs(1000)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "*** Experiment started with the Pytorch backend\n", "Device: ID 0, GeForce GTX 1070 Ti (8119 RAM)\n", "\n", "\n", "*** Current state:\n", "RAM: Used Free Total Util\n", "CPU: 2284 19978 31588 MB 7.23% \n", "GPU: 7119 1000 8119 MB 87.68% \n", "\n", "\n", "・ RAM: △Consumed △Peaked Used Total | Exec time 0:00:00.000\n", "・ CPU: 0 0 2284 MB |\n", "・ GPU: 0 0 7119 MB |\n" ] } ], "source": [ "exp = IPyExperimentsPytorch()" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "・ RAM: △Consumed △Peaked Used Total | Exec time 0:00:00.044\n", "・ CPU: 7 0 2291 MB |\n", "・ GPU: 0 0 7119 MB |\n" ] } ], "source": [ "path = untar_data(URLs.PETS)\n", "path_anno = path/'annotations'\n", "path_img = path/'images'\n", "fnames = get_image_files(path_img)\n", "np.random.seed(2)\n", "# the dot before the extension is escaped so that it only matches a literal '.'\n", "pat = re.compile(r'/([^/]+)_\\d+\\.jpg$')" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "・ RAM: △Consumed △Peaked Used Total | Exec time 0:00:00.239\n", "・ CPU: 1 3 2343 MB |\n", "・ GPU: 0 0 7119 MB |\n" ] } ], "source": [ "#bs=128\n", "bs = 32\n", "\n", "data = ImageDataBunch.from_name_re(path_img, fnames, pat, ds_tfms=None, size=224, bs=bs).normalize(imagenet_stats)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "・ RAM: △Consumed △Peaked Used Total | Exec time 0:00:00.446\n", "・ CPU: 0 0 
2480 MB |\n", "・ GPU: 110 0 7229 MB |\n" ] } ], "source": [ "learn = create_cnn(data, models.resnet34, metrics=accuracy)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
| epoch | \n", "train_loss | \n", "valid_loss | \n", "accuracy | \n", "
|---|