{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "89fe6b40", "metadata": {}, "outputs": [], "source": [ "# default_exp core" ] }, { "cell_type": "markdown", "id": "3f88277e", "metadata": {}, "source": [ "# Core API" ] }, { "cell_type": "code", "execution_count": null, "id": "9e4b351a", "metadata": {}, "outputs": [], "source": [ "# export\n", "from fastprogress.fastprogress import progress_bar\n", "from fastcore.all import *\n", "import hashlib,shutil\n", "from pprint import pformat" ] }, { "cell_type": "code", "execution_count": null, "id": "26a28dbd", "metadata": {}, "outputs": [], "source": [ "#hide\n", "from nbdev.showdoc import show_doc\n", "import tempfile,fastdownload" ] }, { "cell_type": "markdown", "id": "37ccd175", "metadata": {}, "source": [ "## Helpers" ] }, { "cell_type": "markdown", "id": "31596f8a", "metadata": {}, "source": [ "This helper functions provide the functionality that `FastDownload` relies on. Most users should use `FastDownload` rather than calling these helpers." ] }, { "cell_type": "code", "execution_count": null, "id": "5087e93f", "metadata": {}, "outputs": [], "source": [ "dest = Path('tmp')\n", "url = 'https://s3.amazonaws.com/fast-ai-sample/mnist_tiny.tgz'" ] }, { "cell_type": "code", "execution_count": null, "id": "6ddb7048", "metadata": {}, "outputs": [], "source": [ "#hide\n", "shutil.rmtree(dest, ignore_errors=True)\n", "Path.BASE_PATH = Path.home()" ] }, { "cell_type": "code", "execution_count": null, "id": "171df7b7", "metadata": {}, "outputs": [], "source": [ "#export\n", "def download_url(url, dest=None, timeout=None, show_progress=True):\n", " \"Download `url` to `dest` and show progress\"\n", " pbar = progress_bar([])\n", " def progress(count=1, bsize=1, tsize=None):\n", " pbar.total = tsize\n", " pbar.update(count*bsize)\n", " return urlsave(url, dest, reporthook=progress if show_progress else None, timeout=timeout)" ] }, { "cell_type": "code", "execution_count": null, "id": "7387f086", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " 100.54% [344064/342207 00:00<00:00]\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Path('tmp/mnist_tiny.tgz')" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dest.mkdir(exist_ok=True)\n", "fpath = download_url(url, dest)\n", "fpath" ] }, { "cell_type": "code", "execution_count": null, "id": "cc6c19e5", "metadata": {}, "outputs": [], "source": [ "# export\n", "def path_stats(fpath):\n", " \"`True` if size and hash of `fpath` matches `size_check` and `hash_check`\"\n", " size = os.path.getsize(fpath)\n", " # Just use first 1MB of file for performance\n", " with open(fpath, \"rb\") as f: hashed = hashlib.md5(f.read(2**20)).hexdigest()\n", " return size,hashed" ] }, { "cell_type": "code", "execution_count": null, "id": "23a8fc55", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(342207, '56143e8f24db90d925d82a5a74141875')" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "path_stats(fpath)" ] }, { "cell_type": "code", "execution_count": null, "id": "137fe163", "metadata": {}, "outputs": [], "source": [ "#export\n", "def checks_module(module):\n", " \"Location of `download_checks.py`\"\n", " if not module: return {}\n", " return Path(module.__file__).parent/'download_checks.py'" ] }, { "cell_type": "markdown", "id": "46fc8a5c", "metadata": {}, "source": [ "The `download_checks.py` file containing sizes and hashes will be located next to `module`:" ] }, { "cell_type": "code", "execution_count": null, "id": "c1612552", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Path('git/fastdownload/fastdownload/download_checks.py')" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mod = checks_module(fastdownload)\n", "mod" ] }, { "cell_type": "code", "execution_count": null, "id": "630f3851", "metadata": {}, "outputs": [], "source": [ "#export\n", "def read_checks(fmod):\n", " \"Evaluated contents of `download_checks.py`\"\n", " if fmod == {} or not fmod.exists(): return {}\n", " txt = fmod.read_text()\n", " return eval(txt) if txt else {}" ] }, { "cell_type": "code", "execution_count": null, "id": "bc2cfbd2", "metadata": {}, "outputs": [], "source": [ "assert read_checks({}) == {}" ] }, { "cell_type": "code", "execution_count": null, "id": "d2ef9486", "metadata": {}, "outputs": [], "source": [ "#export\n", "def check(fmod, url, fpath):\n", " \"Check whether size and hash of `fpath` matches stored data for `url` or data is missing\"\n", " checks = read_checks(fmod).get(url)\n", " return not checks or path_stats(fpath)==checks" ] }, { "cell_type": "code", "execution_count": null, "id": "7da6bb9e", "metadata": {}, "outputs": [], "source": [ "# export\n", "def update_checks(fpath, url, fmod):\n", " \"Store the hash and size of `fpath` for `url` in `download_checks.py`\"\n", " checks = read_checks(fmod)\n", " checks[url] = path_stats(fpath)\n", " fmod.write_text(pformat(checks))" ] }, { "cell_type": "code", "execution_count": null, "id": "3381a3f0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'https://s3.amazonaws.com/fast-ai-sample/mnist_tiny.tgz': (342207,\n", " '56143e8f24db90d925d82a5a74141875')}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "if mod.exists(): mod.unlink()\n", "update_checks(fpath, url, mod)\n", "read_checks(mod)" ] }, { "cell_type": "code", "execution_count": null, "id": "ed925e7d", "metadata": {}, "outputs": [], "source": [ "#export\n", "def download_and_check(url, fpath, fmod, force):\n", " \"Download `url` to `fpath`, unless exists and `check` fails and not `force`\"\n", " if not force and fpath.exists():\n", " if check(fmod, url, fpath): return fpath\n", " else: print(\"Downloading a new version of this dataset...\")\n", " res = download_url(url, fpath)\n", " if not check(fmod, url, fpath): raise Exception(\"Downloaded file is corrupt or not latest version\")\n", " return res" ] }, { "cell_type": "markdown", "id": "848fe19d", "metadata": {}, "source": [ "## FastDownload -" ] }, { "cell_type": "code", "execution_count": null, "id": "332fce58", "metadata": {}, "outputs": [], "source": [ "# export\n", "class FastDownload:\n", " def __init__(self, cfg=None, base='~/.fastdownload', archive=None, data=None, module=None):\n", " base = Path(base).expanduser().absolute()\n", " default = {'data':(data or 'data'), 'archive':(archive or 'archive')}\n", " self.cfg = Config(base, 'config.ini', create=default) if cfg is None else cfg\n", " self.module = checks_module(module)\n", " if data is not None: self.cfg['data'] = data\n", " if archive is not None: self.cfg['archive'] = archive\n", " \n", " def arch_path(self):\n", " \"Path to archives\"\n", " return self.cfg.path('archive')\n", " \n", " def data_path(self, extract_key='data', arch=None):\n", " \"Path to extracted data\"\n", " path = self.cfg.path(extract_key)\n", " return path if arch is None else path/remove_suffix(arch.stem, '.tar')\n", "\n", " def check(self, url, fpath):\n", " \"Check whether size and hash of `fpath` matches stored data for `url` or data is missing\"\n", " checks = read_checks(self.module).get(url)\n", " return not checks or path_stats(fpath)==checks\n", "\n", " def download(self, url, force=False):\n", " \"Download `url` to archive path, unless exists and `self.check` fails and not `force`\"\n", " self.arch_path().mkdir(exist_ok=True, parents=True)\n", " return download_and_check(url, urldest(url, self.arch_path()), self.module, force)\n", " \n", " def rm(self, url, rm_arch=True, rm_data=True, extract_key='data'):\n", " \"Delete downloaded archive and extracted data for `url`\"\n", " arch = urldest(url, self.arch_path())\n", " if rm_arch: arch.delete()\n", " if rm_data: self.data_path(extract_key, arch).delete()\n", "\n", " def update(self, url):\n", " \"Store the hash and size in `download_checks.py`\"\n", " update_checks(urldest(url, self.arch_path()), url, self.module)\n", "\n", " def extract(self, url, extract_key='data', force=False):\n", " \"Extract archive already downloaded from `url`, overwriting existing if `force`\"\n", " arch = urldest(url, self.arch_path())\n", " if not arch.exists(): raise Exception(f'{arch} does not exist')\n", " dest = self.data_path(extract_key)\n", " dest.mkdir(exist_ok=True, parents=True)\n", " return untar_dir(arch, dest, rename=True, overwrite=force)\n", " \n", " def get(self, url, extract_key='data', force=False):\n", " \"Download and extract `url`, overwriting existing if `force`\"\n", " if not force:\n", " data = self.data_path(extract_key, urldest(url, self.arch_path()))\n", " if data.exists(): return data\n", " self.download(url, force=force)\n", " return self.extract(url, extract_key=extract_key, force=force)" ] }, { "cell_type": "code", "execution_count": null, "id": "8f14ffab", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Path('git/fastdownload/fastdownload/download_checks.py')" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d = FastDownload(module=fastdownload)\n", "d.module" ] }, { "cell_type": "markdown", "id": "905e2fcd", "metadata": {}, "source": [ "The `config.ini` file will be created (if it doesn't exist) in `{base}/config.ini`:" ] }, { "cell_type": "code", "execution_count": null, "id": "50d8a3e3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Path('.fastdownload/config.ini')" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d.cfg.config_file" ] }, { "cell_type": "code", "execution_count": null, "id": "828cdfd9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[DEFAULT]\n", "data = /home/jhoward/.fastdownload/data\n", "archive = /home/jhoward/.fastdownload/archive\n", "\n", "\n" ] } ], "source": [ "print(d.cfg.config_file.read_text())" ] }, { "cell_type": "code", "execution_count": null, "id": "fcc9d707", "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "

FastDownload.download[source]

\n", "\n", "> FastDownload.download(**`url`**, **`force`**=*`False`*)\n", "\n", "Download `url` to archive path, unless exists and `self.check` fails and not `force`" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "show_doc(FastDownload.download)" ] }, { "cell_type": "markdown", "id": "73ce81c5", "metadata": {}, "source": [ "If there is no stored hash and size for `url`, or the size and hash matches the stored checks, then `download` will only download the URL if the destination file does not exist. The destination path will be retured." ] }, { "cell_type": "code", "execution_count": null, "id": "2e8770c8", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " 100.54% [344064/342207 00:00<00:00]\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "Path('.fastdownload/archive/mnist_tiny.tgz')" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "if d.module.exists(): d.module.unlink()\n", "arch = d.download(url)\n", "arch" ] }, { "cell_type": "code", "execution_count": null, "id": "4b37c9b6", "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "

FastDownload.update[source]

\n", "\n", "> FastDownload.update(**`url`**)\n", "\n", "Store the hash and size in `download_checks.py`" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "show_doc(FastDownload.update)" ] }, { "cell_type": "code", "execution_count": null, "id": "e9e2cc9a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'https://s3.amazonaws.com/fast-ai-sample/mnist_tiny.tgz': (342207,\n", " '56143e8f24db90d925d82a5a74141875')}" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d.update(url)\n", "eval(d.module.read_text())" ] }, { "cell_type": "markdown", "id": "0f8cddc9", "metadata": {}, "source": [ "Calling `download` will now just return the existing file, since the checks match:" ] }, { "cell_type": "code", "execution_count": null, "id": "85dedf07", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Path('.fastdownload/archive/mnist_tiny.tgz')" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d.download(url)" ] }, { "cell_type": "markdown", "id": "c1058c1c", "metadata": {}, "source": [ "If the checks file doesn't match the size or hash of the archive, then a new copy of the file will be downloaded." ] }, { "cell_type": "code", "execution_count": null, "id": "0ca69478", "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "

FastDownload.extract[source]

\n", "\n", "> FastDownload.extract(**`url`**, **`extract_key`**=*`'data'`*, **`force`**=*`False`*)\n", "\n", "Extract archive already downloaded from `url`, overwriting existing if `force`" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "show_doc(FastDownload.extract)" ] }, { "cell_type": "code", "execution_count": null, "id": "75bcbbed", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Path('.fastdownload/data/mnist_tiny')" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "extr = d.extract(url, force=True)\n", "extr" ] }, { "cell_type": "code", "execution_count": null, "id": "95379b04", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(#5) [Path('.fastdownload/data/mnist_tiny/models'),Path('.fastdownload/data/mnist_tiny/train'),Path('.fastdownload/data/mnist_tiny/labels.csv'),Path('.fastdownload/data/mnist_tiny/valid'),Path('.fastdownload/data/mnist_tiny/test')]" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "extr.ls()" ] }, { "cell_type": "markdown", "id": "5246ccb0", "metadata": {}, "source": [ "Pass `extract_key` to use a key other than `data` from your config file when selecting an archive extraction location:" ] }, { "cell_type": "code", "execution_count": null, "id": "e9a3a2c2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Path('.fastdownload/models/mnist_tiny')" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d.cfg['model_path'] = 'models'\n", "d.extract(url, extract_key='model_path')" ] }, { "cell_type": "code", "execution_count": null, "id": "6dff677d", "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "

FastDownload.rm[source]

\n", "\n", "> FastDownload.rm(**`url`**, **`rm_arch`**=*`True`*, **`rm_data`**=*`True`*, **`extract_key`**=*`'data'`*)\n", "\n", "Delete downloaded archive and extracted data for `url`" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "show_doc(FastDownload.rm)" ] }, { "cell_type": "code", "execution_count": null, "id": "c1fcb5ed", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(False, False)" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d.rm(url)\n", "extr.exists(),arch.exists()" ] }, { "cell_type": "code", "execution_count": null, "id": "c0b4a6f4", "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "

FastDownload.get[source]

\n", "\n", "> FastDownload.get(**`url`**, **`extract_key`**=*`'data'`*, **`force`**=*`False`*)\n", "\n", "Download and extract `url`, overwriting existing if `force`" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "show_doc(FastDownload.get)" ] }, { "cell_type": "code", "execution_count": null, "id": "ef9e91ce", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " 100.54% [344064/342207 00:00<00:00]\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "(Path('.fastdownload/data/mnist_tiny'), True)" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res = d.get(url)\n", "res,extr.exists()" ] }, { "cell_type": "markdown", "id": "decf909b", "metadata": {}, "source": [ "If the archive doesn't exist, but the extracted data does, then the archive is not downloaded again." ] }, { "cell_type": "code", "execution_count": null, "id": "250420cf", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(Path('.fastdownload/data/mnist_tiny'), True)" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d.rm(url, rm_data=False)\n", "res = d.get(url)\n", "res,extr.exists()" ] }, { "cell_type": "markdown", "id": "51ed36c7", "metadata": {}, "source": [ "`extract_key` works the same way as in `FastDownload.extract`:" ] }, { "cell_type": "code", "execution_count": null, "id": "387c1901", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(Path('.fastdownload/models/mnist_tiny'), True)" ] }, "execution_count": null, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res = d.get(url, extract_key='model_path')\n", "res,res.exists()" ] }, { "cell_type": "markdown", "id": "1cfa23cb", "metadata": {}, "source": [ "## Export -" ] }, { "cell_type": "code", "execution_count": null, "id": "52d848a4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Converted 00_core.ipynb.\n", "Converted index.ipynb.\n" ] } ], "source": [ "#hide\n", "from nbdev.export import notebook2script\n", "notebook2script()" ] }, { "cell_type": "code", "execution_count": null, "id": "24de7c05", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 5 }