{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "hide_input": false }, "outputs": [], "source": [ "#hide\n", "#default_exp clean\n", "from nbdev.showdoc import show_doc" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "import io,sys,json,glob,re\n", "from fastcore.script import call_parse,Param\n", "from nbdev.imports import Config\n", "from pathlib import Path" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#hide\n", "#For tests only\n", "from nbdev.imports import *" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Clean notebooks\n", "\n", "> Strip notebooks from superfluous metadata" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To avoid pointless conflicts while working with jupyter notebooks (with different execution counts or cell metadata), it is recommended to clean the notebooks before committing anything (done automatically if you install the git hooks with `nbdev_install_git_hooks`). The following functions are used to do that." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Utils" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def rm_execution_count(o):\n", " \"Remove execution count in `o`\"\n", " if 'execution_count' in o: o['execution_count'] = None" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "colab_json = \"application/vnd.google.colaboratory.intrinsic+json\"\n", "def clean_output_data_vnd(o):\n", " \"Remove `application/vnd.google.colaboratory.intrinsic+json` in data entries\"\n", " if 'data' in o:\n", " data = o['data']\n", " if colab_json in data:\n", " new_data = {k:v for k,v in data.items() if k != colab_json}\n", " o['data'] = new_data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def clean_cell_output(cell):\n", " \"Remove execution count in `cell`\"\n", " if 'outputs' in cell:\n", " for o in cell['outputs']:\n", " rm_execution_count(o)\n", " clean_output_data_vnd(o)\n", " o.get('metadata', o).pop('tags', None)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "cell_metadata_keep = [\"hide_input\"]\n", "nb_metadata_keep = [\"kernelspec\", \"jekyll\", \"jupytext\", \"doc\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def clean_cell(cell, clear_all=False):\n", " \"Clean `cell` by removing superfluous metadata or everything except the input if `clear_all`\"\n", " rm_execution_count(cell)\n", " if 'outputs' in cell:\n", " if clear_all: cell['outputs'] = []\n", " else: clean_cell_output(cell)\n", " if cell['source'] == ['']: cell['source'] = []\n", " cell['metadata'] = {} if clear_all else {k:v for k,v in cell['metadata'].items() if k in cell_metadata_keep}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tst = {'cell_type': 'code',\n", " 'execution_count': 26,\n", " 'metadata': {'hide_input': True, 'meta': 23},\n", " 'outputs': [{'execution_count': 2, \n", " 'data': {\n", " 'application/vnd.google.colaboratory.intrinsic+json': {\n", " 'type': 'string'},\n", " 'plain/text': ['sample output',]\n", " },\n", " 'output': 'super'}],\n", " \n", " 'source': 'awesome_code'}\n", "tst1 = tst.copy()\n", "\n", "clean_cell(tst)\n", "test_eq(tst, {'cell_type': 'code',\n", " 'execution_count': None,\n", " 'metadata': {'hide_input': True},\n", " 'outputs': [{'execution_count': None, \n", " 'data': {'plain/text': ['sample output',]},\n", " 'output': 'super'}],\n", " 'source': 'awesome_code'})\n", "\n", "clean_cell(tst1, clear_all=True)\n", "test_eq(tst1, {'cell_type': 'code',\n", " 'execution_count': None,\n", " 'metadata': {},\n", " 'outputs': [],\n", " 'source': 'awesome_code'})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tst2 = {\n", " 'metadata': {'tags':[]},\n", " 'outputs': [{\n", " 'metadata': {\n", " 'tags':[]\n", " }}],\n", " \n", " \"source\": [\n", " \"\"\n", " ]}\n", "clean_cell(tst2, clear_all=False)\n", "test_eq(tst2, {\n", " 'metadata': {},\n", " 'outputs': [{\n", " 'metadata':{}}],\n", " 'source': []})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def clean_nb(nb, clear_all=False):\n", " \"Clean `nb` from superfluous metadata, passing `clear_all` to `clean_cell`\"\n", " for c in nb['cells']: clean_cell(c, clear_all=clear_all)\n", " nb['metadata'] = {k:v for k,v in nb['metadata'].items() if k in nb_metadata_keep }" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tst = {'cell_type': 'code',\n", " 'execution_count': 26,\n", " 'metadata': {'hide_input': True, 'meta': 23},\n", " 'outputs': [{'execution_count': 2,\n", " 'data': {\n", " 'application/vnd.google.colaboratory.intrinsic+json': {\n", " 'type': 'string'},\n", " 'plain/text': ['sample output',]\n", " },\n", " 'output': 'super'}],\n", " 'source': 'awesome_code'}\n", "nb = {'metadata': {'kernelspec': 'some_spec', 'jekyll': 'some_meta', 'meta': 37},\n", " 'cells': [tst]}\n", "\n", "clean_nb(nb)\n", "test_eq(nb['cells'][0], {'cell_type': 'code',\n", " 'execution_count': None,\n", " 'metadata': {'hide_input': True},\n", " 'outputs': [{'execution_count': None, \n", " 'data': { 'plain/text': ['sample output',]},\n", " 'output': 'super'}],\n", " 'source': 'awesome_code'})\n", "test_eq(nb['metadata'], {'kernelspec': 'some_spec', 'jekyll': 'some_meta'})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def _print_output(nb):\n", " \"Print `nb` in stdout for git things\"\n", " _output_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')\n", " x = json.dumps(nb, sort_keys=True, indent=1, ensure_ascii=False)\n", " _output_stream.write(x)\n", " _output_stream.write(\"\\n\")\n", " _output_stream.flush()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "BSLASH = '\\x5c'\n", "def clean_cr(s): return re.sub(fr'{BSLASH}{BSLASH}r(\\\\n)?', r'\\\\n', s)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "assert clean_cr(fr'a{BSLASH}r\\nb{BSLASH}rc\\n') == fr'a\\nb\\nc\\n'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Main function" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "@call_parse\n", "def nbdev_clean_nbs(fname:Param(\"A notebook name or glob to convert\", str)=None,\n", " clear_all:Param(\"Clean all metadata and outputs\", bool)=False,\n", " disp:Param(\"Print the cleaned outputs\", bool)=False,\n", " read_input_stream:Param(\"Read input stram and not nb folder\")=False):\n", " \"Clean all notebooks in `fname` to avoid merge conflicts\"\n", " #Git hooks will pass the notebooks in the stdin\n", " if read_input_stream and sys.stdin:\n", " input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')\n", " nb = json.load(input_stream)\n", " clean_nb(nb, clear_all=clear_all)\n", " _print_output(nb)\n", " return\n", " if fname is None:\n", " try: path = Config().path(\"nbs_path\")\n", " except Exception as e: path = Path.cwd()\n", " files = path.glob('**/*.ipynb') if fname is None else glob.glob(fname)\n", " for f in files:\n", " if not str(f).endswith('.ipynb'): continue\n", " nb_s = open(f, 'r', encoding='utf-8').read()\n", " nb = json.loads(clean_cr(nb_s))\n", " clean_nb(nb, clear_all=clear_all)\n", " if disp: _print_output(nb)\n", " else:\n", " x = json.dumps(nb, sort_keys=True, indent=1, ensure_ascii=False)\n", " with io.open(f, 'w', encoding='utf-8') as f:\n", " f.write(x)\n", " f.write(\"\\n\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "By default (`fname` left to `None`), the all the notebooks in `lib_folder` are cleaned. You can opt in to fully clean the notebook by removing every bit of metadata and the cell outputs by passing `clear_all=True`. `disp` is only used for internal use with git hooks and will print the clean notebook instead of saving it. Same for `read_input_stream` that will read the notebook from the input stream instead of the file names." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Export -" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Converted 00_export.ipynb.\n", "Converted 01_sync.ipynb.\n", "Converted 02_showdoc.ipynb.\n", "Converted 03_export2html.ipynb.\n", "Converted 04_test.ipynb.\n", "Converted 05_merge.ipynb.\n", "Converted 06_cli.ipynb.\n", "Converted 07_clean.ipynb.\n", "Converted 99_search.ipynb.\n", "Converted example.ipynb.\n", "Converted index.ipynb.\n", "Converted tutorial.ipynb.\n" ] } ], "source": [ "#hide\n", "from nbdev.export import notebook2script\n", "notebook2script()" ] } ], "metadata": { "jupytext": { "split_at_heading": true }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 0 }