{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#default_exp linkcheck"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# fastlinkcheck API\n",
    "> API for fast local and online link checking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "from fastcore.all import *\n",
    "from html.parser import HTMLParser\n",
    "from urllib.parse import urlparse,urlunparse\n",
    "from fastcore.script import SCRIPT_INFO\n",
    "import sys"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Find links in an HTML file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "class _HTMLParseAttrs(HTMLParser):\n",
    "    def reset(self):\n",
    "        super().reset()\n",
    "        self.found = set()\n",
    "    def handle_starttag(self, tag, attrs):\n",
    "        a = first(v for k,v in attrs if k in (\"src\",\"href\"))\n",
    "        if a: self.found.add(a)\n",
    "    handle_startendtag = handle_starttag"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def get_links(fn):\n",
    "    \"List of all links in file `fn`\"\n",
    "    h = _HTMLParseAttrs()\n",
    "    h.feed(Path(fn).read_text())\n",
    "    return L(h.found)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can use `get_links` to parse an HTML file for different types of links.  For example, this is the contents of `./example/test.html`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<a href=\"//somecdn.com/doesntexist.html\"></a>\n",
      "<a href=\"http://www.bing.com\"></a>\n",
      "<script src=\"test.js\"></script>\n",
      "<img src=\"http://fastlinkcheck.com/test.html\" />\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "example = Path('_example/test.html')\n",
    "print(example.read_text())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Calling `get_links` with the above file path will return a list of links:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "links = get_links(example)\n",
    "test_eq(set(links), {'test.js',\n",
    "                     '//somecdn.com/doesntexist.html',\n",
    "                     'http://www.bing.com','http://fastlinkcheck.com/test.html'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def _local_url(u, root, host, fname):\n",
    "    \"Change url `u` to local path if it is a local link\"\n",
    "    fpath = Path(fname).parent\n",
    "    islocal=False\n",
    "    # remove `host` prefix\n",
    "    for o in 'http://','https://','http://www.','https://www.':\n",
    "        if u.startswith(o+host): u,islocal = remove_prefix(u, o+host),True\n",
    "    # remove params, querystring, and fragment\n",
    "    p = list(urlparse(u))[:5]+['']\n",
    "    # local prefix, or no protocol or host\n",
    "    if islocal or (not p[0] and not p[1]):\n",
    "        u = p[2]\n",
    "        if u and u[0]=='/': return (root/u[1:]).resolve()\n",
    "        else: return (fpath/u).resolve()\n",
    "    # URLs without a protocol are \"protocol relative\"\n",
    "    if not p[0]: p[0]='http'\n",
    "    # mailto etc are not checked\n",
    "    if p[0] not in ('http','https'): return ''\n",
    "    return urlunparse(p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "class _LinkMap(dict):\n",
    "    \"\"\"A dict that pretty prints Links and their associated locations.\"\"\"\n",
    "    def _repr_locs(self, k): return '\\n'.join(f'  - `{p}`' for p in self[k])\n",
    "    def __repr__(self):\n",
    "        rstr = L(f'- {k!r} was found in the following pages:\\n{self._repr_locs(k)}' for k in self).concat()\n",
    "        return '\\n'.join(rstr)\n",
    "    _repr_markdown_ = __repr__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def local_urls(path:Path, host:str):\n",
    "    \"returns a `dict` mapping all HTML files in `path` to a list of locally-resolved links in that file\"\n",
    "    path=Path(path)\n",
    "    fns = L(path.glob('**/*.html'))+L(path.glob('**/*.htm'))\n",
    "    found = [(fn.resolve(),_local_url(link, root=path, host=host, fname=fn))\n",
    "             for fn in fns for link in get_links(fn)]\n",
    "    return _LinkMap(groupby(found, 1, 0))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The keys of the `dict` returned by `local_urls` are links found in HTML files, and the values of this `dict` are a list of paths that those links are found in.  \n",
    "\n",
    "Furthermore, local links are returned as `Path` objects, whereas external URLs are strings.  For example, notice how the link:\n",
    "\n",
    "\n",
    "`http://fastlinkcheck.com/test.html`\n",
    "\n",
    "\n",
    "is resolved to a local path, because the `host` parameter supplied to `local_urls`, `fastlinkcheck.com` matches the url in the link: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/markdown": [
       "- 'http://somecdn.com/doesntexist.html' was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`\n",
       "- Path('/Users/hamelsmu/github/fastlinkcheck/_example/test.html') was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`\n",
       "- 'http://www.bing.com' was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`\n",
       "- Path('/Users/hamelsmu/github/fastlinkcheck/_example/test.js') was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`"
      ],
      "text/plain": [
       "- 'http://somecdn.com/doesntexist.html' was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`\n",
       "- Path('/Users/hamelsmu/github/fastlinkcheck/_example/test.html') was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`\n",
       "- 'http://www.bing.com' was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`\n",
       "- Path('/Users/hamelsmu/github/fastlinkcheck/_example/test.js') was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "path = Path('./_example')\n",
    "links = local_urls(path, host='fastlinkcheck.com')\n",
    "links"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Finding broken links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def broken_local(links, ignore_paths=None):\n",
    "    \"List of items in keys of `links` that are `Path`s that do not exist\"\n",
    "    ignore_paths = setify(ignore_paths)\n",
    "    return L(o for o in links if isinstance(o,Path) and o not in ignore_paths and not o.exists())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Since `test.js` does not exist in the `example/` directory, `broken_local` returns this path:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(#1) [Path('/Users/hamelsmu/github/fastlinkcheck/_example/test.js')]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "broken_local(links)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "assert not all([x.exists() for x in broken_local(links)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def broken_urls(links, ignore_urls=None):\n",
    "    \"List of items in keys of `links` that are URLs that return a failure status code\"\n",
    "    ignore_urls = setify(ignore_urls)\n",
    "    its = L(o for o in links if isinstance(o, str) and o not in ignore_urls)\n",
    "    working_urls = parallel(urlcheck, its, n_workers=32, threadpool=True)\n",
    "    return L(o for o,p in zip(its,working_urls) if not p)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Similarly the url `http://somecdn.com/doesntexist.html` doesn't exist, which is why it is returned by `broken_urls`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "assert broken_urls(links) == ['http://somecdn.com/doesntexist.html']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "@call_parse\n",
    "def link_check(path:Param(\"Root directory searched recursively for HTML files\", str),\n",
    "               host:Param(\"Host and path (without protocol) of web server\", str)='',\n",
    "               config_file:Param(\"Location of file with urls to ignore\",str)=None,\n",
    "               actions_output:Param(\"Toggle GitHub Actions output on/off\",store_true)=False,\n",
    "               exit_on_found:Param(\"(CLI Only) Exit with status code 1 if broken links are found\", store_true)=False,\n",
    "               print_logs:Param(\"Toggle printing logs to stdout.\", store_true)=False):\n",
    "    \"\"\"Check for broken links recursively in `path`.\"\"\"\n",
    "    path = Path(path)\n",
    "    is_cli = (SCRIPT_INFO.func == 'link_check')\n",
    "    assert path.exists(), f\"{path.absolute()} does not exist.\"\n",
    "    if config_file: assert Path(config_file).is_file(), f\"{config_file} is either not a file or doesn't exist.\"\n",
    "    ignore = L(x.strip() for x in (Path(config_file).readlines() if config_file else ''))\n",
    "    links = local_urls(path, host=host)\n",
    "    ignore_paths = set((path/o).resolve() for o in ignore if not urlvalid(o))\n",
    "    ignore_urls = set(ignore.filter(urlvalid))\n",
    "    lm = _LinkMap({k:links[k] for k in (broken_urls(links, ignore_urls) + broken_local(links, ignore_paths))})\n",
    "    if actions_output: print(f\"::set-output name=broken_links::{bool(lm)}\")\n",
    "    msg = f'\\nERROR: The Following Broken Links or Paths were found:\\n{lm}' if lm else 'No Broken Links Found!'\n",
    "    if print_logs or is_cli: print(msg)\n",
    "    if is_cli and lm and exit_on_found: sys.exit(1)\n",
    "    else: return lm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "- 'http://somecdn.com/doesntexist.html' was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`\n",
       "- Path('/Users/hamelsmu/github/fastlinkcheck/_example/test.js') was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`"
      ],
      "text/plain": [
       "- 'http://somecdn.com/doesntexist.html' was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`\n",
       "- Path('/Users/hamelsmu/github/fastlinkcheck/_example/test.js') was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "link_check(path='_example', host='fastlinkcheck.com')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Ignore links with a configuration file"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can choose to ignore files with a a plain-text file containing a list of urls to ignore.  For example, the file `linkcheck.rc` contains a list of urls I want to ignore:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test.js\n",
      "https://www.google.com\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print((path/'linkcheck.rc').read_text())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this case `example/test.js` will be filtered out from the list:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/markdown": [
       "- 'http://somecdn.com/doesntexist.html' was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`"
      ],
      "text/plain": [
       "- 'http://somecdn.com/doesntexist.html' was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "link_check(path='_example', host='fastlinkcheck.com', config_file='_example/linkcheck.rc')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can optionally emit the variable `broken_links` for GitHub Actions [as discussed here](https://docs.github.com/en/free-pro-team@latest/actions/reference/workflow-commands-for-github-actions#setting-an-output-parameter).  This variable will be either `True` or `False` depending on if broken links are found or not:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "::set-output name=broken_links::True\n"
     ]
    },
    {
     "data": {
      "text/markdown": [
       "- 'http://somecdn.com/doesntexist.html' was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`\n",
       "- Path('/Users/hamelsmu/github/fastlinkcheck/_example/test.js') was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`"
      ],
      "text/plain": [
       "- 'http://somecdn.com/doesntexist.html' was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`\n",
       "- Path('/Users/hamelsmu/github/fastlinkcheck/_example/test.js') was found in the following pages:\n",
       "  - `/Users/hamelsmu/github/fastlinkcheck/_example/test.html`"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "link_check(path=\"_example\", host=\"fastlinkcheck.com\", actions_output=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`link_check` can also be called use from the command line like this:\n",
    "\n",
 Note: the">
    "> Note: the `!` command in Jupyter allows you to [run shell commands](https://stackoverflow.com/questions/38694081/executing-terminal-commands-in-jupyter-notebook/48529220)\n",
    "\n",
    "The `-h` or `--help` flag will allow you to see the command line docs:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "usage: link_check [-h] [--host HOST] [--config_file CONFIG_FILE]\r\n",
      "                  [--actions_output] [--exit_on_found] [--print_logs] [--pdb]\r\n",
      "                  [--xtra XTRA]\r\n",
      "                  path\r\n",
      "\r\n",
      "Check for broken links recursively in `path`.\r\n",
      "\r\n",
      "positional arguments:\r\n",
      "  path                  Root directory searched recursively for HTML files\r\n",
      "\r\n",
      "optional arguments:\r\n",
      "  -h, --help            show this help message and exit\r\n",
      "  --host HOST           Host and path (without protocol) of web server\r\n",
      "                        (default: )\r\n",
      "  --config_file CONFIG_FILE\r\n",
      "                        Location of file with urls to ignore\r\n",
      "  --actions_output      Toggle GitHub Actions output on/off (default: False)\r\n",
      "  --exit_on_found       (CLI Only) Exit with status code 1 if broken links are\r\n",
      "                        found (default: False)\r\n",
      "  --print_logs          Toggle printing logs to stdout. (default: False)\r\n",
      "  --pdb                 Run in pdb debugger (default: False)\r\n",
      "  --xtra XTRA           Parse for additional args (default: '')\r\n"
     ]
    }
   ],
   "source": [
    "!link_check -h"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted index.ipynb.\n",
      "Converted linkcheck.ipynb.\n"
     ]
    }
   ],
   "source": [
    "#hide\n",
    "from nbdev.export import *\n",
    "notebook2script()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}