{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Display changes in the text of an archived web page over time\n", "\n", "[View in GitHub](https://github.com/GLAM-Workbench/web-archives/blob/master/display-text-changes-from-timemap.ipynb) · [View in GLAM Workbench](https://glam-workbench.net/web-archives/#display-changes-in-the-text-of-an-archived-web-page-over-time)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This notebook is designed to run in Voila as an app (with the code hidden).\n", "# To launch this notebook in Voila, just select 'View > Open with Voila in New Browser Tab'\n", "# Your browser might ask for permission to open the new tab as a popup." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook displays changes in the text content of a web page over time. It retrieves a list of available captures from a Memento Timemap, then compares each capture with its predecessor, displaying changes side-by-side.\n", "\n", "By default, the notebook only displays lines that have *changed*. If you want to see more context, you can adjust the parameters in the `show_all_differences()` function to show lines around each change, or the complete text content." 
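, "\n",
"The side-by-side comparison is built with Python's `difflib.HtmlDiff`. As a minimal, self-contained sketch (separate from the notebook's own code, with made-up sample lines), this is how two lists of text lines become an HTML diff table:\n",
"\n",
"```python\n",
"from difflib import HtmlDiff\n",
"\n",
"# Hypothetical sample data standing in for text extracted from two captures\n",
"old_lines = [\"Welcome to my site\", \"Last updated 2005\"]\n",
"new_lines = [\"Welcome to my site\", \"Last updated 2006\"]\n",
"\n",
"# context=True with numlines=0 shows only the lines that changed,\n",
"# mirroring the defaults used by this notebook.\n",
"table = HtmlDiff().make_table(old_lines, new_lines, context=True, numlines=0)\n",
"```\n",
"\n",
"Passing `context=False` instead renders the complete documents side-by-side."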
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import json\n", "import os\n", "import re\n", "from collections import deque\n", "from difflib import HtmlDiff\n", "from urllib.parse import parse_qs, quote\n", "\n", "import arrow\n", "import ipywidgets as widgets\n", "import requests\n", "from bs4 import BeautifulSoup\n", "from IPython.display import HTML, display\n", "\n", "# This is to restyle the standard html table output from difflib\n", "# (uses difflib's standard classes and legend colours)\n", "HTML(\n", " '<style>table.diff {width: 100%; font-family: monospace;} '\n", " 'td.diff_header {text-align: right;} '\n", " '.diff_add {background-color: #aaffaa} '\n", " '.diff_chg {background-color: #ffff77} '\n", " '.diff_sub {background-color: #ffaaaa}</style>'\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Default list of repositories -- you could add to this\n", "TIMEGATES = {\n", " \"nla\": \"https://web.archive.org.au/awa/\",\n", " \"nlnz\": \"https://ndhadeliver.natlib.govt.nz/webarchive/wayback/\",\n", " \"bl\": \"https://www.webarchive.org.uk/wayback/archive/\",\n", " \"ia\": \"https://web.archive.org/web/\",\n", " \"ukgwa\": \"https://webarchive.nationalarchives.gov.uk/ukgwa/\"\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This deque will only hold a maximum of two values\n", "# So we can just push new pages into it, and it will shove the old ones out the back.\n", "html_data = deque(\"\", 2)\n", "\n", "\n", "def get_html(url):\n", " response = requests.get(url)\n", " # Sometimes the Mementos don't go to captures?!\n", " # Eg https://web.archive.org.au/awa/20090912180610id_/http://www.discontents.com.au/\n", " try:\n", " re.search(r\"/(\\d{12}|\\d{14})id_/\", response.url).group(1)\n", " except AttributeError:\n", " return None\n", " return {\"url\": response.url, \"html\": response.content}\n", "\n", "\n", "def format_date(url):\n", " timestamp = re.search(r\"/(\\d{12}|\\d{14})id_/\", url).group(1)\n", " return arrow.get(timestamp, \"YYYYMMDDHHmmss\").format(\"D MMMM YYYY\")\n", "\n", "\n", "def convert_lists_to_dicts(results):\n", " \"\"\"\n", " Converts IA style timemap (a JSON array of 
arrays) to a list of dictionaries.\n", " Renames keys to standardise IA with other Timemaps.\n", " \"\"\"\n", " if results:\n", " keys = results[0]\n", " results_as_dicts = [dict(zip(keys, v)) for v in results[1:]]\n", " else:\n", " results_as_dicts = results\n", " # Rename keys\n", " for d in results_as_dicts:\n", " d[\"status\"] = d.pop(\"statuscode\")\n", " d[\"mime\"] = d.pop(\"mimetype\")\n", " d[\"url\"] = d.pop(\"original\")\n", " return results_as_dicts\n", "\n", "\n", "def get_capture_data_from_memento(url, request_type=\"head\"):\n", " \"\"\"\n", " For OpenWayback systems this can get some extra capture info to insert in Timemaps.\n", " \"\"\"\n", " if request_type == \"head\":\n", " response = requests.head(url)\n", " else:\n", " response = requests.get(url)\n", " headers = response.headers\n", " length = headers.get(\"x-archive-orig-content-length\")\n", " status = headers.get(\"x-archive-orig-status\")\n", " status = status.split(\" \")[0] if status else None\n", " mime = headers.get(\"x-archive-orig-content-type\")\n", " mime = mime.split(\";\")[0] if mime else None\n", " return {\"length\": length, \"status\": status, \"mime\": mime}\n", "\n", "\n", "def convert_link_to_json(results, enrich_data=False):\n", " \"\"\"\n", " Converts link formatted Timemap to JSON.\n", " \"\"\"\n", " data = []\n", " for line in results.splitlines():\n", " parts = line.split(\"; \")\n", " if len(parts) > 1:\n", " link_type = re.search(\n", " r'rel=\"(original|self|timegate|first memento|last memento|memento)\"',\n", " parts[1],\n", " ).group(1)\n", " if link_type == \"memento\":\n", " link = parts[0].strip(\"<>\")\n", " timestamp, original = re.search(r\"/(\\d{12}|\\d{14})/(.*)$\", link).groups()\n", " capture = {\"timestamp\": timestamp, \"url\": original}\n", " if enrich_data:\n", " capture.update(get_capture_data_from_memento(link))\n", " data.append(capture)\n", " return data\n", "\n", "\n", "def get_timemap_as_json(timegate, url):\n", " \"\"\"\n", " Get a Timemap 
then normalise results (if necessary) to return a list of dicts.\n", " \"\"\"\n", " tg_url = f\"{TIMEGATES[timegate]}timemap/json/{url}/\"\n", " response = requests.get(tg_url)\n", " response_type = response.headers[\"content-type\"]\n", " # pywb style Timemap\n", " if response_type == \"text/x-ndjson\":\n", " data = [json.loads(line) for line in response.text.splitlines()]\n", " # IA Wayback style Timemap\n", " elif response_type == \"application/json\":\n", " data = convert_lists_to_dicts(response.json())\n", " # Link style Timemap (OpenWayback)\n", " elif response_type in [\"application/link-format\", \"text/html;charset=utf-8\"]:\n", " data = convert_link_to_json(response.text)\n", " # Unrecognised Timemap format\n", " else:\n", " data = []\n", " return data\n", "\n", "\n", "def process_text(html):\n", " \"\"\"\n", " Extract text from an HTML page and return it as a list of lines.\n", " Removes blank lines.\n", " \"\"\"\n", " lines = [\n", " line\n", " for line in BeautifulSoup(html, \"html.parser\").get_text().splitlines()\n", " if not re.match(r\"^\\s*$\", line)\n", " ]\n", " return lines\n", "\n", "\n", "def format_date_link(url):\n", " date = format_date(url)\n", " return f'<a href=\"{url}\">{date}</a>'\n", "\n", "\n", "def show_line_differences(context=True, numlines=0):\n", " \"\"\"\n", " Use difflib to show a side-by-side comparison of the text in two web pages.\n", " \"\"\"\n", " differ = HtmlDiff()\n", " doc1 = process_text(html_data[0][\"html\"])\n", " doc2 = process_text(html_data[1][\"html\"])\n", " date1 = format_date_link(html_data[0][\"url\"])\n", " date2 = format_date_link(html_data[1][\"url\"])\n", " html = differ.make_table(\n", " doc1, doc2, context=context, numlines=numlines, fromdesc=date1, todesc=date2\n", " )\n", " # Rewrite the table html to make the column widths work better\n", " html = html.replace(\n", " r'<td nowrap=\"nowrap\">',\n", " '<td style=\"width: 45%; white-space: normal; word-wrap: break-word;\">',\n", " )\n", " with out:\n", " display(HTML(html))\n", "\n", "\n", "def show_all_differences(timegate, url):\n", " \"\"\"\n", " Get all captures for a page from a Timemap, then compare each page with its 
predecessor,\n", " displaying changes side-by-side.\n", " \"\"\"\n", " global html_data\n", " timemap = get_timemap_as_json(timegate, url)\n", " with out:\n", " key = ('Key: <span class=\"diff_add\">added</span> '\n", " '<span class=\"diff_chg\">changed</span> '\n", " '<span class=\"diff_sub\">deleted</span>')\n", " display(HTML(key))\n", " for i, capture in enumerate(timemap):\n", " capture_url = f'{TIMEGATES[timegate]}{capture[\"timestamp\"]}id_/{capture[\"url\"]}'\n", " if timegate == \"nlnz\" or (\n", " capture[\"digest\"] != timemap[i - 1][\"digest\"] and capture[\"status\"] == \"200\"\n", " ):\n", " capture_data = get_html(capture_url)\n", " if capture_data:\n", " html_data.append(capture_data)\n", " if len(html_data) == 2:\n", " # You could change the params below to show context around changes\n", " # context=False -- shows the whole document\n", " # numlines -- when context=True, the number of lines to show around the diff\n", " # numlines=0 -- just the diffs\n", " show_line_differences(context=True, numlines=0)\n", "\n", "\n", "def share_this():\n", " binder_url = \"https://mybinder.org/v2/gh/GLAM-Workbench/web-archives/master?urlpath=/voila/render/display-text-changes-from-timemap.ipynb\"\n", " parameter_string = quote(f\"?url={target_url.value}&archive={repository.value}\")\n", " share_url = f\"{binder_url}{parameter_string}\"\n", " with out:\n", " display(HTML(f'<p>Share this: <a href=\"{share_url}\">{share_url}</a></p>'))\n", "\n", "\n", "def start(e):\n", " clear(\"e\")\n", " show_all_differences(repository.value, target_url.value)\n", " share_this()\n", "\n", "\n", "def clear(e):\n", " global html_data\n", " html_data.clear()\n", " out.clear_output()\n", "\n", "\n", "query_string = os.environ.get(\"QUERY_STRING\", \"\")\n", "parameters = parse_qs(query_string)\n", "url = parameters.get(\"url\", [\"\"])[0]\n", "archive = parameters.get(\"archive\", [\"\"])[0]\n", "\n", "out = widgets.Output()\n", "\n", "repository = widgets.Dropdown(\n", " options=[\n", " (\"---\", \"\"),\n", " (\"UK Web Archive\", \"bl\"),\n", " (\"UK Government Web Archive\", \"ukgwa\"),\n", " (\"National Library of Australia\", \"nla\"),\n", " (\"National Library of New Zealand\", \"nlnz\"),\n", " (\"Internet Archive\", \"ia\"),\n", " ],\n", " description=\"Archive:\",\n", " disabled=False,\n", " value=archive,\n", ")\n", "\n", "target_url = widgets.Text(description=\"URL:\", value=url)\n", "\n", "tc_button = widgets.Button(\n", " description=\"Show text changes\",\n", " button_style=\"primary\",\n", ")\n", "tc_button.on_click(start)\n", "clear_button = widgets.Button(description=\"Clear all\")\n", "clear_button.on_click(clear)\n", "\n", "display(widgets.HBox([repository, target_url], layout=widgets.Layout(padding=\"10px\")))\n", "display(widgets.HBox([tc_button, clear_button], layout=widgets.Layout(padding=\"10px\")))\n", "display(out)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%capture\n", "%load_ext dotenv\n", "%dotenv\n", "\n", "# Insert some values for automated testing\n", "\n", "if os.getenv(\"GW_STATUS\") == \"dev\":\n", " url = \"http://discontents.com.au/2017-the-making-and-the-talking/\"\n", " archive = \"ia\"\n", "\n", " target_url.value = url\n", " repository.value = archive" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# If values have been provided via url or above, then start 
automatically.\n", "# Note that Voila widgets don't load immediately, hence the polling to\n", "# make sure the start button exists.\n", "\n", "if url and archive:\n", " script = \"\"\"<script type=\"text/javascript\">\n", " // Keep polling until the primary ('Show text changes') button\n", " // has been rendered, then click it.\n", " var timer = setInterval(function() {\n", " var button = document.querySelector(\"button.mod-primary\");\n", " if (button) {\n", " clearInterval(timer);\n", " button.click();\n", " }\n", " }, 500);\n", " </script>\"\"\"\n", " display(HTML(script))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----\n", "Created by [Tim Sherratt](https://timsherratt.org) for the [GLAM Workbench](https://glam-workbench.github.io). Support me by becoming a [GitHub sponsor](https://github.com/sponsors/wragge)!\n", "\n", "Work on this notebook was supported by the [IIPC Discretionary Funding Programme 2019-2020](http://netpreserve.org/projects/).\n", "\n", "The Web Archives section of the GLAM Workbench is sponsored by the [British Library](https://www.bl.uk/)." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 4 }