{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Display changes in the text of an archived web page over time\n", "\n", "This notebook displays changes in the text content of a web page over time. It retrieves a list of available captures from a Memento Timemap, then compares each capture with its predecessor, displaying changes side-by-side.\n", "\n", "By default, the notebook only displays lines that have *changed*. If you want to see more context, you can adjust the parameters in the `show_all_differences()` function to show lines around each change, or the complete text content." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Parameters:\n", "\n", "url = \"\"\n", "archive = \"\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from difflib import HtmlDiff\n", "import requests\n", "from IPython.display import display, HTML\n", "import re\n", "import arrow\n", "from bs4 import BeautifulSoup, Tag\n", "import ipywidgets as widgets\n", "import json\n", "from collections import deque\n", "from urllib.parse import quote\n", "import time\n", "\n", "# This is to restyle the standard html table output from difflib\n", "HTML('')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%javascript\n", "// This is necessary in Jupyter notebook to stop the output area folding up\n", "// Will give an error in Jupyter Lab\n", "IPython.OutputArea.prototype._should_scroll = function(lines) {return false}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Default list of repositories -- you could add to this\n", "TIMEGATES = {\n", " 'nla': 'https://web.archive.org.au/awa/',\n", " 'nlnz': 'https://ndhadeliver.natlib.govt.nz/webarchive/wayback/',\n", " 'bl': 'https://www.webarchive.org.uk/wayback/archive/',\n", " 'ia': 'https://web.archive.org/web/'\n", "}\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This deque will only hold a maximum of two values\n", "# So we can just push new pages into it, and it will shove the old ones out the back.\n", "html_data = deque('', 2)\n", "\n", "def get_html(url):\n", " response = requests.get(url)\n", " # Sometimes the Mementos don't go to captures?!\n", " # Eg https://web.archive.org.au/awa/20090912180610id_/http://www.discontents.com.au/\n", " try:\n", " timestamp = re.search(r'/(\\d{14})id_/', response.url).group(1)\n", " except AttributeError:\n", " return None\n", " return {'url': response.url, 'html': response.content}\n", "\n", "def format_date(url):\n", " timestamp = re.search(r'/(\\d{14})id_/', url).group(1)\n", " return arrow.get(timestamp, 'YYYYMMDDHHmmss').format('D MMMM YYYY')\n", "\n", "def convert_lists_to_dicts(results):\n", " '''\n", " Converts IA style timemap (a JSON array of arrays) to a list of dictionaries.\n", " Renames keys to standardise IA with other Timemaps.\n", " '''\n", " if results:\n", " keys = results[0]\n", " results_as_dicts = [dict(zip(keys, v)) for v in results[1:]]\n", " else:\n", " results_as_dicts = results\n", " # Rename keys\n", " for d in results_as_dicts:\n", " d['status'] = d.pop('statuscode')\n", " d['mime'] = d.pop('mimetype')\n", " d['url'] = d.pop('original')\n", " return results_as_dicts\n", "\n", "def get_capture_data_from_memento(url, request_type='head'):\n", " '''\n", " For OpenWayback systems this can get some extra capture info to insert in Timemaps.\n", " '''\n", " if request_type == 'head':\n", " response = requests.head(url)\n", " else:\n", " response = requests.get(url)\n", " headers = response.headers\n", " length = headers.get('x-archive-orig-content-length')\n", " status = headers.get('x-archive-orig-status')\n", " status = status.split(' ')[0] if status else None\n", " mime = headers.get('x-archive-orig-content-type')\n", " mime = mime.split(';')[0] if mime else None\n", " return {'length': length, 'status': status, 'mime': mime}\n", "\n", "def convert_link_to_json(results, enrich_data=False):\n", " '''\n", " Converts link formatted Timemap to JSON.\n", " '''\n", " data = []\n", " for line in results.splitlines():\n", " parts = line.split('; ')\n", " if len(parts) > 1:\n", " link_type = re.search(r'rel=\"(original|self|timegate|first memento|last memento|memento)\"', parts[1]).group(1)\n", " if link_type == 'memento':\n", " link = parts[0].strip('<>')\n", " timestamp, original = re.search(r'/(\\d{14})/(.*)$', link).groups()\n", " capture = {'timestamp': timestamp, 'url': original}\n", " if enrich_data:\n", " capture.update(get_capture_data_from_memento(link))\n", " data.append(capture)\n", " return data\n", " \n", "def get_timemap_as_json(timegate, url):\n", " '''\n", " Get a Timemap then normalise results (if necessary) to return a list of dicts.\n", " '''\n", " tg_url = f'{TIMEGATES[timegate]}timemap/json/{url}/'\n", " response = requests.get(tg_url)\n", " response_type = response.headers['content-type']\n", " # pywb style Timemap\n", " if response_type == 'text/x-ndjson':\n", " data = [json.loads(line) for line in response.text.splitlines()]\n", " # IA Wayback stype Timemap\n", " elif response_type == 'application/json':\n", " data = convert_lists_to_dicts(response.json())\n", " # Link style Timemap (OpenWayback)\n", " elif response_type in ['application/link-format', 'text/html;charset=utf-8']:\n", " data = convert_link_to_json(response.text)\n", " return data\n", "\n", "def process_text(html):\n", " '''\n", " Extract text from an HTML page and return it as a list of lines.\n", " Removes blank lines.\n", " '''\n", " lines = [l for l in BeautifulSoup(html).get_text().splitlines() if not re.match(r'^\\s*$', l)]\n", " return lines\n", "\n", "def format_date_link(url):\n", " date = format_date(url)\n", " return f'{date}'\n", "\n", "def show_line_differences(context=True, numlines=0):\n", " '''\n", " Use difflib to show a side-by-side comparison of the text in two web pages.\n", " '''\n", " differ = HtmlDiff()\n", " doc1 = process_text(html_data[0]['html'])\n", " doc2 = process_text(html_data[1]['html'])\n", " date1 = format_date_link(html_data[0]['url'])\n", " date2 = format_date_link(html_data[1]['url'])\n", " html = differ.make_table(doc1, doc2, context=context, numlines=numlines, fromdesc=date1, todesc=date2)\n", " # Rewrite the table html to make the column widths work better\n", " html = html.replace(r'', '').replace('', '')\n", " with out:\n", " display(HTML(html))\n", "\n", "def show_all_differences(timegate, url):\n", " '''\n", " Get all captures for a page from a Timemap, then compare each page with its predecessor,\n", " display changes side-by-side.\n", " '''\n", " global html_data\n", " timemap = get_timemap_as_json(timegate, url)\n", " with out:\n", " key = 'Key'\n", " display(HTML(key))\n", " for i, capture in enumerate(timemap):\n", " capture_url = f'{TIMEGATES[timegate]}{capture[\"timestamp\"]}id_/{capture[\"url\"]}'\n", " if timegate == 'nlnz' or (capture['digest'] != timemap[i-1]['digest'] and capture['status'] == '200'):\n", " capture_data = get_html(capture_url)\n", " if capture_data:\n", " html_data.append(capture_data)\n", " if len(html_data) == 2:\n", " # You could change the params below to show context around changes\n", " # context=False -- shows the whole document\n", " # numlines -- when context=True, the number of lines to show around the diff \n", " # numline=0 -- just the diffs\n", " show_line_differences(context=True, numlines=0)\n", "\n", "def share_this():\n", " binder_url = 'https://mybinder.org/v2/gh/GLAM-Workbench/web-archives/master?urlpath=/apps/display-text-changes-from-timemap.ipynb'\n", " parameter_string = quote(f'?url=\"{target_url.value}\"&archive=\"{repository.value}\"')\n", " share_url = f'{binder_url}{parameter_string}'\n", " with out:\n", " display(HTML(f'

Share this: {share_url}

'))\n", "\n", "def start(e):\n", " clear('e')\n", " show_all_differences(repository.value, target_url.value)\n", " share_this()\n", " \n", "\n", "def clear(e):\n", " global html_data\n", " html_data.clear()\n", " out.clear_output()\n", " \n", "out = widgets.Output()\n", "\n", "repository = widgets.Dropdown(\n", " options=[('---', ''), ('UK Web Archive', 'bl'), ('National Library of Australia', 'nla'), ('National Library of New Zealand', 'nlnz'), ('Internet Archive', 'ia')],\n", " description='Archive:',\n", " disabled=False,\n", " value=archive\n", ")\n", "\n", "target_url = widgets.Text(description='URL:', value=url)\n", "\n", "tc_button = widgets.Button(description='Show text changes', button_style='primary')\n", "tc_button.on_click(start)\n", "clear_button = widgets.Button(description='Clear all')\n", "clear_button.on_click(clear)\n", "\n", "display(widgets.HBox([repository, target_url], layout=widgets.Layout(padding='10px')))\n", "display(widgets.HBox([tc_button, clear_button], layout=widgets.Layout(padding='10px')))\n", "display(out)\n", "\n", "if archive and url:\n", " start('e')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----\n", "Created by [Tim Sherratt](https://timsherratt.org) for the [GLAM Workbench](https://glam-workbench.github.io).\n", "\n", "Work on this notebook was supported by the [IIPC Discretionary Funding Programme 2019-2020](http://netpreserve.org/projects/)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }