{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Compare two versions of an archived web page\n",
    "\n",
    "This notebook demonstrates a number of different ways of comparing versions of archived web pages. Just choose a repository, enter a url, and select two dates to see comparisons based on:\n",
    "\n",
    "* page metadata\n",
    "* basic statistics such as file size and number of words\n",
    "* numbers of internal and external links\n",
    "* cosine similarity of text\n",
    "* line by line differences in text or code\n",
    "* screenshots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<style>.diff_add {background-color: #d0e9c6;}.diff_sub {background-color: #ebcccc;} table.diff, table.diff thead {border: 1px solid black;} table.diff {table-layout: fixed; width: 100%;} th.diff_next, td.diff_next {width: 4%;} table.diff th.diff_header {text-align: left;} td {word-wrap: break-word;}</style>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from notebook.notebookapp import list_running_servers\n",
    "import requests\n",
    "from requests.compat import urljoin\n",
    "import json\n",
    "import os\n",
    "import ipykernel\n",
    "from difflib import HtmlDiff\n",
    "from IPython.display import display, HTML\n",
    "import re\n",
    "import arrow\n",
    "from bs4 import BeautifulSoup, Tag\n",
    "import ipywidgets as widgets\n",
    "from selenium import webdriver\n",
    "import selenium\n",
    "from PIL import Image\n",
    "import PIL\n",
    "import io\n",
    "import math\n",
    "import base64\n",
    "import time\n",
    "from slugify import slugify\n",
    "from webdriverdownloader import GeckoDriverDownloader\n",
    "from pathlib import Path\n",
    "from urllib.parse import urlparse\n",
    "import trafilatura as tf\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "import pandas as pd\n",
    "import jsons\n",
    "import ipywidgets as widgets\n",
    "from urllib.parse import quote\n",
    "\n",
    "# Download and install geckodriver v0.26.0. Element [1] of the result is kept so it can be\n",
    "# passed to Selenium as the Firefox driver's executable_path when taking screenshots.\n",
    "gdd = GeckoDriverDownloader()\n",
    "geckodriver = gdd.download_and_install(\"v0.26.0\")[1]\n",
    "\n",
    "# Add styles for the diff\n",
    "HTML('<style>.diff_add {background-color: #d0e9c6;}.diff_sub {background-color: #ebcccc;} table.diff, table.diff thead {border: 1px solid black;} table.diff {table-layout: fixed; width: 100%;} th.diff_next, td.diff_next {width: 4%;} table.diff th.diff_header {text-align: left;} td {word-wrap: break-word;}</style>')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parameters:\n",
    "# Set both url1 and url2 to Memento urls to compare those two captures directly.\n",
    "# Leave them empty to choose an archive, url, and dates using the form instead.\n",
    "url1 = ''\n",
    "url2 = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/javascript": [
       "// This is necessary in Jupyter notebook to stop the output area folding up\n",
       "// Will give a an error in Jupyter Lab that can be safely ignored\n",
       "IPython.OutputArea.prototype._should_scroll = function(lines) {return false}\n"
      ],
      "text/plain": [
       "<IPython.core.display.Javascript object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%%javascript\n",
    "// This is necessary in Jupyter notebook to stop the output area folding up\n",
    "// Will give an error in Jupyter Lab that can be safely ignored\n",
    "IPython.OutputArea.prototype._should_scroll = function(lines) {return false}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<p>This notebook was designed to run in Appmode (with all the code hidden).</p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<a style=\"padding: 5px; display:block; width: 200px; text-align: center; background-color: #2196F3; color: white;\" href=\"/apps/show_diffs.ipynb\">Launch in appmode</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "def create_appmode_link():\n",
    "    '''\n",
    "    Display a link for running the current notebook in appmode.\n",
    "    This is mainly useful in Jupyter Lab where appmode won't work.\n",
    "    Clicking on the link in Lab will take you into the classic notebook environment and run appmode.\n",
    "    If the notebook is already running in appmode, a note explaining this is displayed instead.\n",
    "    Does nothing if appmode isn't installed.\n",
    "    Modified from: https://github.com/jupyter/notebook/issues/1000\n",
    "    '''\n",
    "    try:\n",
    "        import appmode\n",
    "    except ImportError:\n",
    "        return\n",
    "    kernel_id = re.search(r'kernel-(.*)\\.json', ipykernel.connect.get_connection_file()).group(1)\n",
    "    for server in list_running_servers():\n",
    "        response = requests.get(urljoin(server['url'], 'api/sessions'), params={'token': server.get('token', '')})\n",
    "        for session in response.json():\n",
    "            if session['kernel']['id'] != kernel_id:\n",
    "                continue\n",
    "            relative_path = session['notebook']['path']\n",
    "            # Notebooks running in appmode are prefixed with '.' -- check the file name rather than the\n",
    "            # first character of the path, so that notebooks inside folders are recognised as well.\n",
    "            if relative_path.rsplit('/', 1)[-1].startswith('.'):\n",
    "                display(HTML('<p>This is a Jupyter notebook running in <a href=\"https://glam-workbench.github.io/getting-started/#running-notebooks-as-apps\">App mode</a> (with all the code hidden). To view and edit the code, click on the <b>Edit App</b> button.</p>'))\n",
    "            else:\n",
    "                # Urls always use '/', whereas os.path.join() uses the local file system's separator\n",
    "                url = '{}/apps/{}'.format(server['base_url'].rstrip('/'), relative_path.lstrip('/'))\n",
    "                display(HTML('<p>This notebook was designed to run in Appmode (with all the code hidden).</p>'))\n",
    "                display(HTML('<a style=\"padding: 5px; display:block; width: 200px; text-align: center; background-color: #2196F3; color: white;\" href=\"{}\">Launch in appmode</a>'.format(url)))\n",
    "\n",
    "create_appmode_link()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_memento(url):\n",
    "    '''\n",
    "    Report whether a url looks like a Memento, ie it contains a 14 digit timestamp\n",
    "    (optionally followed by id_, mp_, or if_ modifiers) immediately before the archived url.\n",
    "    '''\n",
    "    found = re.search(r'/\\d{14}(?:id_|mp_|if_)*/http', url)\n",
    "    return found is not None\n",
    "\n",
    "def get_timestamp(url):\n",
    "    '''\n",
    "    Pull the 14 digit timestamp out of a Memento url.\n",
    "    Raises an AttributeError if the url doesn't contain one.\n",
    "    '''\n",
    "    found = re.search(r'/(\\d{14})(?:if_|mp_|id_)*/', url)\n",
    "    return found.group(1)\n",
    "\n",
    "def get_dates(page_data):\n",
    "    '''\n",
    "    Build a list containing the formatted capture date of each saved page,\n",
    "    using the timestamp in the capture's url.\n",
    "    '''\n",
    "    return [format_date_from_timestamp(capture['url']) for capture in page_data]\n",
    "\n",
    "def get_html(url):\n",
    "    '''\n",
    "    Retrieve the original HTML content of an archived page.\n",
    "    Follow redirects if they go to another archived page.\n",
    "    Return a dict with the (possibly redirected) url from the response as 'url',\n",
    "    and the content of the response as 'html'.\n",
    "    '''\n",
    "    # Adding the id_ hint tells the archive to give us the original harvested version, without any rewriting.\n",
    "    # Any modifier already in the url (mp_, if_, or id_) is replaced, so urls copied from a framed\n",
    "    # or rewritten view are handled in the same way as plain Memento urls.\n",
    "    url = re.sub(r'/(\\d{14})(?:mp_|if_|id_)*/http', r'/\\1id_/http', url)\n",
    "    response = requests.get(url, allow_redirects=True)\n",
    "    # Some captures might redirect themselves to live versions\n",
    "    # If the redirected url doesn't look like a Memento rerun this without redirection\n",
    "    if not is_memento(response.url):\n",
    "        response = requests.get(url, allow_redirects=False)\n",
    "    return {'url': response.url, 'html': response.content}\n",
    "\n",
    "def get_all_text(capture_data):\n",
    "    '''\n",
    "    Extract all of the human visible text from a capture's html, including headers, footers, and navigation,\n",
    "    then tidy up the whitespace.\n",
    "    Returns None if a TypeError is raised while extracting the text.\n",
    "    '''\n",
    "    try:\n",
    "        text = BeautifulSoup(capture_data['html']).get_text()\n",
    "    except TypeError:\n",
    "        return None\n",
    "    # Each clean up step is a (pattern, replacement) pair, applied in this order\n",
    "    cleanups = [\n",
    "        # Collapse runs of blank lines\n",
    "        (r'\\n\\s*\\n', '\\n\\n'),\n",
    "        # Collapse runs of spaces or tabs into a single space\n",
    "        (r'( |\\t){2,}', ' '),\n",
    "        # Remove a space at the start of a line\n",
    "        (r'\\n ', '\\n'),\n",
    "        # Remove newlines at the start of the text\n",
    "        (r'^\\n*', '')\n",
    "    ]\n",
    "    for pattern, replacement in cleanups:\n",
    "        text = re.sub(pattern, replacement, text)\n",
    "    return text\n",
    "\n",
    "def get_main_text(capture_data):\n",
    "    '''\n",
    "    Use Trafilatura to extract just the main text of a capture's html,\n",
    "    leaving out boilerplate and navigation.\n",
    "    '''\n",
    "    return tf.extract(capture_data['html'])\n",
    "\n",
    "def load_data(urls):\n",
    "    '''\n",
    "    Fetch each of the supplied urls and append the results to the global page_data list.\n",
    "    Then add the full text ('text') and main text ('main_text') to every capture in page_data.\n",
    "    '''\n",
    "    global page_data\n",
    "    # Get the HTML of the archived pages\n",
    "    page_data.extend(get_html(url) for url in urls)\n",
    "    for capture in page_data:\n",
    "        # The human-readable text, then the main text only\n",
    "        capture['text'] = get_all_text(capture)\n",
    "        capture['main_text'] = get_main_text(capture)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# METADATA\n",
    "\n",
    "def get_page_metadata(html):\n",
    "    '''\n",
    "    Extract a page's metadata with Trafilatura and convert the result using jsons.dump().\n",
    "    '''\n",
    "    extracted = tf.metadata.extract_metadata(html)\n",
    "    return jsons.dump(extracted)\n",
    "\n",
    "def get_metadata(page_data):\n",
    "    '''\n",
    "    Build a list containing the extracted metadata of each saved page.\n",
    "    '''\n",
    "    return [get_page_metadata(capture['html']) for capture in page_data]\n",
    "\n",
    "def display_metadata(page_data):\n",
    "    '''\n",
    "    Show the metadata extracted from the saved pages as a table in md_out.\n",
    "    '''\n",
    "    records = get_metadata(page_data)\n",
    "    # The formatted capture dates label the rows of the table\n",
    "    row_labels = get_dates(page_data)\n",
    "    table = pd.DataFrame(records, index=row_labels)\n",
    "    with md_out:\n",
    "        display(HTML('<hr><h2>Metadata</h2>'))\n",
    "        display(table)\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# STATISTICS \n",
    "\n",
    "def size_in_bytes(html):\n",
    "    '''\n",
    "    Return the length of the supplied HTML.\n",
    "    The HTML is expected to be a bytes string, so its length is its size in bytes.\n",
    "    '''\n",
    "    byte_count = len(html)\n",
    "    return byte_count\n",
    "\n",
    "def number_of_words(text):\n",
    "    '''\n",
    "    Count the whitespace-separated chunks in a text.\n",
    "    (Punctuation attached to, or standing between, words is counted too.)\n",
    "    Returns 0 if the value supplied can't be split (eg None).\n",
    "    '''\n",
    "    try:\n",
    "        return len(text.split())\n",
    "    except AttributeError:\n",
    "        return 0\n",
    "\n",
    "def get_summary_data(capture):\n",
    "    '''\n",
    "    Gather the file size and word counts (all text and main text) for a single capture.\n",
    "    '''\n",
    "    file_size = size_in_bytes(capture['html'])\n",
    "    words_all = number_of_words(capture['text'])\n",
    "    words_main = number_of_words(capture['main_text'])\n",
    "    return {\n",
    "        'File size (bytes)': file_size,\n",
    "        'Number of words (all text)': words_all,\n",
    "        'Number of words (main text)': words_main\n",
    "    }\n",
    "\n",
    "def get_summaries(page_data):\n",
    "    '''\n",
    "    Build a list containing the summary statistics of each saved page.\n",
    "    '''\n",
    "    return [get_summary_data(capture) for capture in page_data]\n",
    "    \n",
    "def display_summaries(page_data):\n",
    "    '''\n",
    "    Show the summary statistics of the saved pages as a table in stats_out.\n",
    "    '''\n",
    "    stats = get_summaries(page_data)\n",
    "    row_labels = get_dates(page_data)\n",
    "    table = pd.DataFrame(stats, index=row_labels)\n",
    "    # Format the numbers with a thousands separator and no decimal places\n",
    "    styled = table.head().style.format(\"{:,.0f}\")\n",
    "    with stats_out:\n",
    "        display(HTML('<hr><h2>Statistics</h2>'))\n",
    "        display(styled)\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# LINKS\n",
    "\n",
    "def link_is_local(site, href):\n",
    "    '''\n",
    "    Check to see if a link is internal or external by looking to see if it includes the current domain.\n",
    "    The comparison ignores case, and whitespace around the link.\n",
    "\n",
    "    Parameters:\n",
    "        site: the domain of the current page (eg 'example.com')\n",
    "        href: the value of a link's href attribute\n",
    "\n",
    "    Returns:\n",
    "        False if the link is absolute (starts with 'http' or '//') and doesn't include the domain,\n",
    "        True for everything else.\n",
    "    '''\n",
    "    target = href.strip().lower()\n",
    "    # Relative urls will always be local of course\n",
    "    if not target.startswith(('http', '//')):\n",
    "        return True\n",
    "    # Lower case both sides, otherwise a domain harvested with capitals would never match\n",
    "    return site.lower() in target\n",
    "    \n",
    "def get_site_from_url(url):\n",
    "    '''\n",
    "    Get the domain of the archived page from a Memento url.\n",
    "    Any user details, port number, and leading 'www' are left out, and the domain is lower cased.\n",
    "    If the url doesn't look like a Memento, the domain of the url itself is used.\n",
    "    '''\n",
    "    # Separate the archived url from the archive's own address and timestamp\n",
    "    match = re.search(r'\\d{14}(?:id_|if_|mp_)*/(https?://.+)$', url)\n",
    "    archived_url = match.group(1) if match else url\n",
    "    # Let urlparse find the host, so that an '@' in the path or a missing trailing slash can't confuse things\n",
    "    site = urlparse(archived_url).hostname or ''\n",
    "    # Remove any wwws\n",
    "    site = re.sub(r'^www\\d*\\.', '', site)\n",
    "    return site\n",
    "\n",
    "def get_links_in_page(capture):\n",
    "    '''\n",
    "    Collect the unique internal and external link targets from a captured page,\n",
    "    in the order they first appear.\n",
    "    Returns a dict with 'internal' and 'external' lists.\n",
    "    '''\n",
    "    found = {'internal': [], 'external': []}\n",
    "    site = get_site_from_url(capture['url'])\n",
    "    soup = BeautifulSoup(capture['html'])\n",
    "    for anchor in soup.find_all('a'):\n",
    "        href = anchor.get('href')\n",
    "        if href is None:\n",
    "            # Anchors without a href don't link anywhere\n",
    "            continue\n",
    "        bucket = found['internal'] if link_is_local(site, href) else found['external']\n",
    "        if href not in bucket:\n",
    "            bucket.append(href)\n",
    "    return found\n",
    "\n",
    "def get_links(page_data):\n",
    "    '''\n",
    "    Build a list containing the internal and external links of each saved page.\n",
    "    '''\n",
    "    return [get_links_in_page(capture) for capture in page_data]\n",
    "\n",
    "def display_link_data(dates, all_links):\n",
    "    '''\n",
    "    Show a table of the total number of internal and external links in each saved page.\n",
    "    '''\n",
    "    totals = [\n",
    "        {'Total internal links': len(links['internal']), 'Total external links': len(links['external'])}\n",
    "        for links in all_links\n",
    "    ]\n",
    "    display(pd.DataFrame(totals, index=dates))\n",
    "    \n",
    "def make_clickable(val):\n",
    "    '''\n",
    "    Make the value of a Pandas cell into a clickable link.\n",
    "    Missing values (None or NaN) become an empty string.\n",
    "    The value is HTML-escaped, as the links come from harvested pages and could contain anything.\n",
    "    '''\n",
    "    from html import escape\n",
    "    if pd.isna(val):\n",
    "        return ''\n",
    "    safe = escape(str(val), quote=True)\n",
    "    return f'<a href=\"{safe}\">{safe}</a>'\n",
    "    \n",
    "def list_external_links(dates, all_links):\n",
    "    '''\n",
    "    Display a list of external links using Pandas, with one column for each date.\n",
    "    '''\n",
    "    # Put links into a dataframe, then transpose to make dates into columns\n",
    "    df = pd.DataFrame([links['external'] for links in all_links], index=dates).T\n",
    "    # Make links clickable and align left\n",
    "    df_styler = df.style.format(make_clickable).set_properties(**{'text-align': 'left'})\n",
    "    # Make the headers left aligned as well\n",
    "    df_styler.set_table_styles([dict(selector='th', props=[('text-align', 'left')])])\n",
    "    # Display without the index. Newer versions of pandas replace Styler.hide_index() with\n",
    "    # Styler.hide(axis='index'), so use whichever one is available.\n",
    "    if hasattr(df_styler, 'hide'):\n",
    "        df_styler = df_styler.hide(axis='index')\n",
    "    else:\n",
    "        df_styler = df_styler.hide_index()\n",
    "    display(df_styler)\n",
    "    \n",
    "def display_links(page_data):\n",
    "    '''\n",
    "    Show the link totals, followed by the list of external links, for the saved pages in links_out.\n",
    "    '''\n",
    "    links_by_page = get_links(page_data)\n",
    "    labels = get_dates(page_data)\n",
    "    with links_out:\n",
    "        display(HTML('<hr><h2>Links</h2>'))\n",
    "        display_link_data(labels, links_by_page)\n",
    "        display(HTML('<h4>External links</h4>'))\n",
    "        list_external_links(labels, links_by_page)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SIMILARITY\n",
    "\n",
    "def calculate_similarity(text1, text2):\n",
    "    '''\n",
    "    Calculate cosine similarity of two texts, using their TF-IDF vectors.\n",
    "    Returns None if the similarity can't be calculated.\n",
    "    '''\n",
    "    try:\n",
    "        tfidf = TfidfVectorizer(min_df=1).fit_transform([text1, text2])\n",
    "    except (AttributeError, ValueError):\n",
    "        # AttributeError -- one of the texts isn't a string (eg None when nothing was extracted)\n",
    "        # ValueError -- the vectorizer couldn't build a vocabulary (eg the texts are empty)\n",
    "        return None\n",
    "    # Multiplying the matrix by its transpose compares every text with every other;\n",
    "    # position [0][1] holds the value for the first text against the second.\n",
    "    # toarray() is used as the '.A' shortcut isn't available on all SciPy sparse types.\n",
    "    return (tfidf * tfidf.T).toarray()[0][1]\n",
    "\n",
    "def calculate_similarities(page_data):\n",
    "    '''\n",
    "    Compare the first two saved pages, returning the cosine similarity of\n",
    "    all their text ('All text'), and of their main text only ('Main text').\n",
    "    '''\n",
    "    first = page_data[0]\n",
    "    second = page_data[1]\n",
    "    all_text_score = calculate_similarity(first['text'], second['text'])\n",
    "    main_text_score = calculate_similarity(first['main_text'], second['main_text'])\n",
    "    return {'All text': all_text_score, 'Main text': main_text_score}\n",
    "\n",
    "def display_similarities(page_data):\n",
    "    '''\n",
    "    Show the cosine similarity values as a table in sim_out, with one row for each type of text.\n",
    "    '''\n",
    "    scores = calculate_similarities(page_data)\n",
    "    table = pd.DataFrame([scores], index=['Cosine similarity']).transpose()\n",
    "    with sim_out:\n",
    "        display(HTML('<hr><h2>Cosine similarity</h2>'))\n",
    "        display(table)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# DIFFERENCES \n",
    "\n",
    "def process_text(capture, include='text'):\n",
    "    '''\n",
    "    Prepare a capture for diffing by splitting the selected content into a list of lines.\n",
    "\n",
    "    Parameters:\n",
    "        capture: a dict of page data with 'html' and 'main_text' values\n",
    "        include: 'text' for all the visible text (stripped, with blank lines removed),\n",
    "                 'main_text' for the main text only, anything else for the complete html\n",
    "\n",
    "    Returns:\n",
    "        A list of strings.\n",
    "    '''\n",
    "    if include == 'text':\n",
    "        lines = [line.strip() for line in BeautifulSoup(capture['html']).get_text().splitlines() if not re.match(r'^\\s*$', line)]\n",
    "    elif include == 'main_text':\n",
    "        # main_text can be None (eg when no main text was extracted), so treat that as no lines\n",
    "        lines = (capture['main_text'] or '').splitlines()\n",
    "    else:\n",
    "        # Replace any bytes that can't be decoded rather than failing on pages that aren't UTF-8\n",
    "        lines = [line.decode(errors='replace') for line in capture['html'].splitlines()]\n",
    "    return lines\n",
    "\n",
    "def format_date_link(url):\n",
    "    '''\n",
    "    Return a HTML link to the supplied Memento url, with the formatted capture date as the link text.\n",
    "    '''\n",
    "    return f'<a href=\"{url}\">{format_date_from_timestamp(url)}</a>'\n",
    "\n",
    "def show_line_differences(page_data, include='text', context=True, numlines=0):\n",
    "    '''\n",
    "    Build a side-by-side difflib table comparing the first two saved pages, and display it in diff_out.\n",
    "    The 'include' value selects the content to compare, while 'context' and 'numlines' are passed on to difflib.\n",
    "    Each column is headed by the capture's date, linked to the capture.\n",
    "    '''\n",
    "    differ = HtmlDiff()\n",
    "    left_lines = process_text(page_data[0], include=include)\n",
    "    right_lines = process_text(page_data[1], include=include)\n",
    "    left_heading = format_date_link(page_data[0]['url'])\n",
    "    right_heading = format_date_link(page_data[1]['url'])\n",
    "    table = differ.make_table(left_lines, right_lines, fromdesc=left_heading, todesc=right_heading, context=context, numlines=numlines)\n",
    "    table = (\n",
    "        table\n",
    "        # Rewrite the table html to make the column widths work better\n",
    "        .replace(r'<th colspan=\"2\" class=\"diff_header\"', '<th class=\"diff_next\"></th><th class=\"diff_header\"')\n",
    "        # Cleaning up the table output\n",
    "        .replace('nowrap=\"nowrap\"', '')\n",
    "        .replace('<tbody>', '')\n",
    "        .replace('</tbody>', '')\n",
    "    )\n",
    "    with diff_out:\n",
    "        display(HTML(table))\n",
    "    \n",
    "def display_diff(e):\n",
    "    '''\n",
    "    Update the diff display when the drop down selections change.\n",
    "    Also used to display the diff for the first time. The event argument isn't used.\n",
    "    '''\n",
    "    diff_out.clear_output(wait=True)\n",
    "    # Only watch the 'value' of each drop down. Observing every trait means this function is called\n",
    "    # for each trait that changes when a selection is made, rebuilding the same diff several times.\n",
    "    which_text.observe(display_diff, names='value')\n",
    "    what_context.observe(display_diff, names='value')\n",
    "    with diff_out:\n",
    "        display(HTML('<hr><h2>Differences by line</h2>'))\n",
    "        display(widgets.HBox([which_text, what_context]))\n",
    "    show_line_differences(page_data, include=which_text.value, context=what_context.value)\n",
    "    \n",
    "# Drop down for choosing which content to compare; display_diff() passes its value on as 'include'\n",
    "which_text = widgets.Dropdown(\n",
    "    options=[('All text', 'text'), ('Main text', 'main_text'), ('Complete html', 'html')],\n",
    "    description='Compare:',\n",
    "    disabled=False,\n",
    ")\n",
    "\n",
    "# Drop down for choosing how much of the diff to show; display_diff() passes its value on as 'context'\n",
    "what_context = widgets.Dropdown(\n",
    "    options=[('Just changes', True), ('Complete context', False)],\n",
    "    description='Context:',\n",
    "    disabled=False,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SCREENSHOTS\n",
    "\n",
    "# Archive domains where 'if_' is added to the timestamp of a url before a screenshot is taken\n",
    "wayback = ['ndhadeliver.natlib.govt.nz', 'web.archive.org']\n",
    "# Archive domains that replay pages in a frame, with the frame to switch to before taking a screenshot\n",
    "pywb = {'web.archive.org.au': 'replayFrame', 'webarchive.nla.gov.au': 'replayFrame', 'webarchive.org.uk': 'replay_iframe'}\n",
    "\n",
    "def get_full_page_screenshot(url, save_width=200, load_wait=30):\n",
    "    '''\n",
    "    Gets a full page screenshot of the supplied url.\n",
    "    By default resizes the screenshot to a maximum width of 200px.\n",
    "    Provide a 'save_width' value to change this.\n",
    "    The 'load_wait' value is the number of seconds to pause after requesting the page (30 by default).\n",
    "\n",
    "    Returns the path of the saved screenshot.\n",
    "    If the url isn't a Memento, or a screenshot can't be captured, an error message is shown and None is returned.\n",
    "\n",
    "    NOTE the webdriver sometimes fails for unknown reasons. Just try again.\n",
    "    '''\n",
    "    domain = urlparse(url)[1].replace('www.', '')\n",
    "    # NZ and IA inject content into the page, so we use if_ to get the original page (with rewritten urls)\n",
    "    if domain in wayback and 'if_' not in url:\n",
    "        url = re.sub(r'/(\\d{14})/http', r'/\\1if_/http', url)\n",
    "    try:\n",
    "        date_str, site = re.search(r'/(\\d{14})(?:if_|mp_)*/https*://(.+/)', url).groups()\n",
    "    except AttributeError:\n",
    "        # There's something wrong with the link...\n",
    "        show_error(f'{url} isn\\'t a Memento – did you forget to select an archive?')\n",
    "        return None\n",
    "    output_dir = Path('screenshots')\n",
    "    output_dir.mkdir(parents=True, exist_ok=True)\n",
    "    ss_file = Path(output_dir, f'{slugify(site)}-{date_str}-{save_width}.png')\n",
    "    options = webdriver.FirefoxOptions()\n",
    "    options.headless = True\n",
    "    driver = webdriver.Firefox(executable_path=geckodriver, options=options)\n",
    "    ss = None\n",
    "    try:\n",
    "        driver.implicitly_wait(15)\n",
    "        driver.get(url)\n",
    "        # Give some time for everything to load\n",
    "        time.sleep(load_wait)\n",
    "        driver.maximize_window()\n",
    "        # UK and AU use pywb in framed replay mode, so we need to switch to the framed content\n",
    "        if domain in pywb:\n",
    "            try:\n",
    "                driver.switch_to.frame(pywb[domain])\n",
    "            except selenium.common.exceptions.NoSuchFrameException:\n",
    "                # If we pass here we'll probably still get a ss, just not full page -- better than failing?\n",
    "                pass\n",
    "        # Use the first of these elements that can be captured\n",
    "        for tag in ['body', 'html', 'frameset']:\n",
    "            try:\n",
    "                elem = driver.find_element_by_tag_name(tag)\n",
    "                ss = elem.screenshot_as_base64\n",
    "                break\n",
    "            except (selenium.common.exceptions.NoSuchElementException, selenium.common.exceptions.WebDriverException):\n",
    "                pass\n",
    "    finally:\n",
    "        # Always shut the browser down, even if loading the page raised an exception\n",
    "        driver.quit()\n",
    "    if not ss:\n",
    "        show_error(f'Couldn\\'t get a screenshot of {url} – sorry...')\n",
    "        return None\n",
    "    img = Image.open(io.BytesIO(base64.b64decode(ss)))\n",
    "    # Scale the height by the same amount as the width to keep the proportions of the page\n",
    "    ratio = save_width / img.width\n",
    "    (width, height) = (save_width, math.ceil(img.height * ratio))\n",
    "    resized_img = img.resize((width, height), PIL.Image.LANCZOS)\n",
    "    resized_img.save(ss_file)\n",
    "    return ss_file\n",
    "\n",
    "# Output area for the progress and error messages shown while screenshots are generated\n",
    "status = widgets.Output()\n",
    "        \n",
    "def display_screenshots(urls):\n",
    "    '''\n",
    "    Generate a screenshot of each url, and display them side by side in ss_out.\n",
    "    The display is rebuilt every time a screenshot is saved. Webdriver failures are reported using show_error().\n",
    "    '''\n",
    "    html_output = []\n",
    "    with ss_out:\n",
    "        display(HTML('<hr><h2>Screenshots</h2>'))\n",
    "        display(status)\n",
    "    for url in urls:\n",
    "        with status:\n",
    "            print('Generating screenshot...')\n",
    "        try:\n",
    "            ss_file = get_full_page_screenshot(url, save_width=350)\n",
    "            if not ss_file:\n",
    "                # Nothing was saved, so there's nothing to add to the display\n",
    "                continue\n",
    "            date = format_date_from_timestamp(url)\n",
    "            # Label the screenshot with the url of the archived page if we can find it\n",
    "            found = re.search(r'/\\d{14}(?:mp_|if_|id_)*/(.*)$', url)\n",
    "            display_url = found.group(1) if found else url\n",
    "            html_output.append(f'<div style=\"float:left; margin-left: 20px;\"><p><b>{date}</b><br><a href=\"{url.replace(\"if_/\", \"/\")}\">{display_url}</a></p><p><a href=\"{ss_file}\"><img src=\"{ss_file}\"></a><br><a href=\"{ss_file}\">[Download]</a></p></div>')\n",
    "            status.clear_output()\n",
    "            ss_out.clear_output(wait=True)\n",
    "            with ss_out:\n",
    "                display(HTML('<hr><h2>Screenshots</h2>'))\n",
    "                display(status)\n",
    "                display((HTML(''.join(html_output))))\n",
    "        except selenium.common.exceptions.WebDriverException:\n",
    "            show_error(f'couldn\\'t get a screenshot of {url} – sorry...')\n",
    "\n",
    "def show_error(message=None):\n",
    "    '''\n",
    "    Replace the contents of the status output with an error message.\n",
    "    '''\n",
    "    status.clear_output()\n",
    "    error_text = f'Something went wrong – {message}'\n",
    "    with status:\n",
    "        print(error_text)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# USER INTERFACE\n",
    "\n",
    "# Captures loaded by load_data() -- each one is a dict with 'url', 'html', 'text', and 'main_text' values\n",
    "page_data = []\n",
    "\n",
    "# Address of each archive's Timegate, keyed by the archive codes used in the 'Archive' drop down\n",
    "TIMEGATES = {\n",
    "    'nla': 'https://web.archive.org.au/awa/',\n",
    "    'nlnz': 'https://ndhadeliver.natlib.govt.nz/webarchive/wayback/',\n",
    "    'bl': 'https://www.webarchive.org.uk/wayback/archive/',\n",
    "    'ia': 'https://web.archive.org/web/'\n",
    "}\n",
    "\n",
    "def format_date_for_headers(iso_date, tz):\n",
    "    '''\n",
    "    Turn an ISO date (YYYY-MM-DD) into a datetime at noon in the specified timezone,\n",
    "    then convert it to UTC and format it as required by Accept-Datetime headers:\n",
    "    eg Fri, 23 Mar 2007 01:00:00 GMT\n",
    "    '''\n",
    "    noon_local = arrow.get(f'{iso_date} 12:00:00 {tz}', 'YYYY-MM-DD HH:mm:ss ZZZ')\n",
    "    formatted = noon_local.to('utc').format('ddd, DD MMM YYYY HH:mm:ss')\n",
    "    return f'{formatted} GMT'\n",
    "\n",
    "def format_date_from_timestamp(url):\n",
    "    '''\n",
    "    Format the timestamp in a Memento url as a date, eg '1 January 2020'.\n",
    "    '''\n",
    "    found = re.search(r'/(\\d{14})(?:if_|mp_|id_)*/', url)\n",
    "    captured = arrow.get(found.group(1), 'YYYYMMDDHHmmss')\n",
    "    return captured.format('D MMMM YYYY')\n",
    "\n",
    "def parse_links_from_headers(response):\n",
    "    '''\n",
    "    Reduce the links parsed from a response's 'Link' header (original, timegate, timemap, mementos etc)\n",
    "    to a dict with the url of each link.\n",
    "    '''\n",
    "    urls = {}\n",
    "    for rel, link in response.links.items():\n",
    "        urls[rel] = link['url']\n",
    "    return urls\n",
    "\n",
    "def query_timegate(timegate, url, date=None, tz='Australia/Canberra'):\n",
    "    '''\n",
    "    Ask the Timegate of the specified repository for a capture of the url,\n",
    "    and return the links from the 'Link' header of the response.\n",
    "    If a date is supplied it's sent as the preferred date of the capture.\n",
    "    '''\n",
    "    headers = {}\n",
    "    if not date and timegate in ['bl', 'nlnz']:\n",
    "        # BL & NLNZ don't seem to default to latest date if no date supplied, so ask for the current date\n",
    "        preferred_date = arrow.utcnow().format('YYYY-MM-DD')\n",
    "    else:\n",
    "        preferred_date = date\n",
    "    if preferred_date:\n",
    "        headers['Accept-Datetime'] = format_date_for_headers(preferred_date, tz)\n",
    "    # Note that you don't get a timegate response if you leave off the trailing slash, but extras don't hurt!\n",
    "    suffix = '' if url.endswith('/') else '/'\n",
    "    tg_url = f'{TIMEGATES[timegate]}{url}{suffix}'\n",
    "    # IA only works if redirects are followed -- this defaults to False with HEAD requests...\n",
    "    follow_redirects = (timegate == 'ia')\n",
    "    response = requests.head(tg_url, headers=headers, allow_redirects=follow_redirects)\n",
    "    return parse_links_from_headers(response)\n",
    "\n",
    "def get_memento(timegate, url, date=None, tz='Australia/Canberra'):\n",
    "    '''\n",
    "    Get the url of a Memento from the specified repository.\n",
    "    If there's no memento in the results, look for an alternative.\n",
    "    Returns None if the results don't include a Memento of any kind.\n",
    "    '''\n",
    "    links = query_timegate(timegate, url, date, tz) or {}\n",
    "    # NLNZ doesn't always seem to return a Memento, so we'll build in some fuzziness\n",
    "    # The relations are listed in order of preference -- the first one found is used\n",
    "    for rel in ['memento', 'prev memento', 'next memento', 'last memento']:\n",
    "        if rel in links:\n",
    "            return links[rel]\n",
    "    # Results without any of these relations (eg just the original and timemap) fall through to here\n",
    "    return None\n",
    "\n",
    "def get_mementos():\n",
    "    '''\n",
    "    Look up a Memento for each of the two dates, using the archive and url selected in the form.\n",
    "    '''\n",
    "    return [\n",
    "        get_memento(repository.value, target_url.value, picker.value)\n",
    "        for picker in (first_date, second_date)\n",
    "    ]\n",
    "\n",
    "def share_this(urls):\n",
    "    '''\n",
    "    Display a Binder link in share_out that opens this notebook with the two urls supplied as parameters.\n",
    "    '''\n",
    "    binder_url = 'https://mybinder.org/v2/gh/GLAM-Workbench/web-archives/master?urlpath=/apps/show_diffs.ipynb'\n",
    "    # The parameters are url-encoded and added to the end of the Binder url\n",
    "    share_url = binder_url + quote(f'?url1=\"{urls[0]}\"&url2=\"{urls[1]}\"')\n",
    "    link = HTML(f'<p>Share this: <a href=\"{share_url}\">{share_url}</a></p>')\n",
    "    with share_out:\n",
    "        display(link)\n",
    "        \n",
    "def clear(e):\n",
    "    '''\n",
    "    Empty the page_data list and all of the output areas. The event argument isn't used.\n",
    "    '''\n",
    "    global page_data\n",
    "    page_data = []\n",
    "    for output in (md_out, stats_out, links_out, sim_out, diff_out, ss_out, share_out):\n",
    "        output.clear_output()\n",
    "\n",
    "def start(e):\n",
    "    '''\n",
    "    Clear any previous results, then load the two captures and display all of the comparisons.\n",
    "    Uses url1 and url2 if both are set, otherwise looks up Mementos using the values in the form.\n",
    "    The event argument isn't used.\n",
    "    '''\n",
    "    clear('e')\n",
    "    if url1 and url2:\n",
    "        urls = [url1, url2]\n",
    "    else:\n",
    "        if not repository.value or not target_url.value:\n",
    "            # The empty '---' option isn't a key in TIMEGATES, so the lookup would fail with a KeyError\n",
    "            with md_out:\n",
    "                display(HTML('<p>Please select an archive and enter a url.</p>'))\n",
    "            return\n",
    "        urls = get_mementos()\n",
    "    if not all(urls):\n",
    "        # get_memento() returns None if no capture was found -- stop here rather than failing in get_html()\n",
    "        with md_out:\n",
    "            display(HTML('<p>No capture was found for one or both of the dates – try a different archive, url, or dates.</p>'))\n",
    "        return\n",
    "    load_data(urls)\n",
    "    display_metadata(page_data)\n",
    "    display_summaries(page_data)\n",
    "    display_links(page_data)\n",
    "    display_similarities(page_data)\n",
    "    display_diff('e')\n",
    "    display_screenshots(urls)\n",
    "    share_this(urls)\n",
    "    \n",
    "# Output areas for each section of the results, displayed below the form in this order\n",
    "md_out = widgets.Output()\n",
    "stats_out = widgets.Output()\n",
    "links_out = widgets.Output()\n",
    "sim_out = widgets.Output()\n",
    "diff_out = widgets.Output()\n",
    "ss_out = widgets.Output()\n",
    "share_out = widgets.Output()\n",
    "\n",
    "# Only skip the form if BOTH urls have been supplied -- start() needs the form's widgets\n",
    "# to look up Mementos whenever either of them is missing\n",
    "if url1 and url2:\n",
    "    # Show the supplied urls and run the comparisons straight away\n",
    "    memento1 = widgets.Text(value=url1, layout=widgets.Layout(width='400px'))\n",
    "    memento2 = widgets.Text(value=url2, layout=widgets.Layout(width='400px'))\n",
    "    display(\n",
    "        widgets.HBox([\n",
    "            widgets.VBox([widgets.Label('First memento:'), \n",
    "                          widgets.Label('Second memento:')\n",
    "                         ]), \n",
    "            widgets.VBox([memento1, memento2])], layout=widgets.Layout(padding='20px')\n",
    "        )\n",
    "    )\n",
    "    display(md_out, stats_out, links_out, sim_out, diff_out, ss_out, share_out)\n",
    "    start('e')\n",
    "else:\n",
    "    # Show a form for selecting the archive, url, and dates -- the comparisons run when 'Start' is clicked\n",
    "    repository = widgets.Dropdown(\n",
    "        options=[('---', ''), ('UK Web Archive', 'bl'), ('National Library of Australia', 'nla'), ('National Library of New Zealand', 'nlnz'), ('Internet Archive', 'ia')],\n",
    "        description='Archive:',\n",
    "        disabled=False,\n",
    "    )\n",
    "\n",
    "    target_url = widgets.Text(description='Target URL:')\n",
    "\n",
    "    first_date = widgets.DatePicker(\n",
    "        description='Date 1: ',\n",
    "        disabled=False\n",
    "    )\n",
    "\n",
    "    second_date = widgets.DatePicker(\n",
    "        description='Date 2: ',\n",
    "        disabled=False\n",
    "    )\n",
    "    \n",
    "    start_button = widgets.Button(description='Start', button_style='primary')\n",
    "    start_button.on_click(start)\n",
    "    display(widgets.HBox([widgets.VBox([repository, first_date]), widgets.VBox([target_url, second_date])], layout=widgets.Layout(padding='20px')), widgets.HBox([start_button]))\n",
    "    display(md_out, stats_out, links_out, sim_out, diff_out, ss_out, share_out)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "----\n",
    "Created by [Tim Sherratt](https://timsherratt.org) for the [GLAM Workbench](https://glam-workbench.github.io).\n",
    "\n",
    "Work on this notebook was supported by the [IIPC Discretionary Funding Programme 2019-2020](http://netpreserve.org/projects/)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}