{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Get full page screenshots from archived web pages\n", "\n", "[View in GitHub](https://github.com/GLAM-Workbench/web-archives/blob/master/save_screenshot.ipynb) · [View in GLAM Workbench](https://glam-workbench.net/web-archives/#create-and-compare-full-page-screenshots-from-archived-web-pages)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This notebook is designed to run in Voila as an app (with the code hidden).\n", "# To launch this notebook in Voila, just select 'View > Open with Voila in New Browser Tab'\n", "# Your browser might ask for permission to open the new tab as a popup." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%capture\n", "import base64\n", "import io\n", "import math\n", "import os\n", "import re\n", "import time\n", "from pathlib import Path\n", "from urllib.parse import urlparse\n", "\n", "import arrow\n", "import geckodriver_autoinstaller\n", "import ipywidgets as widgets\n", "import PIL\n", "import requests\n", "import selenium\n", "from IPython.display import HTML, display\n", "from PIL import Image\n", "from selenium import webdriver\n", "from selenium.webdriver.common.by import By\n", "from slugify import slugify\n", "\n", "geckodriver_autoinstaller.install()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "TIMEGATES = {\n", " \"nla\": \"https://web.archive.org.au/awa/\",\n", " \"nlnz\": \"https://ndhadeliver.natlib.govt.nz/webarchive/\",\n", " \"bl\": \"https://www.webarchive.org.uk/wayback/archive/\",\n", " \"ia\": \"https://web.archive.org/web/\",\n", " \"ukgwa\": \"https://webarchive.nationalarchives.gov.uk/ukgwa/\",\n", "}\n", "\n", "wayback = [\"web.archive.org\"]\n", "pywb = {\n", " \"web.archive.org.au\": \"replayFrame\",\n", " \"webarchive.nla.gov.au\": \"replayFrame\",\n", " \"webarchive.org.uk\": 
\"replay_iframe\",\n", " \"ndhadeliver.natlib.govt.nz\": \"replayFrame\",\n", " \"webarchive.nationalarchives.gov.uk\": \"replay_iframe\",\n", "}\n", "\n", "html_output = []\n", "\n", "\n", "def format_date_for_headers(iso_date, tz):\n", " \"\"\"\n", " Convert an ISO date (YYYY-MM-DD) to a datetime at noon in the specified timezone.\n", " Convert the datetime to UTC and format as required by Accept-Datetime headers:\n", " eg Fri, 23 Mar 2007 01:00:00 GMT\n", " \"\"\"\n", " local = arrow.get(f\"{iso_date} 12:00:00 {tz}\", \"YYYY-MM-DD HH:mm:ss ZZZ\")\n", " gmt = local.to(\"utc\")\n", " return f'{gmt.format(\"ddd, DD MMM YYYY HH:mm:ss\")} GMT'\n", "\n", "\n", "def format_date_from_timestamp(url):\n", " timestamp = re.search(r\"/(\\d{14}|\\d{12})(?:if_|mp_)*/\", url).group(1)\n", " return arrow.get(timestamp, \"YYYYMMDDHHmmss\").format(\"D MMMM YYYY\")\n", "\n", "\n", "def parse_links_from_headers(response):\n", " \"\"\"\n", " Extract original, timegate, timemap, and memento links from 'Link' header.\n", " \"\"\"\n", " links = response.links\n", " return {k: v[\"url\"] for k, v in links.items()}\n", "\n", "\n", "def query_timegate(timegate, url, date=None, tz=\"Australia/Canberra\"):\n", " headers = {}\n", " if date:\n", " formatted_date = format_date_for_headers(date, tz)\n", " headers[\"Accept-Datetime\"] = formatted_date\n", " # BL, NLNZ & UKGWA don't seem to default to latest date if no date supplied\n", " elif not date and timegate in [\"bl\", \"nlnz\", \"ukgwa\"]:\n", " formatted_date = format_date_for_headers(\n", " arrow.utcnow().format(\"YYYY-MM-DD\"), tz\n", " )\n", " headers[\"Accept-Datetime\"] = formatted_date\n", " # Note that you don't get a timegate response if you leave off the trailing slash, but extras don't hurt!\n", " tg_url = (\n", " f\"{TIMEGATES[timegate]}{url}/\"\n", " if not url.endswith(\"/\")\n", " else f\"{TIMEGATES[timegate]}{url}\"\n", " )\n", " # print(tg_url)\n", " # IA doesn't work with head, others don't work with get...\n", " if 
timegate == \"ia\":\n", " response = requests.get(tg_url, headers=headers)\n", " else:\n", " response = requests.head(tg_url, headers=headers)\n", " return parse_links_from_headers(response)\n", "\n", "\n", "def get_memento(timegate, url, date):\n", " links = query_timegate(timegate, url, date)\n", " # Initialise to None so we don't raise a NameError when the timegate\n", " # response carries no Link header and `links` comes back empty/falsy\n", " memento = None\n", " # NLNZ doesn't always seem to return a Memento, so we'll build in some fuzziness\n", " if links:\n", " if \"memento\" in links:\n", " memento = links[\"memento\"]\n", " elif \"prev memento\" in links:\n", " memento = links[\"prev memento\"]\n", " elif \"next memento\" in links:\n", " memento = links[\"next memento\"]\n", " elif \"last memento\" in links:\n", " memento = links[\"last memento\"]\n", " else:\n", " memento = None\n", " return memento\n", "\n", "\n", "def get_full_page_screenshot(url, save_width=200):\n", " \"\"\"\n", " Gets a full page screenshot of the supplied url.\n", " By default resizes the screenshot to a maximum width of 200px.\n", " Provide a 'save_width' value to change this.\n", "\n", " NOTE the webdriver sometimes fails for unknown reasons. 
Just try again.\n", " \"\"\"\n", " global html_output\n", " domain = urlparse(url)[1].replace(\"www.\", \"\")\n", " # NZ and IA inject content into the page, so we use if_ to get the original page (with rewritten urls)\n", " if domain in wayback and \"if_\" not in url:\n", " url = re.sub(r\"/(\\d{14}|\\d{12})/http\", r\"/\\1if_/http\", url)\n", " try:\n", " date_str, site = re.search(\n", " r\"/(\\d{14}|\\d{12})(?:if_|mp_)*/https*://?(.+/)\", url\n", " ).groups()\n", " except AttributeError:\n", " # There's something wrong with the link...\n", " # print(url)\n", " show_error(f\"{url} isn't a Memento – did you forget to select an archive?\")\n", " else:\n", " output_dir = Path(\"screenshots\")\n", " output_dir.mkdir(parents=True, exist_ok=True)\n", " ss_file = Path(output_dir, f\"{slugify(site)}-{date_str}-{save_width}.png\")\n", " options = webdriver.FirefoxOptions()\n", " options.headless = True\n", " driver = webdriver.Firefox(options=options)\n", " driver.implicitly_wait(15)\n", " driver.get(url)\n", " # Give some time for everything to load\n", " time.sleep(30)\n", " driver.maximize_window()\n", " # UK and AU use pywb in framed replay mode, so we need to switch to the framed content\n", " if domain in pywb:\n", " try:\n", " driver.switch_to.frame(pywb[domain])\n", " except selenium.common.exceptions.NoSuchFrameException:\n", " # If we pass here we'll probably still get a ss, just not full page -- better than failing?\n", " pass\n", " ss = None\n", " for tag in [\"body\", \"html\", \"frameset\"]:\n", " try:\n", " elem = driver.find_element(By.TAG_NAME, tag)\n", " ss = elem.screenshot_as_base64\n", " break\n", " except (\n", " selenium.common.exceptions.NoSuchElementException,\n", " selenium.common.exceptions.WebDriverException,\n", " ):\n", " pass\n", " driver.quit()\n", " if not ss:\n", " show_error(f\"Couldn't get a screenshot of {url} – sorry...\")\n", " else:\n", " img = Image.open(io.BytesIO(base64.b64decode(ss)))\n", " ratio = save_width / img.width\n", " 
(width, height) = (save_width, math.ceil(img.height * ratio))\n", " resized_img = img.resize((width, height), PIL.Image.Resampling.LANCZOS)\n", " resized_img.save(ss_file)\n", " return ss_file\n", "\n", "\n", "def display_screenshot(ss_file, url):\n", " date = format_date_from_timestamp(url)\n", " try:\n", " display_url = re.search(r\"/(\\d{14}|\\d{12})(?:mp_|if_|id_)*/(.*)$\", url).group(1)\n", " except AttributeError:\n", " display_url = url\n", " status.clear_output()\n", " html_output.append(\n", " f'
{date}
{display_url}
3 December 2015
http://news.sheepdogandwolf.com/
3 December 2015
http://news.sheepdogandwolf.com/
3 December 2015
http://news.sheepdogandwolf.com/