{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Download a page image\n", "\n", "The Trove web interface doesn't provide a way of getting high-resolution page images from newspapers. This simple app lets you download page images as complete, high-resolution JPG files." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import ipywidgets as widgets\n", "import requests\n", "import datetime\n", "import arrow\n", "import random\n", "import re\n", "import shutil\n", "from collections import OrderedDict\n", "from operator import itemgetter\n", "from IPython.display import display, HTML, FileLink, clear_output\n", "\n", "titles = {}\n", "\n", "def display_button():\n", " button = widgets.Button(\n", " description='Get page image',\n", " disabled=False,\n", " button_style='primary', # 'success', 'info', 'warning', 'danger' or ''\n", " tooltip='Click to download',\n", " icon=''\n", " )\n", " button.on_click(get_page_image)\n", " display(button)\n", "\n", "def get_page_image(b):\n", " clear_output(wait=True)\n", " display_button()\n", " article = None\n", " page_id = None\n", " if article_url.value and 'page' in article_url.value:\n", " page_id = re.search(r'page\\/{0,1}(\\d+)', article_url.value).group(1)\n", " elif article_url.value:\n", " article_id = re.search(r'article\\/{0,1}(\\d+)', article_url.value).group(1)\n", " params = {\n", " 'reclevel': 'full',\n", " 'encoding': 'json',\n", " 'key': api_key.value\n", " }\n", " response = requests.get('http://api.trove.nla.gov.au/v2/newspaper/{}'.format(article_id), params=params)\n", " data = response.json()\n", " article = data['article']\n", " else:\n", " end = arrow.get(date.value)\n", " start = end.shift(days=-1)\n", " date_query = 'date:[{}Z TO {}Z]'.format(start.format('YYYY-MM-DDT00:00:00'), end.format('YYYY-MM-DDT00:00:00'))\n", "\n", " params = {\n", " 'zone': 'newspaper',\n", " 'reclevel': 'full',\n", " 'encoding': 'json',\n", " 'n': '1',\n", " 'q': '{} firstpageseq:{}'.format(date_query, page.value),\n", " 'l-title': title.value,\n", " 'key': api_key.value\n", " }\n", " response = requests.get('http://api.trove.nla.gov.au/v2/result', params=params)\n", " data = response.json()\n", " try:\n", " article = data['response']['zone'][0]['records']['article'][0]\n", " except (KeyError, IndexError):\n", " display(HTML('Page not found!'))\n", " if article:\n", " page_id = re.search(r'page\\/(\\d+)', article['trovePageUrl']).group(1)\n", " if page_id:\n", " # Construct the url we need to download the image\n", " page_url = 'http://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}'.format(page_id, size.value)\n", " # Download the page image\n", " response = requests.get(page_url, stream=True)\n", " filename = 'data/{}-level{}.jpg'.format(page_id, size.value)\n", " with open(filename, 'wb') as out_file:\n", " shutil.copyfileobj(response.raw, out_file)\n", " display(FileLink(filename))\n", " display(HTML(''.format(filename)))\n", " \n", "\n", "def get_titles(b):\n", " params = {\n", " 'encoding': 'json',\n", " 'key': api_key.value\n", " }\n", " response = requests.get('http://api.trove.nla.gov.au/newspaper/titles', params=params)\n", " data = response.json()\n", " title_list = [(t['title'], t['id']) for t in data['response']['records']['newspaper']]\n", " title_list.sort(key=itemgetter(0))\n", " titles = OrderedDict(title_list)\n", " title.options = titles" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Enter your Trove API key\n", "\n", "Get your own [Trove API key](http://help.nla.gov.au/trove/building-with-trove/api) and enter it below." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "api_key = widgets.Text(\n", " placeholder='Enter your Trove API key',\n", " description='API key:',\n", " disabled=False\n", ")\n", "display(api_key)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Either enter an article or page url...\n", "\n", "You can use the url in your browser's location bar or an article or page permalink." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "article_url = widgets.Text(\n", " placeholder='Enter an article or page url',\n", " description='Article/Page:',\n", " disabled=False\n", ")\n", "display(article_url)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Or provide a date, newspaper title, and page number\n", "\n", "If you've provided a url above these settings will be ignored." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "date = widgets.DatePicker(\n", " description='Date:',\n", " disabled=False\n", ")\n", "display(date)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "page = widgets.IntText(\n", " value=1,\n", " description='Page:',\n", " disabled=False\n", ")\n", "display(page)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "title = widgets.Dropdown(\n", " options=['Click the button to load titles'],\n", " description='Newspaper:',\n", " disabled=False,\n", " )\n", "titles_button = widgets.Button(\n", " description='Load titles',\n", " disabled=False,\n", " button_style='', # 'success', 'info', 'warning', 'danger' or ''\n", " tooltip='Click to load titles',\n", " icon=''\n", " )\n", "titles_button.on_click(get_titles)\n", "display(widgets.HBox([title, titles_button]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Choose the resolution" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Page images are available in seven resolutions that correspond to the zoom levels in the Trove web interface. As a rough guide:\n", "\n", "* size 1 is around 900 x 1200 px (500kb)\n", "* size 4 is around 2700 x 3500 px (3mb)\n", "* size 7 is around 6100 x 7800 px (7mb)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "size = widgets.BoundedIntText(\n", " min=1,\n", " max=7,\n", " value=4,\n", " description='Size:',\n", " disabled=False\n", ")\n", "display(size)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get the image!" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "display_button()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }