{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# QueryPic deconstructed\n", "#### Visualise searches in Trove's digitised newspapers" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[QueryPic](http://dhistory.org/querypic/) is a tool I created many years ago to visualise searches in Trove's digitised newspapers. It shows you the number of articles each year that match your query — instead of a page of search results, you see the complete result set. You can look for patterns and trends across time.\n", "\n", "This is a deconstructed, extended, and hackable version of QueryPic." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from requests.exceptions import HTTPError, Timeout\n", "import os\n", "import ipywidgets as widgets\n", "from operator import itemgetter # used for sorting\n", "import pandas as pd # makes manipulating the data easier\n", "import altair as alt\n", "from requests.adapters import HTTPAdapter\n", "from requests.packages.urllib3.util.retry import Retry\n", "from tqdm import tqdm_notebook\n", "from IPython.display import display, HTML, FileLink, clear_output\n", "import math\n", "from collections import OrderedDict\n", "import time\n", "\n", "# Make sure data directory exists\n", "os.makedirs('data', exist_ok=True)\n", "\n", "# Set up Altair\n", "alt.renderers.enable('notebook')\n", "\n", "# Create a session that will automatically retry on server errors\n", "s = requests.Session()\n", "retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])\n", "s.mount('http://', HTTPAdapter(max_retries=retries))\n", "s.mount('https://', HTTPAdapter(max_retries=retries))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Enter your Trove API key\n", "\n", "Get your own [Trove API key](http://help.nla.gov.au/trove/building-with-trove/api) and enter it below." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "api_key = widgets.Text(\n", " placeholder='Enter your Trove API key',\n", " description='API key:',\n", " disabled=False\n", ")\n", "display(api_key)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "params = {\n", " 'q': ' ', # A space to search for everything\n", " 'facet': 'year',\n", " 'zone': 'newspaper',\n", " 'l-category': 'Article',\n", " 'encoding': 'json',\n", " 'n': 0\n", "}\n", "\n", "results = widgets.Output()\n", "save_data = widgets.Output()\n", "df = None" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_results(params):\n", " '''\n", " Get JSON response data from the Trove API.\n", " Parameters:\n", " params\n", " Returns:\n", " JSON formatted response data from Trove API \n", " '''\n", " response = s.get('https://api.trove.nla.gov.au/v2/result', params=params, timeout=30)\n", " response.raise_for_status()\n", " # print(response.url) # This shows us the url that's sent to the API\n", " data = response.json()\n", " return data\n", "\n", "def get_facets(data):\n", " '''\n", " Loop through facets in Trove API response, saving terms and counts.\n", " Parameters:\n", " data - JSON formatted response data from Trove API \n", " Returns:\n", " A list of dictionaries containing: 'year', 'total_results'\n", " '''\n", " facets = []\n", " try:\n", " for term in data['response']['zone'][0]['facets']['facet']['term']:\n", " if int(term['display']) >= date_range.value[0] and int(term['display']) <= date_range.value[1]:\n", " facets.append({'year': int(term['display']), 'total_results': int(term['count'])})\n", " facets.sort(key=itemgetter('year'))\n", " except TypeError:\n", " pass\n", " return facets\n", "\n", "def combine_totals(query_data, total_data):\n", " '''\n", " Take facets data from the query search and a blank search (ie everything) for a decade and combine them.\n", " Parameters:\n", " query_data - list of dictionaries containing facets data from a query search\n", " total_data - list of dictionaries containing facets data from a blank search\n", " Returns:\n", " A list of dictionaries containing: 'year', 'total_results', 'total articles' \n", " '''\n", " combined_data = []\n", " query_data = get_facets(query_data)\n", " total_data = get_facets(total_data)\n", " for index, query_row in enumerate(query_data):\n", " total_row = total_data[index]\n", " query_row['total_articles'] = total_row['total_results']\n", " combined_data.append(query_row)\n", " return combined_data \n", "\n", "def year_totals(params):\n", " '''\n", " Generate a dataset for a search query.\n", " Parameters:\n", " query - search query\n", " Returns:\n", " A Pandas dataframe with three columns -- year, total_results, total_articles -- and one row per year.\n", " '''\n", " totals = []\n", " start_decade = math.floor(date_range.value[0] / 10)\n", " end_decade = math.floor(date_range.value[1] / 10) + 1\n", " query = params['q']\n", " with results:\n", " for decade in tqdm_notebook(range(start_decade, end_decade)):\n", " params['l-decade'] = decade\n", " params['q'] = query\n", " query_data = get_results(params)\n", " params['q'] = ' '\n", " total_data = get_results(params)\n", " combined_data = combine_totals(query_data, total_data)\n", " totals.extend(combined_data)\n", " totals.sort(key=itemgetter('year'))\n", " return totals\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set a date range" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "date_range = widgets.IntRangeSlider(\n", " value=[1803, 1954],\n", " min=1803,\n", " max=2018,\n", " step=1,\n", " description='Date range:',\n", " disabled=False,\n", " continuous_update=False,\n", " orientation='horizontal',\n", " readout=True,\n", " readout_format='0<4d',\n", " layout=widgets.Layout(width='50%')\n", ")\n", "display(date_range)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add your search queries\n", "\n", "You can just add a single search query to see how the number of matching articles vary over time. But you can also compare frequencies between queries, states, and newspapers:\n", "\n", "* Compare queries — `cat` vs `dog`\n", "* Compare states — `swimmers` in NSW, Victoria, and Queensland\n", "* Compare newspapers — `protectionism` in *The Age* vs *The Argus*" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "queries = []\n", "out = widgets.Output()\n", "\n", "@out.capture()\n", "def add_query(b):\n", " queries.append(query.value)\n", " query.value = ''\n", " print('Query {}: {}'.format(len(queries), queries[-1]))\n", "\n", "query = widgets.Text(\n", " placeholder='Enter your query then click the button to add',\n", " disabled=False,\n", " )\n", "\n", "query_button = widgets.Button(\n", " description='Add query',\n", " disabled=False,\n", " tooltip='Click to add query',\n", " icon=''\n", " )\n", "\n", "query_button.on_click(add_query)\n", "query_tip = widgets.HTML(value='A query can be anything you\\'d enter in the Trove simple search box — from a single keyword to a complex boolean expression. Add as many queries as you want.')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_titles(b):\n", " params = {\n", " 'encoding': 'json',\n", " 'key': api_key.value\n", " }\n", " response = requests.get('http://api.trove.nla.gov.au/v2/newspaper/titles', params=params)\n", " data = response.json()\n", " title_list = [(t['title'], {'id': t['id'], 'title': t['title']}) for t in data['response']['records']['newspaper']]\n", " title_list.sort(key=itemgetter(0))\n", " titles_sorted = OrderedDict(title_list)\n", " titles.options = titles_sorted\n", " \n", "title_query = widgets.Text(\n", " placeholder='Enter your query',\n", " description='Search for:',\n", " disabled=False,\n", " )\n", "titles = widgets.SelectMultiple(\n", " options=['Click on button to load titles'],\n", " rows=10,\n", " description='In:',\n", " disabled=False,\n", " layout=widgets.Layout(width='50%')\n", " )\n", "titles_button = widgets.Button(\n", " description='Load titles',\n", " disabled=False,\n", " button_style='', # 'success', 'info', 'warning', 'danger' or ''\n", " tooltip='Click to load titles',\n", " icon=''\n", " )\n", "titles_button.on_click(get_titles)\n", "titles_tip = widgets.HTML(value='Use Shift or Cmd/Ctrl to select multiple newspapers to compare.')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "state_query = widgets.Text(\n", " placeholder='Enter your query',\n", " description='Search for:',\n", " disabled=False,\n", " )\n", " \n", "states = widgets.SelectMultiple(\n", " options=[\n", " 'ACT',\n", " 'New South Wales',\n", " 'Queensland',\n", " 'South Australia',\n", " 'Northern Territory',\n", " 'Tasmania',\n", " 'Victoria',\n", " 'Western Australia',\n", " 'National',\n", " 'International'\n", " ],\n", " rows=10,\n", " description='In:',\n", " disabled=False,\n", " layout=widgets.Layout(width='50%')\n", ")\n", "\n", "states_tip = widgets.HTML(value='Use Shift or Cmd/Ctrl to select multiple states to compare.')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_raw_results(width=700, height=400):\n", " chart = alt.Chart(df).mark_line(point=True).encode(\n", " x=alt.X('year:Q', axis=alt.Axis(format='c', title='Year')),\n", " y=alt.Y('total_results:Q', axis=alt.Axis(format=',d', title='Number of articles')),\n", " color=alt.Color('query', legend=alt.Legend(title='')),\n", " tooltip=[alt.Tooltip('query', title='Query:'), alt.Tooltip('year:Q', title='Year'), alt.Tooltip('total_results:Q', title='Articles', format=',')]\n", " ).properties(width=width, height=height).interactive()\n", " return chart\n", "\n", "def plot_relative_results(width=700, height=400):\n", " chart = alt.Chart(df).mark_line(point=True).encode(\n", " x=alt.X('year:Q', axis=alt.Axis(format='c', title='Year')),\n", " y=alt.Y('PercentOfTotal:Q', axis=alt.Axis(format='.2%', title='Percentage of total articles')),\n", " color=alt.Color('query', legend=alt.Legend(title='')),\n", " tooltip=[alt.Tooltip('query', title='Query:'), alt.Tooltip('year:Q', title='Year'), alt.Tooltip('PercentOfTotal:Q', title='Articles', format='.2%')]\n", " ).properties(width=width, height=height).transform_calculate(\n", " PercentOfTotal=\"datum.total_results / datum.total_articles\"\n", " ).interactive()\n", " return chart" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def clear_all(b):\n", " states.value = []\n", " state_query.value = ''\n", " titles.value = []\n", " title_query.value = ''\n", " out.clear_output()\n", " queries.clear()\n", " results.clear_output()\n", "\n", "def get_data(b):\n", " global df\n", " results.clear_output()\n", " traces = []\n", " q_params = params.copy()\n", " q_params['key'] = api_key.value\n", " if tab.selected_index == 0:\n", " for query in queries:\n", " q_params['q'] = query\n", " with results:\n", " display(HTML('Searching for {}...'.format(query)))\n", " totals = year_totals(q_params.copy())\n", " df_totals = pd.DataFrame(totals)\n", " df_totals['query'] = query\n", " traces.append(df_totals)\n", " elif tab.selected_index == 1:\n", " q_params['q'] = state_query.value\n", " for state in states.value:\n", " q_params['l-state'] = state\n", " with results:\n", " display(HTML('Searching in {}...'.format(state)))\n", " totals = year_totals(q_params.copy())\n", " df_totals = pd.DataFrame(totals)\n", " df_totals['query'] = state\n", " traces.append(df_totals)\n", " elif tab.selected_index == 2:\n", " q_params['q'] = title_query.value\n", " for title in titles.value:\n", " q_params['l-title'] = title['id']\n", " with results:\n", " display(HTML('Searching in {}...'.format(title['title'])))\n", " totals = year_totals(q_params.copy())\n", " df_totals = pd.DataFrame(totals)\n", " df_totals['query'] = title['title']\n", " traces.append(df_totals)\n", " try:\n", " df = pd.concat(traces, ignore_index=True)\n", " except ValueError:\n", " with results:\n", " display(HTML('No results!'))\n", " else:\n", " results.clear_output(wait=True)\n", " chart = plot_relative_results()\n", " chart_type.value = 'proportion'\n", " csv_file = save_as_csv()\n", " with results:\n", " display(chart_type)\n", " display(chart)\n", " with save_data:\n", " display(HTML('Download data:'), FileLink(csv_file))\n", " display(widgets.HBox([save_chart_button, save_chart_width, save_chart_height]))\n", "\n", "def save_chart(b):\n", " width = save_chart_width.value\n", " height = save_chart_height.value\n", " if chart_type.value == 'proportion':\n", " chart = plot_relative_results(width, height)\n", " else:\n", " chart = plot_raw_results(width, height)\n", " filename = 'data/querypic-{}.html'.format(int(time.time()))\n", " chart.save(filename)\n", " display(HTML('View HTML version:'), FileLink(filename))\n", " \n", " \n", "def save_as_csv():\n", " filename = 'data/querypic-{}.csv'.format(int(time.time()))\n", " df.to_csv(filename, index=False)\n", " return filename\n", "\n", "def change_chart(o):\n", " results.clear_output(wait=True)\n", " if chart_type.value == 'proportion':\n", " chart = plot_relative_results()\n", " else:\n", " chart = plot_raw_results()\n", " with results:\n", " display(chart_type)\n", " display(chart)\n", "\n", "chart_type = widgets.Dropdown(\n", " options=[('Raw number of results', 'raw'), ('Proportion of total articles', 'proportion')],\n", " value='proportion'\n", " )\n", "\n", "chart_type.observe(change_chart)\n", " \n", "clear_all_button = widgets.Button(\n", " description='Clear all',\n", " disabled=False,\n", " button_style='', # 'success', 'info', 'warning', 'danger' or ''\n", " tooltip='Clear current queries',\n", " icon=''\n", " )\n", "\n", "get_data_button = widgets.Button(\n", " description='Create chart',\n", " disabled=False,\n", " button_style='primary', # 'success', 'info', 'warning', 'danger' or ''\n", " tooltip='Create chart',\n", " icon=''\n", " )\n", "\n", "save_chart_button = widgets.Button(\n", " description='Save chart',\n", " disabled=False,\n", " button_style='primary', # 'success', 'info', 'warning', 'danger' or ''\n", " tooltip='Save chart as HTML',\n", " icon=''\n", " )\n", "\n", "save_chart_width = widgets.BoundedIntText(\n", " value=700,\n", " min=700,\n", " max=2000,\n", " step=100,\n", " description='Width',\n", " disabled=False\n", ")\n", "\n", "save_chart_height = widgets.BoundedIntText(\n", " value=400,\n", " min=400,\n", " max=1500,\n", " step=100,\n", " description='Height',\n", " disabled=False\n", ")\n", "\n", "clear_all_button.on_click(clear_all)\n", "get_data_button.on_click(get_data)\n", "save_chart_button.on_click(save_chart)\n", "tab1 = widgets.VBox([widgets.HBox([query, query_button]), query_tip, out])\n", "tab2 = widgets.VBox([state_query, states, states_tip])\n", "tab3 = widgets.VBox([title_query, widgets.HBox([titles, titles_button]), titles_tip])\n", "\n", "tab = widgets.Tab(children=[tab1, tab2, tab3])\n", "tab.set_title(0, 'Compare queries')\n", "tab.set_title(1, 'Compare states')\n", "tab.set_title(2, 'Compare newspapers')\n", "display(widgets.VBox([tab, widgets.HBox([get_data_button, clear_all_button]), results, save_data]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }