{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# QueryPic deconstructed\n",
"#### Visualise searches in Trove's digitised newspapers"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[QueryPic](http://dhistory.org/querypic/) is a tool I created many years ago to visualise searches in Trove's digitised newspapers. It shows you the number of articles each year that match your query — instead of a page of search results, you see the complete result set. You can look for patterns and trends across time.\n",
"\n",
"This is a deconstructed, extended, and hackable version of QueryPic."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"from requests.exceptions import HTTPError, Timeout\n",
"import os\n",
"import ipywidgets as widgets\n",
"from operator import itemgetter # used for sorting\n",
"import pandas as pd # makes manipulating the data easier\n",
"import altair as alt\n",
"from requests.adapters import HTTPAdapter\n",
"from requests.packages.urllib3.util.retry import Retry\n",
"from tqdm import tqdm_notebook\n",
"from IPython.display import display, HTML, FileLink, clear_output\n",
"import math\n",
"from collections import OrderedDict\n",
"import time\n",
"\n",
"# Make sure data directory exists\n",
"os.makedirs('data', exist_ok=True)\n",
"\n",
"# Set up Altair\n",
"alt.renderers.enable('notebook')\n",
"\n",
"# Create a session that will automatically retry on server errors\n",
"s = requests.Session()\n",
"retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])\n",
"s.mount('http://', HTTPAdapter(max_retries=retries))\n",
"s.mount('https://', HTTPAdapter(max_retries=retries))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Enter your Trove API key\n",
"\n",
"Get your own [Trove API key](http://help.nla.gov.au/trove/building-with-trove/api) and enter it below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"api_key = widgets.Text(\n",
" placeholder='Enter your Trove API key',\n",
" description='API key:',\n",
" disabled=False\n",
")\n",
"display(api_key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"params = {\n",
" 'q': ' ', # A space to search for everything\n",
" 'facet': 'year',\n",
" 'zone': 'newspaper',\n",
" 'l-category': 'Article',\n",
" 'encoding': 'json',\n",
" 'n': 0\n",
"}\n",
"\n",
"results = widgets.Output()\n",
"save_data = widgets.Output()\n",
"df = None"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_results(params):\n",
" '''\n",
" Get JSON response data from the Trove API.\n",
" Parameters:\n",
" params\n",
" Returns:\n",
" JSON formatted response data from Trove API \n",
" '''\n",
" response = s.get('https://api.trove.nla.gov.au/v2/result', params=params, timeout=30)\n",
" response.raise_for_status()\n",
" # print(response.url) # This shows us the url that's sent to the API\n",
" data = response.json()\n",
" return data\n",
"\n",
"def get_facets(data):\n",
" '''\n",
" Loop through facets in Trove API response, saving terms and counts.\n",
" Parameters:\n",
" data - JSON formatted response data from Trove API \n",
" Returns:\n",
" A list of dictionaries containing: 'year', 'total_results'\n",
" '''\n",
" facets = []\n",
" try:\n",
" for term in data['response']['zone'][0]['facets']['facet']['term']:\n",
" if int(term['display']) >= date_range.value[0] and int(term['display']) <= date_range.value[1]:\n",
" facets.append({'year': int(term['display']), 'total_results': int(term['count'])})\n",
" facets.sort(key=itemgetter('year'))\n",
" except TypeError:\n",
" pass\n",
" return facets\n",
"\n",
"def combine_totals(query_data, total_data):\n",
" '''\n",
" Take facets data from the query search and a blank search (ie everything) for a decade and combine them.\n",
" Parameters:\n",
" query_data - list of dictionaries containing facets data from a query search\n",
" total_data - list of dictionaries containing facets data from a blank search\n",
" Returns:\n",
" A list of dictionaries containing: 'year', 'total_results', 'total articles' \n",
" '''\n",
" combined_data = []\n",
" query_data = get_facets(query_data)\n",
" total_data = get_facets(total_data)\n",
" for index, query_row in enumerate(query_data):\n",
" total_row = total_data[index]\n",
" query_row['total_articles'] = total_row['total_results']\n",
" combined_data.append(query_row)\n",
" return combined_data \n",
"\n",
"def year_totals(params):\n",
" '''\n",
" Generate a dataset for a search query.\n",
" Parameters:\n",
" query - search query\n",
" Returns:\n",
" A Pandas dataframe with three columns -- year, total_results, total_articles -- and one row per year.\n",
" '''\n",
" totals = []\n",
" start_decade = math.floor(date_range.value[0] / 10)\n",
" end_decade = math.floor(date_range.value[1] / 10) + 1\n",
" query = params['q']\n",
" with results:\n",
" for decade in tqdm_notebook(range(start_decade, end_decade)):\n",
" params['l-decade'] = decade\n",
" params['q'] = query\n",
" query_data = get_results(params)\n",
" params['q'] = ' '\n",
" total_data = get_results(params)\n",
" combined_data = combine_totals(query_data, total_data)\n",
" totals.extend(combined_data)\n",
" totals.sort(key=itemgetter('year'))\n",
" return totals\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set a date range"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"date_range = widgets.IntRangeSlider(\n",
" value=[1803, 1954],\n",
" min=1803,\n",
" max=2018,\n",
" step=1,\n",
" description='Date range:',\n",
" disabled=False,\n",
" continuous_update=False,\n",
" orientation='horizontal',\n",
" readout=True,\n",
" readout_format='0<4d',\n",
" layout=widgets.Layout(width='50%')\n",
")\n",
"display(date_range)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add your search queries\n",
"\n",
"You can just add a single search query to see how the number of matching articles vary over time. But you can also compare frequencies between queries, states, and newspapers:\n",
"\n",
"* Compare queries — `cat` vs `dog`\n",
"* Compare states — `swimmers` in NSW, Victoria, and Queensland\n",
"* Compare newspapers — `protectionism` in *The Age* vs *The Argus*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"queries = []\n",
"out = widgets.Output()\n",
"\n",
"@out.capture()\n",
"def add_query(b):\n",
" queries.append(query.value)\n",
" query.value = ''\n",
" print('Query {}: {}'.format(len(queries), queries[-1]))\n",
"\n",
"query = widgets.Text(\n",
" placeholder='Enter your query then click the button to add',\n",
" disabled=False,\n",
" )\n",
"\n",
"query_button = widgets.Button(\n",
" description='Add query',\n",
" disabled=False,\n",
" tooltip='Click to add query',\n",
" icon=''\n",
" )\n",
"\n",
"query_button.on_click(add_query)\n",
"query_tip = widgets.HTML(value='A query can be anything you\\'d enter in the Trove simple search box — from a single keyword to a complex boolean expression. Add as many queries as you want.')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_titles(b):\n",
" params = {\n",
" 'encoding': 'json',\n",
" 'key': api_key.value\n",
" }\n",
" response = requests.get('http://api.trove.nla.gov.au/v2/newspaper/titles', params=params)\n",
" data = response.json()\n",
" title_list = [(t['title'], {'id': t['id'], 'title': t['title']}) for t in data['response']['records']['newspaper']]\n",
" title_list.sort(key=itemgetter(0))\n",
" titles_sorted = OrderedDict(title_list)\n",
" titles.options = titles_sorted\n",
" \n",
"title_query = widgets.Text(\n",
" placeholder='Enter your query',\n",
" description='Search for:',\n",
" disabled=False,\n",
" )\n",
"titles = widgets.SelectMultiple(\n",
" options=['Click on button to load titles'],\n",
" rows=10,\n",
" description='In:',\n",
" disabled=False,\n",
" layout=widgets.Layout(width='50%')\n",
" )\n",
"titles_button = widgets.Button(\n",
" description='Load titles',\n",
" disabled=False,\n",
" button_style='', # 'success', 'info', 'warning', 'danger' or ''\n",
" tooltip='Click to load titles',\n",
" icon=''\n",
" )\n",
"titles_button.on_click(get_titles)\n",
"titles_tip = widgets.HTML(value='Use Shift or Cmd/Ctrl to select multiple newspapers to compare.')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"state_query = widgets.Text(\n",
" placeholder='Enter your query',\n",
" description='Search for:',\n",
" disabled=False,\n",
" )\n",
" \n",
"states = widgets.SelectMultiple(\n",
" options=[\n",
" 'ACT',\n",
" 'New South Wales',\n",
" 'Queensland',\n",
" 'South Australia',\n",
" 'Northern Territory',\n",
" 'Tasmania',\n",
" 'Victoria',\n",
" 'Western Australia',\n",
" 'National',\n",
" 'International'\n",
" ],\n",
" rows=10,\n",
" description='In:',\n",
" disabled=False,\n",
" layout=widgets.Layout(width='50%')\n",
")\n",
"\n",
"states_tip = widgets.HTML(value='Use Shift or Cmd/Ctrl to select multiple states to compare.')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def plot_raw_results(width=700, height=400):\n",
" chart = alt.Chart(df).mark_line(point=True).encode(\n",
" x=alt.X('year:Q', axis=alt.Axis(format='c', title='Year')),\n",
" y=alt.Y('total_results:Q', axis=alt.Axis(format=',d', title='Number of articles')),\n",
" color=alt.Color('query', legend=alt.Legend(title='')),\n",
" tooltip=[alt.Tooltip('query', title='Query:'), alt.Tooltip('year:Q', title='Year'), alt.Tooltip('total_results:Q', title='Articles', format=',')]\n",
" ).properties(width=width, height=height).interactive()\n",
" return chart\n",
"\n",
"def plot_relative_results(width=700, height=400):\n",
" chart = alt.Chart(df).mark_line(point=True).encode(\n",
" x=alt.X('year:Q', axis=alt.Axis(format='c', title='Year')),\n",
" y=alt.Y('PercentOfTotal:Q', axis=alt.Axis(format='.2%', title='Percentage of total articles')),\n",
" color=alt.Color('query', legend=alt.Legend(title='')),\n",
" tooltip=[alt.Tooltip('query', title='Query:'), alt.Tooltip('year:Q', title='Year'), alt.Tooltip('PercentOfTotal:Q', title='Articles', format='.2%')]\n",
" ).properties(width=width, height=height).transform_calculate(\n",
" PercentOfTotal=\"datum.total_results / datum.total_articles\"\n",
" ).interactive()\n",
" return chart"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def clear_all(b):\n",
" states.value = []\n",
" state_query.value = ''\n",
" titles.value = []\n",
" title_query.value = ''\n",
" out.clear_output()\n",
" queries.clear()\n",
" results.clear_output()\n",
"\n",
"def get_data(b):\n",
" global df\n",
" results.clear_output()\n",
" traces = []\n",
" q_params = params.copy()\n",
" q_params['key'] = api_key.value\n",
" if tab.selected_index == 0:\n",
" for query in queries:\n",
" q_params['q'] = query\n",
" with results:\n",
" display(HTML('Searching for {}...'.format(query)))\n",
" totals = year_totals(q_params.copy())\n",
" df_totals = pd.DataFrame(totals)\n",
" df_totals['query'] = query\n",
" traces.append(df_totals)\n",
" elif tab.selected_index == 1:\n",
" q_params['q'] = state_query.value\n",
" for state in states.value:\n",
" q_params['l-state'] = state\n",
" with results:\n",
" display(HTML('Searching in {}...'.format(state)))\n",
" totals = year_totals(q_params.copy())\n",
" df_totals = pd.DataFrame(totals)\n",
" df_totals['query'] = state\n",
" traces.append(df_totals)\n",
" elif tab.selected_index == 2:\n",
" q_params['q'] = title_query.value\n",
" for title in titles.value:\n",
" q_params['l-title'] = title['id']\n",
" with results:\n",
" display(HTML('Searching in {}...'.format(title['title'])))\n",
" totals = year_totals(q_params.copy())\n",
" df_totals = pd.DataFrame(totals)\n",
" df_totals['query'] = title['title']\n",
" traces.append(df_totals)\n",
" try:\n",
" df = pd.concat(traces, ignore_index=True)\n",
" except ValueError:\n",
" with results:\n",
" display(HTML('No results!'))\n",
" else:\n",
" results.clear_output(wait=True)\n",
" chart = plot_relative_results()\n",
" chart_type.value = 'proportion'\n",
" csv_file = save_as_csv()\n",
" with results:\n",
" display(chart_type)\n",
" display(chart)\n",
" with save_data:\n",
" display(HTML('Download data:'), FileLink(csv_file))\n",
" display(widgets.HBox([save_chart_button, save_chart_width, save_chart_height]))\n",
"\n",
"def save_chart(b):\n",
" width = save_chart_width.value\n",
" height = save_chart_height.value\n",
" if chart_type.value == 'proportion':\n",
" chart = plot_relative_results(width, height)\n",
" else:\n",
" chart = plot_raw_results(width, height)\n",
" filename = 'data/querypic-{}.html'.format(int(time.time()))\n",
" chart.save(filename)\n",
" display(HTML('View HTML version:'), FileLink(filename))\n",
" \n",
" \n",
"def save_as_csv():\n",
" filename = 'data/querypic-{}.csv'.format(int(time.time()))\n",
" df.to_csv(filename, index=False)\n",
" return filename\n",
"\n",
"def change_chart(o):\n",
" results.clear_output(wait=True)\n",
" if chart_type.value == 'proportion':\n",
" chart = plot_relative_results()\n",
" else:\n",
" chart = plot_raw_results()\n",
" with results:\n",
" display(chart_type)\n",
" display(chart)\n",
"\n",
"chart_type = widgets.Dropdown(\n",
" options=[('Raw number of results', 'raw'), ('Proportion of total articles', 'proportion')],\n",
" value='proportion'\n",
" )\n",
"\n",
"chart_type.observe(change_chart)\n",
" \n",
"clear_all_button = widgets.Button(\n",
" description='Clear all',\n",
" disabled=False,\n",
" button_style='', # 'success', 'info', 'warning', 'danger' or ''\n",
" tooltip='Clear current queries',\n",
" icon=''\n",
" )\n",
"\n",
"get_data_button = widgets.Button(\n",
" description='Create chart',\n",
" disabled=False,\n",
" button_style='primary', # 'success', 'info', 'warning', 'danger' or ''\n",
" tooltip='Create chart',\n",
" icon=''\n",
" )\n",
"\n",
"save_chart_button = widgets.Button(\n",
" description='Save chart',\n",
" disabled=False,\n",
" button_style='primary', # 'success', 'info', 'warning', 'danger' or ''\n",
" tooltip='Save chart as HTML',\n",
" icon=''\n",
" )\n",
"\n",
"save_chart_width = widgets.BoundedIntText(\n",
" value=700,\n",
" min=700,\n",
" max=2000,\n",
" step=100,\n",
" description='Width',\n",
" disabled=False\n",
")\n",
"\n",
"save_chart_height = widgets.BoundedIntText(\n",
" value=400,\n",
" min=400,\n",
" max=1500,\n",
" step=100,\n",
" description='Height',\n",
" disabled=False\n",
")\n",
"\n",
"clear_all_button.on_click(clear_all)\n",
"get_data_button.on_click(get_data)\n",
"save_chart_button.on_click(save_chart)\n",
"tab1 = widgets.VBox([widgets.HBox([query, query_button]), query_tip, out])\n",
"tab2 = widgets.VBox([state_query, states, states_tip])\n",
"tab3 = widgets.VBox([title_query, widgets.HBox([titles, titles_button]), titles_tip])\n",
"\n",
"tab = widgets.Tab(children=[tab1, tab2, tab3])\n",
"tab.set_title(0, 'Compare queries')\n",
"tab.set_title(1, 'Compare states')\n",
"tab.set_title(2, 'Compare newspapers')\n",
"display(widgets.VBox([tab, widgets.HBox([get_data_button, clear_all_button]), results, save_data]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}