{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# QueryPic deconstructed\n",
"#### Visualise searches in Trove's digitised newspapers"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[QueryPic](http://dhistory.org/querypic/) is a tool I created many years ago to visualise searches in Trove's digitised newspapers. It shows you the number of articles each year that match your query — instead of a page of search results, you see the complete result set. You can look for patterns and trends across time.\n",
"\n",
"This is a deconstructed, extended, and hackable version of QueryPic."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"import os\n",
"import time\n",
"from collections import OrderedDict\n",
"from operator import itemgetter # used for sorting\n",
"\n",
"import altair as alt\n",
"import ipywidgets as widgets\n",
"import pandas as pd # makes manipulating the data easier\n",
"import requests\n",
"from IPython.display import HTML, FileLink, display\n",
"from requests.adapters import HTTPAdapter\n",
"from requests.packages.urllib3.util.retry import Retry\n",
"from tqdm.auto import tqdm\n",
"\n",
"# Make sure data directory exists\n",
"os.makedirs(\"data\", exist_ok=True)\n",
"\n",
"# Create a session that will automatically retry on server errors\n",
"s = requests.Session()\n",
"retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])\n",
"s.mount(\"http://\", HTTPAdapter(max_retries=retries))\n",
"s.mount(\"https://\", HTTPAdapter(max_retries=retries))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"# Load env variables\n",
"%load_ext dotenv\n",
"%dotenv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Enter your Trove API key\n",
"\n",
"Get your own [Trove API key](https://trove.nla.gov.au/about/create-something/using-api) and enter it below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"api_key = widgets.Text(\n",
" placeholder=\"Enter your Trove API key\", description=\"API key:\", disabled=False\n",
")\n",
"display(api_key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"params = {\n",
" \"q\": \" \", # A space to search for everything\n",
" \"facet\": \"year\",\n",
" \"zone\": \"newspaper\",\n",
" # 'l-category': 'Article',\n",
" \"encoding\": \"json\",\n",
" \"n\": 0,\n",
"}\n",
"\n",
"results = widgets.Output()\n",
"save_data = widgets.Output()\n",
"df = None"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_results(params):\n",
" \"\"\"\n",
" Get JSON response data from the Trove API.\n",
" Parameters:\n",
" params\n",
" Returns:\n",
" JSON formatted response data from Trove API\n",
" \"\"\"\n",
" response = s.get(\n",
" \"https://api.trove.nla.gov.au/v2/result\", params=params, timeout=30\n",
" )\n",
" response.raise_for_status()\n",
" # print(response.url) # This shows us the url that's sent to the API\n",
" data = response.json()\n",
" return data\n",
"\n",
"\n",
"def get_facets(data):\n",
" \"\"\"\n",
" Loop through facets in Trove API response, saving terms and counts.\n",
" Parameters:\n",
" data - JSON formatted response data from Trove API\n",
" Returns:\n",
" A list of dictionaries containing: 'year', 'total_results'\n",
" \"\"\"\n",
" facets = []\n",
" try:\n",
" for term in data[\"response\"][\"zone\"][0][\"facets\"][\"facet\"][\"term\"]:\n",
" if (\n",
" int(term[\"display\"]) >= date_range.value[0]\n",
" and int(term[\"display\"]) <= date_range.value[1]\n",
" ):\n",
" facets.append(\n",
" {\"year\": int(term[\"display\"]), \"total_results\": int(term[\"count\"])}\n",
" )\n",
" facets.sort(key=itemgetter(\"year\"))\n",
" except TypeError:\n",
" pass\n",
" return facets\n",
"\n",
"\n",
"def combine_totals(query_data, total_data):\n",
" \"\"\"\n",
" Take facets data from the query search and a blank search (ie everything) for a decade and combine them.\n",
" Parameters:\n",
" query_data - list of dictionaries containing facets data from a query search\n",
" total_data - list of dictionaries containing facets data from a blank search\n",
" Returns:\n",
" A list of dictionaries containing: 'year', 'total_results', 'total articles'\n",
" \"\"\"\n",
" combined_data = []\n",
" query_data = get_facets(query_data)\n",
" total_data = get_facets(total_data)\n",
" for index, query_row in enumerate(query_data):\n",
" total_row = total_data[index]\n",
" query_row[\"total_articles\"] = total_row[\"total_results\"]\n",
" combined_data.append(query_row)\n",
" return combined_data\n",
"\n",
"\n",
"def year_totals(params):\n",
" \"\"\"\n",
" Generate a dataset for a search query.\n",
" Parameters:\n",
" query - search query\n",
" Returns:\n",
" A Pandas dataframe with three columns -- year, total_results, total_articles -- and one row per year.\n",
" \"\"\"\n",
" totals = []\n",
" start_decade = math.floor(date_range.value[0] / 10)\n",
" end_decade = math.floor(date_range.value[1] / 10) + 1\n",
" query = params[\"q\"]\n",
" with results:\n",
" for decade in tqdm(range(start_decade, end_decade)):\n",
" params[\"l-decade\"] = decade\n",
" params[\"q\"] = query\n",
" query_data = get_results(params)\n",
" params[\"q\"] = \" \"\n",
" total_data = get_results(params)\n",
" combined_data = combine_totals(query_data, total_data)\n",
" totals.extend(combined_data)\n",
" totals.sort(key=itemgetter(\"year\"))\n",
" return totals"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set a date range"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"date_range = widgets.IntRangeSlider(\n",
" value=[1803, 1954],\n",
" min=1803,\n",
" max=2018,\n",
" step=1,\n",
" description=\"Date range:\",\n",
" disabled=False,\n",
" continuous_update=False,\n",
" orientation=\"horizontal\",\n",
" readout=True,\n",
" readout_format=\"0<4d\",\n",
" layout=widgets.Layout(width=\"50%\"),\n",
")\n",
"display(date_range)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add your search queries\n",
"\n",
"You can just add a single search query to see how the number of matching articles vary over time. But you can also compare frequencies between queries, states, and newspapers:\n",
"\n",
"* Compare queries — `cat` vs `dog`\n",
"* Compare states — `swimmers` in NSW, Victoria, and Queensland\n",
"* Compare newspapers — `protectionism` in *The Age* vs *The Argus*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"queries = []\n",
"out = widgets.Output()\n",
"\n",
"\n",
"@out.capture()\n",
"def add_query(b):\n",
" queries.append(query.value)\n",
" query.value = \"\"\n",
" print(\"Query {}: {}\".format(len(queries), queries[-1]))\n",
"\n",
"\n",
"query = widgets.Text(\n",
" placeholder=\"Enter your query then click the button to add\",\n",
" disabled=False,\n",
")\n",
"\n",
"query_button = widgets.Button(\n",
" description=\"Add query\", disabled=False, tooltip=\"Click to add query\", icon=\"\"\n",
")\n",
"\n",
"query_button.on_click(add_query)\n",
"query_tip = widgets.HTML(\n",
" value=\"A query can be anything you'd enter in the Trove simple search box — from a single keyword to a complex boolean expression. Add as many queries as you want.\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_titles(b):\n",
" params = {\"encoding\": \"json\", \"key\": api_key.value}\n",
" response = requests.get(\n",
" \"http://api.trove.nla.gov.au/v2/newspaper/titles\", params=params\n",
" )\n",
" data = response.json()\n",
" title_list = [\n",
" (t[\"title\"], {\"id\": t[\"id\"], \"title\": t[\"title\"]})\n",
" for t in data[\"response\"][\"records\"][\"newspaper\"]\n",
" ]\n",
" title_list.sort(key=itemgetter(0))\n",
" titles_sorted = OrderedDict(title_list)\n",
" titles.options = titles_sorted\n",
"\n",
"\n",
"title_query = widgets.Text(\n",
" placeholder=\"Enter your query\",\n",
" description=\"Search for:\",\n",
" disabled=False,\n",
")\n",
"titles = widgets.SelectMultiple(\n",
" options=[\"Click on button to load titles\"],\n",
" rows=10,\n",
" description=\"In:\",\n",
" disabled=False,\n",
" layout=widgets.Layout(width=\"50%\"),\n",
")\n",
"titles_button = widgets.Button(\n",
" description=\"Load titles\",\n",
" disabled=False,\n",
" button_style=\"\", # 'success', 'info', 'warning', 'danger' or ''\n",
" tooltip=\"Click to load titles\",\n",
" icon=\"\",\n",
")\n",
"titles_button.on_click(get_titles)\n",
"titles_tip = widgets.HTML(\n",
" value=\"Use Shift or Cmd/Ctrl to select multiple newspapers to compare.\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"state_query = widgets.Text(\n",
" placeholder=\"Enter your query\",\n",
" description=\"Search for:\",\n",
" disabled=False,\n",
")\n",
"\n",
"states = widgets.SelectMultiple(\n",
" options=[\n",
" \"ACT\",\n",
" \"New South Wales\",\n",
" \"Queensland\",\n",
" \"South Australia\",\n",
" \"Northern Territory\",\n",
" \"Tasmania\",\n",
" \"Victoria\",\n",
" \"Western Australia\",\n",
" \"National\",\n",
" \"International\",\n",
" ],\n",
" rows=10,\n",
" description=\"In:\",\n",
" disabled=False,\n",
" layout=widgets.Layout(width=\"50%\"),\n",
")\n",
"\n",
"states_tip = widgets.HTML(\n",
" value=\"Use Shift or Cmd/Ctrl to select multiple states to compare.\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def plot_raw_results(width=700, height=400):\n",
" chart = (\n",
" alt.Chart(df)\n",
" .mark_line(point=True)\n",
" .encode(\n",
" x=alt.X(\"year:Q\", axis=alt.Axis(format=\"c\", title=\"Year\")),\n",
" y=alt.Y(\n",
" \"total_results:Q\",\n",
" axis=alt.Axis(format=\",d\", title=\"Number of articles\"),\n",
" ),\n",
" color=alt.Color(\"query\", legend=alt.Legend(title=\"\")),\n",
" tooltip=[\n",
" alt.Tooltip(\"query\", title=\"Query:\"),\n",
" alt.Tooltip(\"year:Q\", title=\"Year\"),\n",
" alt.Tooltip(\"total_results:Q\", title=\"Articles\", format=\",\"),\n",
" ],\n",
" )\n",
" .properties(width=width, height=height)\n",
" )\n",
" return chart\n",
"\n",
"\n",
"def plot_relative_results(width=700, height=400):\n",
" chart = (\n",
" alt.Chart(df)\n",
" .mark_line(point=True)\n",
" .encode(\n",
" x=alt.X(\"year:Q\", axis=alt.Axis(format=\"c\", title=\"Year\")),\n",
" y=alt.Y(\n",
" \"PercentOfTotal:Q\",\n",
" axis=alt.Axis(format=\".2%\", title=\"Percentage of total articles\"),\n",
" ),\n",
" color=alt.Color(\"query\", legend=alt.Legend(title=\"\")),\n",
" tooltip=[\n",
" alt.Tooltip(\"query\", title=\"Query:\"),\n",
" alt.Tooltip(\"year:Q\", title=\"Year\"),\n",
" alt.Tooltip(\"PercentOfTotal:Q\", title=\"Articles\", format=\".2%\"),\n",
" ],\n",
" )\n",
" .properties(width=width, height=height)\n",
" .transform_calculate(\n",
" PercentOfTotal=\"datum.total_results / datum.total_articles\"\n",
" )\n",
" )\n",
" return chart"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def clear_all(b):\n",
" states.value = []\n",
" state_query.value = \"\"\n",
" titles.value = []\n",
" title_query.value = \"\"\n",
" out.clear_output()\n",
" queries.clear()\n",
" results.clear_output()\n",
" save_data.clear_output()\n",
"\n",
"\n",
"def get_data(b):\n",
" global df\n",
" results.clear_output()\n",
" save_data.clear_output()\n",
" traces = []\n",
" q_params = params.copy()\n",
" q_params[\"key\"] = api_key.value\n",
" if tab.selected_index == 0:\n",
" for query in queries:\n",
" q_params[\"q\"] = query\n",
" with results:\n",
" display(HTML(\"Searching for {}...\".format(query)))\n",
" totals = year_totals(q_params.copy())\n",
" df_totals = pd.DataFrame(totals)\n",
" df_totals[\"query\"] = query\n",
" traces.append(df_totals)\n",
" elif tab.selected_index == 1:\n",
" q_params[\"q\"] = state_query.value\n",
" for state in states.value:\n",
" q_params[\"l-state\"] = state\n",
" with results:\n",
" display(HTML(\"Searching in {}...\".format(state)))\n",
" totals = year_totals(q_params.copy())\n",
" df_totals = pd.DataFrame(totals)\n",
" df_totals[\"query\"] = state\n",
" traces.append(df_totals)\n",
" elif tab.selected_index == 2:\n",
" q_params[\"q\"] = title_query.value\n",
" for title in titles.value:\n",
" q_params[\"l-title\"] = title[\"id\"]\n",
" with results:\n",
" display(HTML(\"Searching in {}...\".format(title[\"title\"])))\n",
" totals = year_totals(q_params.copy())\n",
" df_totals = pd.DataFrame(totals)\n",
" df_totals[\"query\"] = title[\"title\"]\n",
" traces.append(df_totals)\n",
" try:\n",
" df = pd.concat(traces, ignore_index=True)\n",
" except ValueError:\n",
" with results:\n",
" display(HTML(\"No results!\"))\n",
" else:\n",
" results.clear_output()\n",
" chart = plot_raw_results()\n",
" chart_type.value = \"raw\"\n",
" csv_file = save_as_csv()\n",
" with results:\n",
" display(chart_type)\n",
" display(chart)\n",
" with save_data:\n",
" display(\n",
" HTML(f'Download data: {csv_file}')\n",
" )\n",
" display(\n",
" widgets.HBox([save_chart_button, save_chart_width, save_chart_height])\n",
" )\n",
"\n",
"\n",
"def save_chart(b):\n",
" width = save_chart_width.value\n",
" height = save_chart_height.value\n",
" if chart_type.value == \"proportion\":\n",
" chart = plot_relative_results(width, height)\n",
" else:\n",
" chart = plot_raw_results(width, height)\n",
" filename = \"data/querypic-{}.html\".format(int(time.time()))\n",
" chart.save(filename)\n",
" with save_data:\n",
" display(HTML(\"View HTML version:\"), FileLink(filename))\n",
"\n",
"\n",
"def save_as_csv():\n",
" filename = \"data/querypic-{}.csv\".format(int(time.time()))\n",
" df.to_csv(filename, index=False)\n",
" return filename\n",
"\n",
"\n",
"def change_chart(o):\n",
" results.clear_output(wait=True)\n",
" if chart_type.value == \"proportion\":\n",
" chart = plot_relative_results()\n",
" else:\n",
" chart = plot_raw_results()\n",
" with results:\n",
" display(chart_type)\n",
" display(chart)\n",
"\n",
"\n",
"chart_type = widgets.Dropdown(\n",
" options=[\n",
" (\"Raw number of results\", \"raw\"),\n",
" (\"Proportion of total articles\", \"proportion\"),\n",
" ],\n",
" value=\"raw\",\n",
")\n",
"\n",
"chart_type.observe(change_chart)\n",
"\n",
"clear_all_button = widgets.Button(\n",
" description=\"Clear all\",\n",
" disabled=False,\n",
" button_style=\"\", # 'success', 'info', 'warning', 'danger' or ''\n",
" tooltip=\"Clear current queries\",\n",
" icon=\"\",\n",
")\n",
"\n",
"get_data_button = widgets.Button(\n",
" description=\"Create chart\",\n",
" disabled=False,\n",
" button_style=\"primary\", # 'success', 'info', 'warning', 'danger' or ''\n",
" tooltip=\"Create chart\",\n",
" icon=\"\",\n",
")\n",
"\n",
"save_chart_button = widgets.Button(\n",
" description=\"Save chart\",\n",
" disabled=False,\n",
" button_style=\"primary\", # 'success', 'info', 'warning', 'danger' or ''\n",
" tooltip=\"Save chart as HTML\",\n",
" icon=\"\",\n",
")\n",
"\n",
"save_chart_width = widgets.BoundedIntText(\n",
" value=700, min=700, max=2000, step=100, description=\"Width\", disabled=False\n",
")\n",
"\n",
"save_chart_height = widgets.BoundedIntText(\n",
" value=400, min=400, max=1500, step=100, description=\"Height\", disabled=False\n",
")\n",
"\n",
"clear_all_button.on_click(clear_all)\n",
"get_data_button.on_click(get_data)\n",
"save_chart_button.on_click(save_chart)\n",
"tab1 = widgets.VBox([widgets.HBox([query, query_button]), query_tip, out])\n",
"tab2 = widgets.VBox([state_query, states, states_tip])\n",
"tab3 = widgets.VBox([title_query, widgets.HBox([titles, titles_button]), titles_tip])\n",
"\n",
"tab = widgets.Tab(children=[tab1, tab2, tab3])\n",
"tab.set_title(0, \"Compare queries\")\n",
"tab.set_title(1, \"Compare states\")\n",
"tab.set_title(2, \"Compare newspapers\")\n",
"display(\n",
" widgets.VBox(\n",
" [tab, widgets.HBox([get_data_button, clear_all_button]), results, save_data]\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# TESTING\n",
"\n",
"if os.getenv(\"GW_STATUS\") == \"dev\" and os.getenv(\"TROVE_API_KEY\"):\n",
" api_key.value = os.getenv(\"TROVE_API_KEY\")\n",
" query.value = \"cat\"\n",
" query_button.click()\n",
" get_data_button.click()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"----\n",
"\n",
"Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.github.io/). \n",
"Support this project by becoming a [GitHub sponsor](https://github.com/sponsors/wragge?o=esb)."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}