{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# QueryPic deconstructed\n",
    "#### Visualise searches in Trove's digitised newspapers"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[QueryPic](http://dhistory.org/querypic/) is a tool I created many years ago to visualise searches in Trove's digitised newspapers. It shows you the number of articles each year that match your query — instead of a page of search results, you see the complete result set. You can look for patterns and trends across time.\n",
    "\n",
    "This is a deconstructed, extended, and hackable version of QueryPic."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from requests.exceptions import HTTPError, Timeout\n",
    "import os\n",
    "import ipywidgets as widgets\n",
    "from operator import itemgetter # used for sorting\n",
    "import pandas as pd # makes manipulating the data easier\n",
    "import altair as alt\n",
    "from requests.adapters import HTTPAdapter\n",
    "from requests.packages.urllib3.util.retry import Retry\n",
    "from tqdm import tqdm_notebook\n",
    "from IPython.display import display, HTML, FileLink, clear_output\n",
    "import math\n",
    "from collections import OrderedDict\n",
    "import time\n",
    "\n",
    "# Make sure data directory exists\n",
    "os.makedirs('data', exist_ok=True)\n",
    "\n",
    "# Set up Altair\n",
    "alt.renderers.enable('notebook')\n",
    "\n",
    "# Create a session that will automatically retry on server errors\n",
    "s = requests.Session()\n",
    "retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])\n",
    "s.mount('http://', HTTPAdapter(max_retries=retries))\n",
    "s.mount('https://', HTTPAdapter(max_retries=retries))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Enter your Trove API key\n",
    "\n",
    "Get your own [Trove API key](http://help.nla.gov.au/trove/building-with-trove/api) and enter it below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "api_key = widgets.Text(\n",
    "    placeholder='Enter your Trove API key',\n",
    "    description='API key:',\n",
    "    disabled=False\n",
    ")\n",
    "display(api_key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = {\n",
    "    'q': ' ', # A space to search for everything\n",
    "    'facet': 'year',\n",
    "    'zone': 'newspaper',\n",
    "    'l-category': 'Article',\n",
    "    'encoding': 'json',\n",
    "    'n': 0\n",
    "}\n",
    "\n",
    "results = widgets.Output()\n",
    "save_data = widgets.Output()\n",
    "df = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_results(params):\n",
    "    '''\n",
    "    Get JSON response data from the Trove API.\n",
    "    Parameters:\n",
    "        params\n",
    "    Returns:\n",
    "        JSON formatted response data from Trove API \n",
    "    '''\n",
    "    response = s.get('https://api.trove.nla.gov.au/v2/result', params=params, timeout=30)\n",
    "    response.raise_for_status()\n",
    "    # print(response.url) # This shows us the url that's sent to the API\n",
    "    data = response.json()\n",
    "    return data\n",
    "\n",
    "def get_facets(data):\n",
    "    '''\n",
    "    Loop through facets in Trove API response, saving terms and counts.\n",
    "    Parameters:\n",
    "        data  - JSON formatted response data from Trove API  \n",
    "    Returns:\n",
    "        A list of dictionaries containing: 'year', 'total_results'\n",
    "    '''\n",
    "    facets = []\n",
    "    try:\n",
    "        for term in data['response']['zone'][0]['facets']['facet']['term']:\n",
    "            if int(term['display']) >= date_range.value[0] and int(term['display']) <= date_range.value[1]:\n",
    "                facets.append({'year': int(term['display']), 'total_results': int(term['count'])})\n",
    "        facets.sort(key=itemgetter('year'))\n",
    "    except TypeError:\n",
    "        pass\n",
    "    return facets\n",
    "\n",
    "def combine_totals(query_data, total_data):\n",
    "    '''\n",
    "    Take facets data from the query search and a blank search (ie everything) for a decade and combine them.\n",
    "    Parameters:\n",
    "        query_data    - list of dictionaries containing facets data from a query search\n",
    "        total_data    - list of dictionaries containing facets data from a blank search\n",
    "    Returns:\n",
    "        A list of dictionaries containing: 'year', 'total_results', 'total articles' \n",
    "    '''\n",
    "    combined_data = []\n",
    "    query_data = get_facets(query_data)\n",
    "    total_data = get_facets(total_data)\n",
    "    for index, query_row in enumerate(query_data):\n",
    "        total_row = total_data[index]\n",
    "        query_row['total_articles'] = total_row['total_results']\n",
    "        combined_data.append(query_row)\n",
    "    return combined_data \n",
    "\n",
    "def year_totals(params):\n",
    "    '''\n",
    "    Generate a dataset for a search query.\n",
    "    Parameters:\n",
    "        query    - search query\n",
    "    Returns:\n",
    "        A Pandas dataframe with three columns -- year, total_results, total_articles -- and one row per year.\n",
    "    '''\n",
    "    totals = []\n",
    "    start_decade = math.floor(date_range.value[0] / 10)\n",
    "    end_decade = math.floor(date_range.value[1] / 10) + 1\n",
    "    query = params['q']\n",
    "    with results:\n",
    "        for decade in tqdm_notebook(range(start_decade, end_decade)):\n",
    "            params['l-decade'] = decade\n",
    "            params['q'] = query\n",
    "            query_data = get_results(params)\n",
    "            params['q'] = ' '\n",
    "            total_data = get_results(params)\n",
    "            combined_data = combine_totals(query_data, total_data)\n",
    "            totals.extend(combined_data)\n",
    "    totals.sort(key=itemgetter('year'))\n",
    "    return totals\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set a date range"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "date_range = widgets.IntRangeSlider(\n",
    "    value=[1803, 1954],\n",
    "    min=1803,\n",
    "    max=2018,\n",
    "    step=1,\n",
    "    description='Date range:',\n",
    "    disabled=False,\n",
    "    continuous_update=False,\n",
    "    orientation='horizontal',\n",
    "    readout=True,\n",
    "    readout_format='0<4d',\n",
    "    layout=widgets.Layout(width='50%')\n",
    ")\n",
    "display(date_range)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Add your search queries\n",
    "\n",
    "You can just add a single search query to see how the number of matching articles vary over time. But you can also compare frequencies between queries, states, and newspapers:\n",
    "\n",
    "*   Compare queries — `cat` vs `dog`\n",
    "*   Compare states — `swimmers` in NSW, Victoria, and Queensland\n",
    "*   Compare newspapers — `protectionism` in *The Age* vs *The Argus*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "queries = []\n",
    "out = widgets.Output()\n",
    "\n",
    "@out.capture()\n",
    "def add_query(b):\n",
    "    queries.append(query.value)\n",
    "    query.value = ''\n",
    "    print('Query {}: {}'.format(len(queries), queries[-1]))\n",
    "\n",
    "query = widgets.Text(\n",
    "        placeholder='Enter your query then click the button to add',\n",
    "        disabled=False,\n",
    "    )\n",
    "\n",
    "query_button = widgets.Button(\n",
    "        description='Add query',\n",
    "        disabled=False,\n",
    "        tooltip='Click to add query',\n",
    "        icon=''\n",
    "    )\n",
    "\n",
    "query_button.on_click(add_query)\n",
    "query_tip = widgets.HTML(value='A query can be anything you\\'d enter in the Trove simple search box — from a single keyword to a complex boolean expression. Add as many queries as you want.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_titles(b):\n",
    "    params = {\n",
    "    'encoding': 'json',\n",
    "    'key': api_key.value\n",
    "    }\n",
    "    response = requests.get('http://api.trove.nla.gov.au/v2/newspaper/titles', params=params)\n",
    "    data = response.json()\n",
    "    title_list = [(t['title'], {'id': t['id'], 'title': t['title']}) for t in data['response']['records']['newspaper']]\n",
    "    title_list.sort(key=itemgetter(0))\n",
    "    titles_sorted = OrderedDict(title_list)\n",
    "    titles.options = titles_sorted\n",
    "    \n",
    "title_query = widgets.Text(\n",
    "        placeholder='Enter your query',\n",
    "        description='Search for:',\n",
    "        disabled=False,\n",
    "    )\n",
    "titles = widgets.SelectMultiple(\n",
    "        options=['Click on button to load titles'],\n",
    "        rows=10,\n",
    "        description='In:',\n",
    "        disabled=False,\n",
    "        layout=widgets.Layout(width='50%')\n",
    "    )\n",
    "titles_button = widgets.Button(\n",
    "        description='Load titles',\n",
    "        disabled=False,\n",
    "        button_style='', # 'success', 'info', 'warning', 'danger' or ''\n",
    "        tooltip='Click to load titles',\n",
    "        icon=''\n",
    "    )\n",
    "titles_button.on_click(get_titles)\n",
    "titles_tip = widgets.HTML(value='Use <b>Shift</b> or <b>Cmd/Ctrl</b> to select multiple newspapers to compare.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "state_query = widgets.Text(\n",
    "        placeholder='Enter your query',\n",
    "        description='Search for:',\n",
    "        disabled=False,\n",
    "    )\n",
    "    \n",
    "states = widgets.SelectMultiple(\n",
    "    options=[\n",
    "            'ACT',\n",
    "            'New South Wales',\n",
    "            'Queensland',\n",
    "            'South Australia',\n",
    "            'Northern Territory',\n",
    "            'Tasmania',\n",
    "            'Victoria',\n",
    "            'Western Australia',\n",
    "            'National',\n",
    "            'International'\n",
    "    ],\n",
    "    rows=10,\n",
    "    description='In:',\n",
    "    disabled=False,\n",
    "    layout=widgets.Layout(width='50%')\n",
    ")\n",
    "\n",
    "states_tip = widgets.HTML(value='Use <b>Shift</b> or <b>Cmd/Ctrl</b> to select multiple states to compare.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_raw_results(width=700, height=400):\n",
    "    chart = alt.Chart(df).mark_line(point=True).encode(\n",
    "        x=alt.X('year:Q', axis=alt.Axis(format='c', title='Year')),\n",
    "        y=alt.Y('total_results:Q', axis=alt.Axis(format=',d', title='Number of articles')),\n",
    "        color=alt.Color('query', legend=alt.Legend(title='')),\n",
    "        tooltip=[alt.Tooltip('query', title='Query:'), alt.Tooltip('year:Q', title='Year'), alt.Tooltip('total_results:Q', title='Articles', format=',')]\n",
    "    ).properties(width=width, height=height).interactive()\n",
    "    return chart\n",
    "\n",
    "def plot_relative_results(width=700, height=400):\n",
    "    chart = alt.Chart(df).mark_line(point=True).encode(\n",
    "        x=alt.X('year:Q', axis=alt.Axis(format='c', title='Year')),\n",
    "        y=alt.Y('PercentOfTotal:Q', axis=alt.Axis(format='.2%', title='Percentage of total articles')),\n",
    "        color=alt.Color('query', legend=alt.Legend(title='')),\n",
    "        tooltip=[alt.Tooltip('query', title='Query:'), alt.Tooltip('year:Q', title='Year'), alt.Tooltip('PercentOfTotal:Q', title='Articles', format='.2%')]\n",
    "    ).properties(width=width, height=height).transform_calculate(\n",
    "        PercentOfTotal=\"datum.total_results / datum.total_articles\"\n",
    "    ).interactive()\n",
    "    return chart"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clear_all(b):\n",
    "    states.value = []\n",
    "    state_query.value = ''\n",
    "    titles.value = []\n",
    "    title_query.value = ''\n",
    "    out.clear_output()\n",
    "    queries.clear()\n",
    "    results.clear_output()\n",
    "\n",
    "def get_data(b):\n",
    "    global df\n",
    "    results.clear_output()\n",
    "    traces = []\n",
    "    q_params = params.copy()\n",
    "    q_params['key'] = api_key.value\n",
    "    if tab.selected_index == 0:\n",
    "        for query in queries:\n",
    "            q_params['q'] = query\n",
    "            with results:\n",
    "                display(HTML('Searching for {}...'.format(query)))\n",
    "            totals = year_totals(q_params.copy())\n",
    "            df_totals = pd.DataFrame(totals)\n",
    "            df_totals['query'] = query\n",
    "            traces.append(df_totals)\n",
    "    elif tab.selected_index == 1:\n",
    "        q_params['q'] = state_query.value\n",
    "        for state in states.value:\n",
    "            q_params['l-state'] = state\n",
    "            with results:\n",
    "                display(HTML('Searching in {}...'.format(state)))\n",
    "            totals = year_totals(q_params.copy())\n",
    "            df_totals = pd.DataFrame(totals)\n",
    "            df_totals['query'] = state\n",
    "            traces.append(df_totals)\n",
    "    elif tab.selected_index == 2:\n",
    "        q_params['q'] = title_query.value\n",
    "        for title in titles.value:\n",
    "            q_params['l-title'] = title['id']\n",
    "            with results:\n",
    "                display(HTML('Searching in {}...'.format(title['title'])))\n",
    "            totals = year_totals(q_params.copy())\n",
    "            df_totals = pd.DataFrame(totals)\n",
    "            df_totals['query'] = title['title']\n",
    "            traces.append(df_totals)\n",
    "    try:\n",
    "        df = pd.concat(traces, ignore_index=True)\n",
    "    except ValueError:\n",
    "        with results:\n",
    "            display(HTML('No results!'))\n",
    "    else:\n",
    "        results.clear_output(wait=True)\n",
    "        chart = plot_relative_results()\n",
    "        chart_type.value = 'proportion'\n",
    "        csv_file = save_as_csv()\n",
    "        with results:\n",
    "            display(chart_type)\n",
    "            display(chart)\n",
    "        with save_data:\n",
    "            display(HTML('Download data:'), FileLink(csv_file))\n",
    "            display(widgets.HBox([save_chart_button, save_chart_width, save_chart_height]))\n",
    "\n",
    "def save_chart(b):\n",
    "    width = save_chart_width.value\n",
    "    height = save_chart_height.value\n",
    "    if chart_type.value == 'proportion':\n",
    "        chart = plot_relative_results(width, height)\n",
    "    else:\n",
    "        chart = plot_raw_results(width, height)\n",
    "    filename = 'data/querypic-{}.html'.format(int(time.time()))\n",
    "    chart.save(filename)\n",
    "    display(HTML('View HTML version:'), FileLink(filename))\n",
    "    \n",
    "        \n",
    "def save_as_csv():\n",
    "    filename = 'data/querypic-{}.csv'.format(int(time.time()))\n",
    "    df.to_csv(filename, index=False)\n",
    "    return filename\n",
    "\n",
    "def change_chart(o):\n",
    "    results.clear_output(wait=True)\n",
    "    if chart_type.value == 'proportion':\n",
    "        chart = plot_relative_results()\n",
    "    else:\n",
    "        chart = plot_raw_results()\n",
    "    with results:\n",
    "        display(chart_type)\n",
    "        display(chart)\n",
    "\n",
    "chart_type = widgets.Dropdown(\n",
    "        options=[('Raw number of results', 'raw'), ('Proportion of total articles', 'proportion')],\n",
    "        value='proportion'\n",
    "    )\n",
    "\n",
    "chart_type.observe(change_chart)\n",
    "    \n",
    "clear_all_button = widgets.Button(\n",
    "        description='Clear all',\n",
    "        disabled=False,\n",
    "        button_style='', # 'success', 'info', 'warning', 'danger' or ''\n",
    "        tooltip='Clear current queries',\n",
    "        icon=''\n",
    "    )\n",
    "\n",
    "get_data_button = widgets.Button(\n",
    "        description='Create chart',\n",
    "        disabled=False,\n",
    "        button_style='primary', # 'success', 'info', 'warning', 'danger' or ''\n",
    "        tooltip='Create chart',\n",
    "        icon=''\n",
    "    )\n",
    "\n",
    "save_chart_button = widgets.Button(\n",
    "        description='Save chart',\n",
    "        disabled=False,\n",
    "        button_style='primary', # 'success', 'info', 'warning', 'danger' or ''\n",
    "        tooltip='Save chart as HTML',\n",
    "        icon=''\n",
    "    )\n",
    "\n",
    "save_chart_width = widgets.BoundedIntText(\n",
    "    value=700,\n",
    "    min=700,\n",
    "    max=2000,\n",
    "    step=100,\n",
    "    description='Width',\n",
    "    disabled=False\n",
    ")\n",
    "\n",
    "save_chart_height = widgets.BoundedIntText(\n",
    "    value=400,\n",
    "    min=400,\n",
    "    max=1500,\n",
    "    step=100,\n",
    "    description='Height',\n",
    "    disabled=False\n",
    ")\n",
    "\n",
    "clear_all_button.on_click(clear_all)\n",
    "get_data_button.on_click(get_data)\n",
    "save_chart_button.on_click(save_chart)\n",
    "tab1 = widgets.VBox([widgets.HBox([query, query_button]), query_tip, out])\n",
    "tab2 = widgets.VBox([state_query, states, states_tip])\n",
    "tab3 = widgets.VBox([title_query, widgets.HBox([titles, titles_button]), titles_tip])\n",
    "\n",
    "tab = widgets.Tab(children=[tab1, tab2, tab3])\n",
    "tab.set_title(0, 'Compare queries')\n",
    "tab.set_title(1, 'Compare states')\n",
    "tab.set_title(2, 'Compare newspapers')\n",
    "display(widgets.VBox([tab, widgets.HBox([get_data_button, clear_all_button]), results, save_data]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}