{ "cells": [ { "cell_type": "markdown", "id": "alien-arrow", "metadata": {}, "source": [ "# QueryPic DigitalNZ\n", "\n", "#### Visualise searches in Papers Past newspapers\n", "\n", "QueryPic helps you explore your search results in Papers Past by showing you how they change over time – aggregating the number of articles matching your query by year.\n", "\n", "" ] }, { "cell_type": "code", "execution_count": null, "id": "representative-equilibrium", "metadata": {}, "outputs": [], "source": [ "%%capture\n", "import requests\n", "from requests.adapters import HTTPAdapter\n", "from requests.packages.urllib3.util.retry import Retry\n", "import requests_cache\n", "import pandas as pd\n", "import altair as alt\n", "import arrow\n", "import os\n", "from IPython.display import display, HTML\n", "import ipywidgets as widgets\n", "\n", "# Make sure data directory exists\n", "os.makedirs('data', exist_ok=True)\n", "\n", "# Create a session that will automatically retry on server errors\n", "s = requests_cache.CachedSession('querypic', expire_after=60*60)\n", "retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])\n", "s.mount('http://', HTTPAdapter(max_retries=retries))\n", "s.mount('https://', HTTPAdapter(max_retries=retries))\n", "\n", "# CONFIG SO THAT ALTAIR HREFS OPEN IN A NEW TAB\n", "\n", "def blank_href():\n", " return {\n", " \"usermeta\": {\n", " \"embedOptions\": {\n", " 'loader': {'target': '_blank'}\n", " }\n", " }\n", " }\n", "\n", "\n", "# register the custom theme under a chosen name\n", "alt.themes.register('blank_href', blank_href)\n", "\n", "# enable the newly registered theme\n", "alt.themes.enable('blank_href')" ] }, { "cell_type": "code", "execution_count": null, "id": "distant-kuwait", "metadata": {}, "outputs": [], "source": [ "dfs = []\n", "queries = []\n", "titles = [\n", " 'All',\n", " 'Akaroa Mail and Banks Peninsula Advertiser',\n", " 'Albertland Gazette',\n", " 'Ashburton Guardian',\n", " 'Auckland Star',\n", " 'Bruce Herald',\n", " 'Bush Advocate',\n", " 'Clutha Leader',\n", " 'Colonist',\n", " 'Daily Southern Cross',\n", " 'Daily Telegraph',\n", " 'Ellesmere Guardian',\n", " 'Evening Post',\n", " 'Fair Play',\n", " 'Feilding Star',\n", " 'Grey River Argus',\n", " 'Hawera & Normanby Star',\n", " \"Hawke's Bay Herald\",\n", " \"Hawke's Bay Weekly Times\",\n", " 'Hutt News',\n", " 'Inangahua Times',\n", " 'Kai Tiaki',\n", " 'Kaipara and Waitemata Echo',\n", " 'Lyttelton Times',\n", " 'Manawatu Herald',\n", " 'Manawatu Standard',\n", " 'Manawatu Times',\n", " 'Marlborough Express',\n", " 'Mataura Ensign',\n", " 'NZ Truth',\n", " 'Nelson Evening Mail',\n", " 'New Zealand Advertiser and Bay of Islands Gazette',\n", " 'New Zealand Colonist and Port Nicholson Advertiser',\n", " 'New Zealand Free Lance',\n", " 'New Zealand Gazette and Wellington Spectator',\n", " 'New Zealand Illustrated Magazine',\n", " \"New Zealand Spectator and Cook's Strait Guardian\",\n", " 'New Zealand Tablet',\n", " 'New Zealander',\n", " 'North Otago Times',\n", " 'Northern Advocate',\n", " 'Observer',\n", " 'Ohinemuri Gazette',\n", " 'Otago Daily Times',\n", " 'Otago Witness',\n", " 'Otautau Standard and Wallace County Chronicle',\n", " 'Oxford Observer',\n", " 'Papers Past',\n", " 'Poverty Bay Herald',\n", " 'Progress',\n", " 'Rodney and Otamatea Times, Waitemata and Kaipara Gazette',\n", " 'Southland Times',\n", " 'Star',\n", " 'Taranaki Daily News',\n", " 'Taranaki Herald',\n", " 'Te Aroha News',\n", " 'Thames Star',\n", " 'Timaru Herald',\n", " 'Tuapeka Times',\n", " 'Waiapu Church Gazette',\n", " 'Waiapu Church Times',\n", " 'Waikato Times',\n", " 'Waimate Daily Advertiser',\n", " 'Wairarapa Daily Times',\n", " 'Wanganui Chronicle',\n", " 'Wanganui Herald',\n", " 'Wellington Independent',\n", " 'West Coast Times'\n", "]\n", "\n", "start_year = 1839\n", "end_year = 1945" ] }, { "cell_type": "code", "execution_count": null, "id": "ethical-swiss", "metadata": {}, "outputs": [], "source": [ "def get_titles_and_years():\n", " params = {\n", " 'api_key': api_key.value,\n", " 'text': '',\n", " 'and[display_collection][]': 'Papers Past',\n", " 'facets': 'collection,year',\n", " 'facets_per_page': 350\n", " }\n", " response = s.get('http://api.digitalnz.org/v3/records.json', params=params)\n", " data = response.json()\n", " titles = sorted(list(data['search']['facets']['collection'].keys()))\n", " titles.insert(0, 'All')\n", " years = sorted(list(data['search']['facets']['year'].keys()))\n", " return titles, years\n", "\n", "def get_data(query=''):\n", " params = {\n", " 'api_key': api_key.value,\n", " 'text': query,\n", " 'and[display_collection][]': 'Papers Past',\n", " 'facets': 'year',\n", " 'facets_per_page': 350\n", " }\n", " if select_newspaper.value != 'All':\n", " params['and[collection][]'] = select_newspaper.value\n", " response = s.get('http://api.digitalnz.org/v3/records.json', params=params)\n", " data = response.json()\n", " return data\n", "\n", "def fill_year_gaps(df, min_year=None, max_year=None):\n", " df.set_index('year', inplace=True)\n", " if not min_year:\n", " min_year = int(df.index.min())\n", " max_year = int(df.index.max())\n", " idx = sorted(list(range(min_year, max_year + 1)))\n", " df = df.reindex(idx).reset_index()\n", " return df, min_year, max_year\n", "\n", "def create_year_df(data, col_name='total_articles', min_year=None, max_year=None):\n", " years = data['search']['facets']['year']\n", " df = pd.Series(years).to_frame().reset_index()\n", " df.columns = ['year', col_name]\n", " df['year'] = df['year'].astype('int64')\n", " df, min_year, max_year = fill_year_gaps(df)\n", " # Add search url\n", " # years_df['url'] = 'https://paperspast.natlib.govt.nz/newspapers?query={0}&start_date=01-01-{1}&end_date=31-12-{1}'.format(params['text'], years_df['year'][0])\n", " return df, min_year, max_year\n", "\n", "def prepare_data(b):\n", " global dfs, queries\n", " query_id = f'Query {len(queries) + 1}'\n", " if select_newspaper.value == 'All':\n", " query_url = f'https://digitalnz.org/records?text={query.value}&i[primary_collection]=Papers%20Past'\n", " else:\n", " query_url = f'https://digitalnz.org/records?text={query.value}&i[primary_collection]=Papers%20Past&i[collection]={select_newspaper.value}'\n", " queries.append({'x': 0, 'y': len(queries), 'id': query_id, 'url': query_url})\n", " totals = get_data()\n", " totals_df, min_year, max_year = create_year_df(totals)\n", " years = get_data(query.value)\n", " try:\n", " years_df, _, _ = create_year_df(years, 'total_results')\n", " except ValueError:\n", " with results:\n", " display('No results')\n", " else:\n", " df = pd.merge(totals_df, years_df, how='left', on='year')\n", " df['query_id'] = query_id\n", " df['url'] = df['year'].apply(lambda x: f'{query_url}&i[year]={x}')\n", " dfs.append(df.loc[(df['year'] >= int(date_range.value[0])) & (df['year'] <= int(date_range.value[1]))])\n", " show_results()\n", " \n", "def show_results(view='raw'):\n", " '''\n", " Display the chart and the save data options.\n", " '''\n", " results.clear_output(wait=True)\n", " save_data.clear_output(wait=True)\n", " chart = make_chart(view=view)\n", " chart_type.unobserve(change_chart, 'value')\n", " chart_type.value = 'raw'\n", " chart_type.observe(change_chart, 'value')\n", " csv_file = save_as_csv()\n", " with results:\n", " display(chart_type)\n", " display(chart)\n", " with save_data:\n", " display(widgets.HBox([save_chart_button, save_chart_width, save_chart_height]), layout=widgets.Layout(margin='50px 0 50px 0'))\n", " display(HTML(f'Download data: {csv_file}'))\n", " \n", "def list_queries():\n", " '''\n", " Creates a text-based chart that lists the saved queries.\n", " '''\n", " df = pd.DataFrame(queries)\n", " chart = alt.Chart(df).mark_text(align='left', dx=2, dy=1, baseline='middle').encode(\n", " x=alt.X('x:Q', title=None, axis=None, scale=alt.Scale(range=[0,1])),\n", " y=alt.Y('id:O', title=None, axis=alt.Axis(labelFontWeight='bold', domain=False, grid=False)),\n", " text='url:N',\n", " href='url',\n", " color=alt.value('blue')\n", " )\n", " return chart\n", " \n", "def make_chart(view, width=800, height=400):\n", " df = pd.concat(dfs, ignore_index=True)\n", " if view == 'raw':\n", " y = alt.Y('total_results:Q', title='Number of results')\n", " else:\n", " y = alt.Y('ratio:Q', axis=alt.Axis(format='.2%'), title='Percentage of total articles')\n", " plot = alt.Chart(df, width=600).mark_line(point=True, interpolate='cardinal'\n", " ).transform_calculate(\n", " ratio='datum.total_results / datum.total_articles'\n", " ).encode(\n", " x = alt.X('year:O', title='Year'),\n", " y = y,\n", " color = alt.Color('query_id', legend=alt.Legend(title='Query')),\n", " tooltip = [\n", " alt.Tooltip('query_id:N', title='Query'), \n", " alt.Tooltip('year:O', title='Year'), \n", " alt.Tooltip('total_results', format=',', title='Number of results'), \n", " alt.Tooltip('ratio:Q', format='.2%', title='Percentage of articles')\n", " ],\n", " href='url:N'\n", " ).properties(\n", " height=height,\n", " width=width,\n", " title={\n", " 'text': 'Papers Past Newspapers Search (via DigitalNZ)',\n", " 'subtitle': f'Created by QueryPic: {arrow.now().format(\"D MMMM YYYY\")}'\n", " }\n", " )\n", " \n", " # Create text chart listing queries\n", " query_list = list_queries()\n", " # Combine charts\n", " chart = alt.vconcat(plot, query_list).configure(padding=20\n", " ).configure_view(\n", " strokeWidth=0\n", " ).configure_title(\n", " fontSize=14\n", " )\n", " '''\n", " c2 = alt.Chart(titles_df[1:11]).mark_bar().encode(\n", " x = 'count:Q',\n", " y = 'title:O',\n", " tooltip = alt.Tooltip('count', format=',')\n", " ).properties(\n", " height=300,\n", " width=200\n", " )\n", " '''\n", " return chart\n", " \n", "def change_chart(o):\n", " '''\n", " Switch between chart views.\n", " '''\n", " results.clear_output()\n", " if chart_type.value == 'proportion':\n", " view = 'relative'\n", " else:\n", " view = 'raw'\n", " chart = make_chart(view)\n", " # chart_type.value = view\n", " with results:\n", " display(chart_type)\n", " display(chart)\n", " \n", "def clear_all(b):\n", " '''\n", " Clear all queries and results.\n", " '''\n", " global dfs, queries\n", " dfs = []\n", " queries = []\n", " query.value = ''\n", " select_newspaper.value = 'All'\n", " date_range.value = [start_year, end_year]\n", " results.clear_output()\n", " save_data.clear_output()\n", " \n", "def clear_last(b):\n", " '''\n", " Remove the most recent query from the chart.\n", " '''\n", " global dfs, queries\n", " results.clear_output()\n", " save_data.clear_output()\n", " dfs.pop()\n", " queries.pop()\n", " if dfs:\n", " show_results()\n", " \n", "def save_chart(b):\n", " '''\n", " Save the chart as HTML for download.\n", " '''\n", " width = save_chart_width.value\n", " height = save_chart_height.value\n", " if chart_type.value == 'proportion':\n", " chart = make_chart('relative', width, height)\n", " else:\n", " chart = make_chart('raw', width, height)\n", " filename = f'data/querypic-{arrow.now().format(\"YYYYMMDDHHmmss\")}.html'\n", " chart.save(filename)\n", " with save_data:\n", " display(HTML(f'Download HTML version: {filename}'))\n", "\n", "def save_as_csv():\n", " '''\n", " Save harvested data as a CSV for download.\n", " '''\n", " df = pd.concat(dfs, ignore_index=True)\n", " filename = f'data/querypic-{arrow.now().format(\"YYYYMMDDHHmmss\")}.csv'\n", " df.to_csv(filename, index=False)\n", " return filename" ] }, { "cell_type": "code", "execution_count": null, "id": "owned-marshall", "metadata": {}, "outputs": [], "source": [ "#titles, years = get_titles_and_years()\n", "#start_year = int(years[0])\n", "#end_year = int(years[-1])\n", "\n", "api_key = widgets.Password(\n", " placeholder='Enter your DigitalNZ API key',\n", " description='API key:',\n", " disabled=False,\n", " value=''\n", ")\n", "\n", "query = widgets.Text(\n", " value='',\n", " placeholder='enter your search query',\n", " description='Keywords:',\n", " disabled=False\n", ")\n", "\n", "chart_type = widgets.Dropdown(\n", " options=[('Raw number of results', 'raw'), ('Proportion of total articles', 'proportion')],\n", " value='raw'\n", ")\n", "\n", "chart_type.observe(change_chart, 'value')\n", "\n", "search_button = widgets.Button(\n", " description='Create chart',\n", " disabled=False,\n", " button_style='primary', # 'success', 'info', 'warning', 'danger' or ''\n", " tooltip='Create a chart',\n", " icon=''\n", " )\n", "\n", "select_newspaper = widgets.Dropdown(\n", " description='Newspaper:',\n", " options=titles,\n", " value='All'\n", ")\n", "\n", "date_range = widgets.IntRangeSlider(\n", " value=[start_year, end_year],\n", " min=start_year,\n", " max=end_year,\n", " step=1,\n", " description='Date range:',\n", " disabled=False,\n", " continuous_update=False,\n", " orientation='horizontal',\n", " readout=True,\n", " readout_format='0<4d',\n", " layout=widgets.Layout(width='50%')\n", ")\n", "\n", "clear_last_button = widgets.Button(\n", " description='Remove last query',\n", " disabled=False,\n", " button_style='', # 'success', 'info', 'warning', 'danger' or ''\n", " tooltip='Remove the last query',\n", " icon=''\n", " )\n", " \n", "clear_all_button = widgets.Button(\n", " description='Clear all queries',\n", " disabled=False,\n", " button_style='', # 'success', 'info', 'warning', 'danger' or ''\n", " tooltip='Clear current queries',\n", " icon='',\n", " \n", " )\n", "\n", "save_chart_button = widgets.Button(\n", " description='Save chart as HTML',\n", " disabled=False,\n", " button_style='primary', # 'success', 'info', 'warning', 'danger' or ''\n", " tooltip='Save chart as HTML',\n", " icon=''\n", " )\n", "\n", "save_chart_width = widgets.BoundedIntText(\n", " value=700,\n", " min=700,\n", " max=2000,\n", " step=100,\n", " description='Width',\n", " disabled=False\n", ")\n", "\n", "save_chart_height = widgets.BoundedIntText(\n", " value=400,\n", " min=400,\n", " max=1500,\n", " step=100,\n", " description='Height',\n", " disabled=False\n", ")\n", "\n", "results = widgets.Output()\n", "save_data = widgets.Output()\n", "\n", "search_button.on_click(prepare_data)\n", "clear_last_button.on_click(clear_last)\n", "clear_all_button.on_click(clear_all)\n", "save_chart_button.on_click(save_chart)\n", "\n", "display(api_key)\n", "display(widgets.HBox([query, select_newspaper]))\n", "display(date_range)\n", "display(widgets.VBox([widgets.HBox([search_button, clear_last_button, clear_all_button], layout=widgets.Layout(margin='20px 0 20px 0')), results, save_data]))" ] }, { "cell_type": "markdown", "id": "30b1912f-53fc-4ee5-abc0-48faf867fec2", "metadata": {}, "source": [ "----\n", "\n", "Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/). Support this project by becoming a [GitHub sponsor](https://github.com/sponsors/wragge?o=esb)." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" }, "voila": { "template": "material" } }, "nbformat": 4, "nbformat_minor": 5 }