{ "cells": [
 { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [
  "# Warning! Code ahead!"
 ] },
 { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "-" } }, "source": [
  "#### Some setting up...\n",
  "\n",
  "Eeek! We're going to run **live** code during this presentation. To run a code cell (like the one below), just hover over it and then click on the play icon that pops up in the left margin. Do this for every code cell you see!"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [
  "import os\n",
  "import requests\n",
  "import pandas as pd\n",
  "import json\n",
  "import altair as alt\n",
  "from tqdm import tnrange, trange\n",
  "import folium\n",
  "from folium.plugins import HeatMapWithTime\n",
  "from IPython.display import display, HTML\n",
  "# Use your own Trove key if TROVE_API_KEY is set in the environment;\n",
  "# otherwise fall back to the shared demo key so the slides still run as-is.\n",
  "api_key = os.environ.get('TROVE_API_KEY', 'ju3rgk0jp354ikmh')\n",
  "alt.renderers.enable('notebook')"
 ] },
 { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [
  "### Tell me Trove, how many newspaper articles do you have about 'influenza'?"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [
  "# Ask for a single record plus the per-state facet counts (reused by the map below)\n",
  "params = {\n",
  "    'q': 'influenza',\n",
  "    'zone': 'newspaper',\n",
  "    'encoding': 'json',\n",
  "    'facet': 'state',\n",
  "    'n': '1',\n",
  "    'key': api_key\n",
  "}\n",
  "response = requests.get('http://api.trove.nla.gov.au/v2/result', params=params)\n",
  "# Stop here with a clear HTTP error rather than a confusing KeyError further down\n",
  "response.raise_for_status()\n",
  "data = response.json()\n",
  "total = int(data['response']['zone'][0]['records']['total'])\n",
  "display(HTML('<p>There are {:,} articles Tim.</p>'.format(total)))"
 ] },
 { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [
  "### Can you show the number of results in each state?"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [
  "# Make a map\n",
  "# The state facet came back with the previous request, so no new API call is needed\n",
  "facets = data['response']['zone'][0]['facets']['facet']['term']\n",
  "df = pd.DataFrame(facets)\n",
  "df['count'] = pd.to_numeric(df['count'], errors='coerce')\n",
  "# Presumably the GeoJSON spells this name out in full; expand it so the lookup on STATE_NAME can match\n",
  "df = df.replace('ACT', 'Australian Capital Territory')\n",
  "with open('data/aus_state.geojson', \"r\") as geo_file:\n",
  "    geo_data = json.load(geo_file)\n",
  "# Join each state's shape to its article count via the state name, then shade by count\n",
  "c1 = alt.Chart(alt.Data(values=geo_data['features'])\n",
  "    ).mark_geoshape(stroke='black', strokeWidth=0.2\n",
  "    ).encode(color=alt.Color('count:Q', scale=alt.Scale(scheme='greenblue'), legend=alt.Legend(title='Total articles'))\n",
  "    ).transform_lookup(lookup='properties.STATE_NAME', from_=alt.LookupData(df, 'display', ['count'])\n",
  "    ).project(type='mercator'\n",
  "    ).properties(width=600, height=400)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [
  "# Display the map\n",
  "c1"
 ] },
 { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [
  "### Can you break them down by category?"
] },
 { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [
  "# Get some data\n",
  "# Same search as before, but faceted by article category instead of state\n",
  "params = {\n",
  "    'q': 'influenza',\n",
  "    'zone': 'newspaper',\n",
  "    'encoding': 'json',\n",
  "    'facet': 'category',\n",
  "    'n': '1',\n",
  "    'key': api_key\n",
  "}\n",
  "response = requests.get('http://api.trove.nla.gov.au/v2/result', params=params)\n",
  "# Stop here with a clear HTTP error rather than a confusing KeyError in the next cell\n",
  "response.raise_for_status()\n",
  "data = response.json()"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [
  "# Make a chart\n",
  "facets = data['response']['zone'][0]['facets']['facet']['term']\n",
  "df = pd.DataFrame(facets)\n",
  "# Make the counts numeric before plotting them on a quantitative axis\n",
  "df['count'] = pd.to_numeric(df['count'], errors='coerce')\n",
  "c2 = alt.Chart(df).mark_bar().encode(\n",
  "    x=alt.X('count:Q', title='Number of articles'),\n",
  "    y=alt.Y('display:N', title='Category'),\n",
  "    tooltip=['count:Q']\n",
  ")"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [
  "# Display the chart\n",
  "c2"
 ] },
 { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [
  "### Can you show change over time?"
] },
 { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [
  "# Get some data\n",
  "# Values for Trove's l-decade facet; the loop's end is exclusive, so 188 to 193 are requested\n",
  "start = 188\n",
  "end = 194\n",
  "years = []\n",
  "params = {\n",
  "    'q': 'influenza',\n",
  "    'facet': 'year',\n",
  "    'zone': 'newspaper',\n",
  "    'l-category': 'Article',\n",
  "    'key': api_key,\n",
  "    'encoding': 'json',\n",
  "    'n': 0  # Only the facet counts are read below, so request no article records\n",
  "}\n",
  "# One request per decade; each response contributes that decade's year facet terms\n",
  "for decade in trange(start, end):\n",
  "    params['l-decade'] = decade\n",
  "    response = requests.get('http://api.trove.nla.gov.au/v2/result', params=params)\n",
  "    # Stop here with a clear HTTP error rather than a confusing KeyError below\n",
  "    response.raise_for_status()\n",
  "    data = response.json()\n",
  "    years += data['response']['zone'][0]['facets']['facet']['term']\n",
  "df = pd.DataFrame(years)\n",
  "df.head()"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [
  "# Make a chart\n",
  "df['display'] = pd.to_datetime(df['display'], format='%Y', errors='coerce')\n",
  "# Convert the counts to numbers, as the other chart cells do, before plotting them as quantitative\n",
  "df['count'] = pd.to_numeric(df['count'], errors='coerce')\n",
  "c3 = alt.Chart(df).mark_line().encode(\n",
  "    x='year(display):T',\n",
  "    y='count:Q',\n",
  "    tooltip=[alt.Tooltip('year(display):T', format='%Y', title='Year'), alt.Tooltip('count:Q', title='Articles')]\n",
  ")"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [
  "# Display the chart\n",
  "c3"
 ] },
 { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [
  "### But how do we know there weren't just more newspapers published in 1919?\n",
  "\n",
  "We could try dividing the number of results by the total number of articles published that year..."
] },
 { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [
  "# Get some data\n",
  "# Re-run the per-decade query with a single space as the search term, which matches everything,\n",
  "# to get the total number of articles for each year\n",
  "params['q'] = ' '\n",
  "years = []\n",
  "for decade in trange(start, end):\n",
  "    params['l-decade'] = decade\n",
  "    response = requests.get('http://api.trove.nla.gov.au/v2/result', params=params)\n",
  "    # Stop here with a clear HTTP error rather than a confusing KeyError below\n",
  "    response.raise_for_status()\n",
  "    data = response.json()\n",
  "    years += data['response']['zone'][0]['facets']['facet']['term']\n",
  "df2 = pd.DataFrame(years)\n",
  "# Parse df2's own year labels. Copying df['display'] across would pair rows by position,\n",
  "# not by year, and silently mislabel the totals if the two results differ in order or length.\n",
  "df2['display'] = pd.to_datetime(df2['display'], format='%Y', errors='coerce')\n",
  "# After the merge, count_x is the influenza count and count_y is the total for the same year\n",
  "merged = pd.merge(df, df2, on='display')\n",
  "merged['count_x'] = pd.to_numeric(merged['count_x'], errors='coerce')\n",
  "merged['count_y'] = pd.to_numeric(merged['count_y'], errors='coerce')\n",
  "merged['proportion'] = merged['count_x'] / merged['count_y']\n",
  "merged.head()"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [
  "# Make a chart\n",
  "c4 = alt.Chart(merged).mark_line().encode(\n",
  "    x='year(display):T',\n",
  "    y='proportion:Q',\n",
  "    tooltip=[alt.Tooltip('year(display):T', format='%Y', title='Year'), alt.Tooltip('proportion:Q', title='Proportion')]\n",
  ")"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [
  "# Display the chart\n",
  "c4"
 ] },
 { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [
  "### Let's focus on 1918 to 1919, and look at *where* articles were published..."
] },
 { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [
  "# Set things up\n",
  "locations = pd.read_csv('data/trove-newspaper-titles-locations.csv', names=['title_id', 'title', 'state', 'place_id', 'place', 'lat', 'lon'])\n",
  "# Keep one row per title so the merge below cannot count a title's articles more than once\n",
  "locations = locations.drop_duplicates(subset=['title_id'], keep='first')\n",
  "start = 1918\n",
  "end = 1919\n",
  "params = {\n",
  "    'q': 'influenza',\n",
  "    'facet': 'title',\n",
  "    'l-category': 'Article',\n",
  "    'zone': 'newspaper',\n",
  "    'key': api_key,\n",
  "    'encoding': 'json',\n",
  "    'n': 0  # Only the facet counts are read below, so request no article records\n",
  "}"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [
  "# Get some data\n",
  "# Start from empty lists here so re-running this cell doesn't append a second copy of every month\n",
  "hm_series = []\n",
  "time_index = []\n",
  "for year in range(start, end+1):\n",
  "    # NOTE(review): Trove's API guide appears to require l-decade alongside l-year; confirm.\n",
  "    # Sending the matching decade does not narrow the results any further.\n",
  "    params['l-decade'] = year // 10\n",
  "    params['l-year'] = year\n",
  "    for month in trange(1, 13):\n",
  "        params['l-month'] = month\n",
  "        time_index.append('{}-{}'.format(year, month))\n",
  "        response = requests.get('http://api.trove.nla.gov.au/v2/result', params=params)\n",
  "        # Stop here with a clear HTTP error rather than a confusing KeyError below\n",
  "        response.raise_for_status()\n",
  "        data = response.json()\n",
  "        facets = data['response']['zone'][0]['facets']['facet']['term']\n",
  "        # One row per newspaper title: the facet's display value is used as the title id\n",
  "        title_counts = pd.DataFrame(facets)[['display', 'count']].rename(columns={'display': 'title_id', 'count': 'total'})\n",
  "        title_counts['total'] = pd.to_numeric(title_counts['total'], errors='coerce')\n",
  "        located = pd.merge(title_counts, locations, on='title_id', how='left')\n",
  "        # Sum only the article counts per place; the text columns are not needed\n",
  "        place_totals = located.groupby(['place', 'lat', 'lon'])['total'].sum()\n",
  "        hm_data = []\n",
  "        for (place, lat, lon), total in place_totals.items():\n",
  "            # Repeat the point once per article so places with more articles carry more weight\n",
  "            hm_data += [[lat, lon]] * int(total)\n",
  "        # One heatmap frame per month, in the same order as time_index\n",
  "        hm_series.append(hm_data)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [
  "# Make a map\n",
  "m1 = folium.Map(\n",
  "    location=[-30, 135],\n",
  "    zoom_start=4\n",
  ")\n",
  "\n",
  "# Add the heatmap data!\n",
  "HeatMapWithTime(\n",
  "    hm_series,\n",
  "    index=time_index,\n",
  "    radius=10,\n",
  "    auto_play=True\n",
  ").add_to(m1)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [
  "# Show the map\n",
  "m1"
 ] },
 { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [
  "## Digging deeper\n",
  "\n",
  "* Harvest datasets for further analysis\n",
  "* [Trove Newspaper Harvester](trove/Using-TroveHarvester-to-get-newspaper-articles-in-bulk.ipynb)\n",
  "* RecordSearch Series Harvester in [OzGLAM-Workbench](https://github.com/GLAM-Workbench/ozglam-workbench)\n"
 ] }
], "metadata": { "celltoolbar": "Slideshow", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" }, "livereveal": { "autolaunch": true } }, "nbformat": 4, "nbformat_minor": 2 }