{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Convert a year's worth of Historic Hansard into a dataframe for analysis\n", "\n", "This notebook analyses Commonwealth Hansard XML files [from this GitHub repository](https://github.com/wragge/hansard-xml). Give it a `year` (between 1901 and 1980), and a `house` (either 'hofreps' or 'senate'), and it will download all the proceedings of that year and house, extract some basic data about debates and speeches, and provide the results as a dataframe for exploration." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import requests\n", "import requests_cache\n", "from bs4 import BeautifulSoup\n", "from requests.adapters import HTTPAdapter\n", "from requests.packages.urllib3.util.retry import Retry\n", "from tqdm.auto import tqdm\n", "import arrow\n", "import pandas as pd\n", "import altair as alt\n", "\n", "s = requests_cache.CachedSession()\n", "retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])\n", "s.mount('https://', HTTPAdapter(max_retries=retries))\n", "s.mount('http://', HTTPAdapter(max_retries=retries))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that the GitHub API only allows 60 unauthorised requests per hour. So it's a good idea to cache things. Note that requests to download files aren't included in the API tally. If you need more requests you'll need to use authentication." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "API_URL = 'https://api.github.com/repos/wragge/hansard-xml/contents'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Set the year and house you're interested in." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "year = '1901' # 1901 to 1980\n", "house = 'hofreps' # hofreps or senate" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def count_words(para):\n", " '''\n", " Count the number of words in an element.\n", " '''\n", " words = 0\n", " for string in para.stripped_strings:\n", " words += len(string.split())\n", " return words\n", "\n", "def get_paras(section):\n", " '''\n", " Find all the para type containers in an element and count the total number of words.\n", " '''\n", " words = 0\n", " for para in section.find_all(['para', 'quote', 'list'], recursive=False):\n", " words += count_words(para)\n", " return words\n", "\n", "def get_words_in_speech(start, speech):\n", " '''\n", " Get the top-level containers in a speech and find the total number of words across them all.\n", " '''\n", " words = 0\n", " words += get_paras(start)\n", " words += get_paras(speech)\n", " for cont in speech.find_all('continue', recursive=False):\n", " cont_start = cont.find('talk.start', recursive=False)\n", " words += get_paras(cont_start)\n", " words += get_paras(cont)\n", " return words\n", " \n", "def get_interjections(speech):\n", " '''\n", " Get details of any interjections within a speech.\n", " '''\n", " speeches = []\n", " for index, intj in enumerate(speech.find_all('interjection', recursive=False)):\n", " start = intj.find('talk.start', recursive=False)\n", " speaker = start.find('talker')\n", " name = speaker.find('name', role='metadata').string\n", " id = speaker.find('name.id').string\n", " words = get_words_in_speech(start, intj)\n", " speeches.append({'interjection_idx': index, 'speaker': name, 'id': id, 'type': intj.name, 'words': words})\n", " return 
speeches \n", "\n", "def get_speeches(debate):\n", " '''\n", " Get details of any speeches in a debate (or subdebate)\n", " '''\n", " speeches = []\n", " for index, speech in enumerate(debate.find_all(['speech', 'question', 'answer'], recursive=False)):\n", " start = speech.find('talk.start', recursive=False)\n", " speaker = start.find('talker')\n", " name = speaker.find('name', role='metadata').string\n", " id = speaker.find('name.id').string\n", " words = get_words_in_speech(start, speech)\n", " speeches.append({'speech_idx': index, 'speaker': name, 'id': id, 'type': speech.name, 'words': words})\n", " # Interjections are within a speech\n", " interjections = get_interjections(speech)\n", " # Tag interjections with the speech index\n", " for intj in interjections:\n", " intj['speech_idx'] = index\n", " speeches.append(intj)\n", " return speeches\n", "\n", "def get_subdebates(debate):\n", " '''\n", " Get details of any subdebates within a debate.\n", " '''\n", " speeches = []\n", " for index, sub in enumerate(debate.find_all('subdebate.1', recursive=False)):\n", " subdebate_info = {'subdebate_title': sub.subdebateinfo.title.string, 'subdebate_idx': index}\n", " new_speeches = get_speeches(sub)\n", " # Add the subdebate info to the speech\n", " for sp in new_speeches:\n", " sp.update(subdebate_info)\n", " speeches += new_speeches\n", " return speeches\n", "\n", "def get_debates(soup):\n", " '''\n", " Get details of all the debates in day's proceedings.\n", " '''\n", " speeches = []\n", " date = soup.find('session.header').date.string\n", " for index, debate in enumerate(soup.find_all('debate')):\n", " debate_info = {\n", " 'date': date,\n", " 'debate_title': debate.debateinfo.title.string,\n", " 'debate_type': debate.debateinfo.type.string,\n", " 'debate_idx': index\n", " }\n", " new_speeches = get_subdebates(debate)\n", " new_speeches += get_speeches(debate)\n", " # Add the debate info to the speech\n", " for sp in new_speeches:\n", " sp.update(debate_info)\n", " speeches += new_speeches\n", " return speeches\n", "\n", "def summarise_year(year, house):\n", " '''\n", " Get each day's proceedings for the supplied year/house and extract information about debates and speeches.\n", " '''\n", " speeches = []\n", " response = s.get(f'{API_URL}/{house}/{year}')\n", " data = response.json()\n", " files = [f for f in data if f['type'] == 'file']\n", " for f in tqdm(files):\n", " response = s.get(f['download_url'])\n", " soup = BeautifulSoup(response.text)\n", " speeches += get_debates(soup)\n", " df = pd.DataFrame(speeches)\n", " return df" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a6e6f9bc5ece4d369387947ea4e06b1b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=113.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "df = summarise_year(year=year, house=house)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
speech_idxspeakeridtypewordssubdebate_titlesubdebate_idxdatedebate_titledebate_typedebate_idxinterjection_idx
00MACDONALD-PATERSON, ThomasKIQspeech318HIS EXCELLENCY THE GOVER0.01901-05-09QUESTIONQuestions9NaN
11BRADDON, EdwardJRRspeech178HIS EXCELLENCY THE GOVER0.01901-05-09QUESTIONQuestions9NaN
22SMITH, ArthurKTTspeech693HIS EXCELLENCY THE GOVER0.01901-05-09QUESTIONQuestions9NaN
32CHAPMAN, AustinJX7interjection9HIS EXCELLENCY THE GOVER0.01901-05-09QUESTIONQuestions90.0
43CAMERON, Donald NormanJUJspeech98HIS EXCELLENCY THE GOVER0.01901-05-09QUESTIONQuestions9NaN
\n", "
" ], "text/plain": [ " speech_idx speaker id type words \\\n", "0 0 MACDONALD-PATERSON, Thomas KIQ speech 318 \n", "1 1 BRADDON, Edward JRR speech 178 \n", "2 2 SMITH, Arthur KTT speech 693 \n", "3 2 CHAPMAN, Austin JX7 interjection 9 \n", "4 3 CAMERON, Donald Norman JUJ speech 98 \n", "\n", " subdebate_title subdebate_idx date debate_title \\\n", "0 HIS EXCELLENCY THE GOVER 0.0 1901-05-09 QUESTION \n", "1 HIS EXCELLENCY THE GOVER 0.0 1901-05-09 QUESTION \n", "2 HIS EXCELLENCY THE GOVER 0.0 1901-05-09 QUESTION \n", "3 HIS EXCELLENCY THE GOVER 0.0 1901-05-09 QUESTION \n", "4 HIS EXCELLENCY THE GOVER 0.0 1901-05-09 QUESTION \n", "\n", " debate_type debate_idx interjection_idx \n", "0 Questions 9 NaN \n", "1 Questions 9 NaN \n", "2 Questions 9 NaN \n", "3 Questions 9 0.0 \n", "4 Questions 9 NaN " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Who made the most speeches?" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "BARTON, Edmund 439\n", "KINGSTON, Charles 303\n", "MCMILLAN, William 215\n", "DEAKIN, Alfred 204\n", "CONROY, Alfred 180\n", "PIESSE, Frederick 166\n", "THOMSON, Dugald 153\n", "WATSON, John Christian 150\n", "REID, George 146\n", "ISAACS, Isaac 146\n", "GLYNN, Patrick 140\n", "SPEAKER, Mr 140\n", "CROUCH, Richard 136\n", "O'MALLEY, King 119\n", "MCCAY, James 118\n", "MCEACHARN, Malcolm 115\n", "MAUGER, Samuel 109\n", "LYNE, William 108\n", "POYNTON, Alexander 108\n", "TURNER, George 107\n", "Name: speaker, dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[df['type'] == 'speech']['speaker'].value_counts()[:20]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Who made the most interjections?" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "KINGSTON, Charles 1257\n", "DEAKIN, Alfred 1097\n", "BARTON, Edmund 1001\n", "TURNER, George 906\n", "REID, George 801\n", "MCMILLAN, William 775\n", "MAUGER, Samuel 604\n", "LYNE, William 551\n", "WATSON, John Christian 550\n", "COOK, Joseph 536\n", "HIGGINS, Henry 535\n", "ISAACS, Isaac 482\n", "MCEACHARN, Malcolm 429\n", "THOMSON, Dugald 391\n", "CONROY, Alfred 355\n", "MCCAY, James 355\n", "FORREST, John 332\n", "SOLOMON, Vaiben 321\n", "POYNTON, Alexander 300\n", "MCDONALD, Charles 284\n", "Name: speaker, dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[df['type'] == 'interjection']['speaker'].value_counts()[:20]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Who spoke the most words?" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
speakerwords
2BARTON, Edmund201547
65REID, George140732
55MCMILLAN, William138382
41KINGSTON, Charles132851
74SPEAKER, Mr128840
78THOMSON, Dugald112445
18DEAKIN, Alfred104408
82WATSON, John Christian99848
49MCCAY, James98219
12CONROY, Alfred97755
80TURNER, George94780
21EDWARDS, George93070
39ISAACS, Isaac91439
35HIGGINS, Henry90842
64QUICK, John88777
14COOK, Joseph88317
62PIESSE, Frederick86988
73SOLOMON, Vaiben86977
29GLYNN, Patrick83018
84WILKS, William81424
\n", "
" ], "text/plain": [ " speaker words\n", "2 BARTON, Edmund 201547\n", "65 REID, George 140732\n", "55 MCMILLAN, William 138382\n", "41 KINGSTON, Charles 132851\n", "74 SPEAKER, Mr 128840\n", "78 THOMSON, Dugald 112445\n", "18 DEAKIN, Alfred 104408\n", "82 WATSON, John Christian 99848\n", "49 MCCAY, James 98219\n", "12 CONROY, Alfred 97755\n", "80 TURNER, George 94780\n", "21 EDWARDS, George 93070\n", "39 ISAACS, Isaac 91439\n", "35 HIGGINS, Henry 90842\n", "64 QUICK, John 88777\n", "14 COOK, Joseph 88317\n", "62 PIESSE, Frederick 86988\n", "73 SOLOMON, Vaiben 86977\n", "29 GLYNN, Patrick 83018\n", "84 WILKS, William 81424" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(by='speaker')['words'].sum().to_frame().reset_index().sort_values('words', ascending=False)[:20]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Which debates generated the most words?\n", "\n", "Note that there's variation in the way debate titles were recorded, and in the OCR results, so this sort of grouping isn't always going to work. To get something more accurate, you'd have to do some normalisation of debate titles first." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
debate_titlewords
111QUESTION1084980
74MOTION OF CENSURE488836
96POST AND TELEGRAPH BILL334188
31CUSTOMS BILL303111
58IMMIGRATION RESTRICTION BILL301900
109PUBLIC SERVICE BILL260357
135TARIFF174766
35DEFENCE BILL136801
129SUPPLY BILL92487
81PACIFIC ISLANDS LABOURERS BILL86791
24COMMONWEALTH PUBLIC SERVICE BILL86225
38DISTILLATION BILL83656
5ADJOURNMENT79587
18BRITISH NEW GUINEA64175
60INTER-STATE COMMISSION BILL50156
55GOVERNOR-GENERAL'S SPEECH47909
125STATEMENT SHOWING THE AMOUNTS RECEIVE!) BV WHI...44751
19BUDGET43095
105PROPERTY FOR PUBLIC PURPOSES ACQUISITION BILL34402
0ACTS INTERPRETATION BILL33693
\n", "
" ], "text/plain": [ " debate_title words\n", "111 QUESTION 1084980\n", "74 MOTION OF CENSURE 488836\n", "96 POST AND TELEGRAPH BILL 334188\n", "31 CUSTOMS BILL 303111\n", "58 IMMIGRATION RESTRICTION BILL 301900\n", "109 PUBLIC SERVICE BILL 260357\n", "135 TARIFF 174766\n", "35 DEFENCE BILL 136801\n", "129 SUPPLY BILL 92487\n", "81 PACIFIC ISLANDS LABOURERS BILL 86791\n", "24 COMMONWEALTH PUBLIC SERVICE BILL 86225\n", "38 DISTILLATION BILL 83656\n", "5 ADJOURNMENT 79587\n", "18 BRITISH NEW GUINEA 64175\n", "60 INTER-STATE COMMISSION BILL 50156\n", "55 GOVERNOR-GENERAL'S SPEECH 47909\n", "125 STATEMENT SHOWING THE AMOUNTS RECEIVE!) BV WHI... 44751\n", "19 BUDGET 43095\n", "105 PROPERTY FOR PUBLIC PURPOSES ACQUISITION BILL 34402\n", "0 ACTS INTERPRETATION BILL 33693" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(by=['debate_title'])['words'].sum().to_frame().reset_index().sort_values('words', ascending=False)[:20]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## How many words were spoken each day of proceedings?\n", "\n", "I've only included words in speeches with identified speakers (including interjections), so some procedural content might not be included in the totals." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "words_per_day = df.groupby(by=['date'])['words'].sum().to_frame().reset_index()\n", "alt.Chart(words_per_day).mark_bar(size=2).encode(\n", " x='date:T',\n", " y='words:Q',\n", " tooltip=['date:T', 'words:Q']\n", ").properties(width=700)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Most popular topics of questions" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "TARIFF 1161\n", "THE TARIFF 887\n", "THE GOVERNOR-GENERAL'S SPEECH 415\n", "MOTION OFCENSURE 347\n", "G OVERNOR - GENERAL'S SPEECH 277\n", "WEAVERS' PRICES AT THE ANTIPODES 259\n", "SUPPLY 232\n", "GOVERNOR-GENERAL'S SPEECH 219\n", "EDMUND BARTON 193\n", "WESTERN AUSTRALIAN MONEYORDER OFFICERS 167\n", "EMOLUMENTS OF MINISTERS 166\n", "JOHN JOSEPH EA STICK 125\n", "THEGOVERNOR-GENERAL'S SPEECH 64\n", "OLD-AGE PENSIONS 63\n", "WAYS AND MEANS 59\n", "RATE OF WAGE : HOURS OF LABOUR 41\n", "FEDERAL CAPITAL SITE 33\n", "THIRD SCHEDULE 32\n", "ADDITIONAL SITTING DAY 27\n", "DEPARTMENT OF AGRICULTURE 24\n", "Name: subdebate_title, dtype: int64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[(df['debate_type'] == 'Questions') | (df['debate_title'] == 'QUESTION') | (df['type'] == 'question')]['subdebate_title'].value_counts()[:20]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----\n", "\n", "Created by [Tim Sherratt](https://timsherratt.org) for the [GLAM Workbench](https://glam-workbench.github.io/)." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }