{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Convert a year's worth of Historic Hansard into a dataframe for analysis\n", "\n", "This notebook analyses Commonwealth Hansard XML files [from this GitHub repository](https://github.com/wragge/hansard-xml). Give it a `year` (between 1901 and 1980), and a `house` (either 'hofreps' or 'senate'), and it will download all the proceedings of that year and house, extract some basic data about debates and speeches, and provide the results as a dataframe for exploration." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import requests\n", "import requests_cache\n", "from bs4 import BeautifulSoup\n", "from requests.adapters import HTTPAdapter\n", "from requests.packages.urllib3.util.retry import Retry\n", "from tqdm.auto import tqdm\n", "import arrow\n", "import pandas as pd\n", "import altair as alt\n", "\n", "s = requests_cache.CachedSession()\n", "retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])\n", "s.mount('https://', HTTPAdapter(max_retries=retries))\n", "s.mount('http://', HTTPAdapter(max_retries=retries))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that the GitHub API only allows 60 unauthorised requests per hour. So it's a good idea to cache things. Note that requests to download files aren't included in the API tally. If you need more requests you'll need to use authentication." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "API_URL = 'https://api.github.com/repos/wragge/hansard-xml/contents'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Set the year and house you're interested in." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "year = '1901' # 1901 to 1980\n", "house = 'hofreps' # hofreps or senate" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def count_words(para):\n", " '''\n", " Count the number of words in an element.\n", " '''\n", " words = 0\n", " for string in para.stripped_strings:\n", " words += len(string.split())\n", " return words\n", "\n", "def get_paras(section):\n", " '''\n", " Find all the para type containers in an element and count the total number of words.\n", " '''\n", " words = 0\n", " for para in section.find_all(['para', 'quote', 'list'], recursive=False):\n", " words += count_words(para)\n", " return words\n", "\n", "def get_words_in_speech(start, speech):\n", " '''\n", " Get the top-level containers in a speech and find the total number of words across them all.\n", " '''\n", " words = 0\n", " words += get_paras(start)\n", " words += get_paras(speech)\n", " for cont in speech.find_all('continue', recursive=False):\n", " cont_start = cont.find('talk.start', recursive=False)\n", " words += get_paras(cont_start)\n", " words += get_paras(cont)\n", " return words\n", " \n", "def get_interjections(speech):\n", " '''\n", " Get details of any interjections within a speech.\n", " '''\n", " speeches = []\n", " for index, intj in enumerate(speech.find_all('interjection', recursive=False)):\n", " start = intj.find('talk.start', recursive=False)\n", " speaker = start.find('talker')\n", " name = speaker.find('name', role='metadata').string\n", " id = speaker.find('name.id').string\n", " words = get_words_in_speech(start, intj)\n", " speeches.append({'interjection_idx': index, 'speaker': name, 'id': id, 'type': intj.name, 'words': words})\n", " return 
speeches \n", "\n", "def get_speeches(debate):\n", " '''\n", " Get details of any speeches in a debate (or subdebate)\n", " '''\n", " speeches = []\n", " for index, speech in enumerate(debate.find_all(['speech', 'question', 'answer'], recursive=False)):\n", " start = speech.find('talk.start', recursive=False)\n", " speaker = start.find('talker')\n", " name = speaker.find('name', role='metadata').string\n", " id = speaker.find('name.id').string\n", " words = get_words_in_speech(start, speech)\n", " speeches.append({'speech_idx': index, 'speaker': name, 'id': id, 'type': speech.name, 'words': words})\n", " # Interjections are within a speech\n", " interjections = get_interjections(speech)\n", " # Tag interjections with the speech index\n", " for intj in interjections:\n", " intj['speech_idx'] = index\n", " speeches.append(intj)\n", " return speeches\n", "\n", "def get_subdebates(debate):\n", " '''\n", " Get details of any subdebates within a debate.\n", " '''\n", " speeches = []\n", " for index, sub in enumerate(debate.find_all('subdebate.1', recursive=False)):\n", " subdebate_info = {'subdebate_title': sub.subdebateinfo.title.string, 'subdebate_idx': index}\n", " new_speeches = get_speeches(sub)\n", " # Add the subdebate info to the speech\n", " for sp in new_speeches:\n", " sp.update(subdebate_info)\n", " speeches += new_speeches\n", " return speeches\n", "\n", "def get_debates(soup):\n", " '''\n", " Get details of all the debates in day's proceedings.\n", " '''\n", " speeches = []\n", " date = soup.find('session.header').date.string\n", " for index, debate in enumerate(soup.find_all('debate')):\n", " debate_info = {\n", " 'date': date,\n", " 'debate_title': debate.debateinfo.title.string,\n", " 'debate_type': debate.debateinfo.type.string,\n", " 'debate_idx': index\n", " }\n", " new_speeches = get_subdebates(debate)\n", " new_speeches += get_speeches(debate)\n", " # Add the debate info to the speech\n", " for sp in new_speeches:\n", " sp.update(debate_info)\n", " speeches += new_speeches\n", " return speeches\n", "\n", "def summarise_year(year, house):\n", " '''\n", " Get each day's proceedings for the supplied year/house and extract information about debates and speeches.\n", " '''\n", " speeches = []\n", " response = s.get(f'{API_URL}/{house}/{year}')\n", " data = response.json()\n", " files = [f for f in data if f['type'] == 'file']\n", " for f in tqdm(files):\n", " response = s.get(f['download_url'])\n", " soup = BeautifulSoup(response.text)\n", " speeches += get_debates(soup)\n", " df = pd.DataFrame(speeches)\n", " return df" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a6e6f9bc5ece4d369387947ea4e06b1b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(FloatProgress(value=0.0, max=113.0), HTML(value='')))" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "df = summarise_year(year=year, house=house)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
speech_idxspeakeridtypewordssubdebate_titlesubdebate_idxdatedebate_titledebate_typedebate_idxinterjection_idx
00MACDONALD-PATERSON, ThomasKIQspeech318HIS EXCELLENCY THE GOVER0.01901-05-09QUESTIONQuestions9NaN
11BRADDON, EdwardJRRspeech178HIS EXCELLENCY THE GOVER0.01901-05-09QUESTIONQuestions9NaN
22SMITH, ArthurKTTspeech693HIS EXCELLENCY THE GOVER0.01901-05-09QUESTIONQuestions9NaN
32CHAPMAN, AustinJX7interjection9HIS EXCELLENCY THE GOVER0.01901-05-09QUESTIONQuestions90.0
43CAMERON, Donald NormanJUJspeech98HIS EXCELLENCY THE GOVER0.01901-05-09QUESTIONQuestions9NaN
\n", "
" ], "text/plain": [ " speech_idx speaker id type words \\\n", "0 0 MACDONALD-PATERSON, Thomas KIQ speech 318 \n", "1 1 BRADDON, Edward JRR speech 178 \n", "2 2 SMITH, Arthur KTT speech 693 \n", "3 2 CHAPMAN, Austin JX7 interjection 9 \n", "4 3 CAMERON, Donald Norman JUJ speech 98 \n", "\n", " subdebate_title subdebate_idx date debate_title \\\n", "0 HIS EXCELLENCY THE GOVER 0.0 1901-05-09 QUESTION \n", "1 HIS EXCELLENCY THE GOVER 0.0 1901-05-09 QUESTION \n", "2 HIS EXCELLENCY THE GOVER 0.0 1901-05-09 QUESTION \n", "3 HIS EXCELLENCY THE GOVER 0.0 1901-05-09 QUESTION \n", "4 HIS EXCELLENCY THE GOVER 0.0 1901-05-09 QUESTION \n", "\n", " debate_type debate_idx interjection_idx \n", "0 Questions 9 NaN \n", "1 Questions 9 NaN \n", "2 Questions 9 NaN \n", "3 Questions 9 0.0 \n", "4 Questions 9 NaN " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Who made the most speeches?" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "BARTON, Edmund 439\n", "KINGSTON, Charles 303\n", "MCMILLAN, William 215\n", "DEAKIN, Alfred 204\n", "CONROY, Alfred 180\n", "PIESSE, Frederick 166\n", "THOMSON, Dugald 153\n", "WATSON, John Christian 150\n", "REID, George 146\n", "ISAACS, Isaac 146\n", "GLYNN, Patrick 140\n", "SPEAKER, Mr 140\n", "CROUCH, Richard 136\n", "O'MALLEY, King 119\n", "MCCAY, James 118\n", "MCEACHARN, Malcolm 115\n", "MAUGER, Samuel 109\n", "LYNE, William 108\n", "POYNTON, Alexander 108\n", "TURNER, George 107\n", "Name: speaker, dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[df['type'] == 'speech']['speaker'].value_counts()[:20]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Who made the most interjections?" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "KINGSTON, Charles 1257\n", "DEAKIN, Alfred 1097\n", "BARTON, Edmund 1001\n", "TURNER, George 906\n", "REID, George 801\n", "MCMILLAN, William 775\n", "MAUGER, Samuel 604\n", "LYNE, William 551\n", "WATSON, John Christian 550\n", "COOK, Joseph 536\n", "HIGGINS, Henry 535\n", "ISAACS, Isaac 482\n", "MCEACHARN, Malcolm 429\n", "THOMSON, Dugald 391\n", "CONROY, Alfred 355\n", "MCCAY, James 355\n", "FORREST, John 332\n", "SOLOMON, Vaiben 321\n", "POYNTON, Alexander 300\n", "MCDONALD, Charles 284\n", "Name: speaker, dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[df['type'] == 'interjection']['speaker'].value_counts()[:20]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Who spoke the most words?" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
speakerwords
2BARTON, Edmund201547
65REID, George140732
55MCMILLAN, William138382
41KINGSTON, Charles132851
74SPEAKER, Mr128840
78THOMSON, Dugald112445
18DEAKIN, Alfred104408
82WATSON, John Christian99848
49MCCAY, James98219
12CONROY, Alfred97755
80TURNER, George94780
21EDWARDS, George93070
39ISAACS, Isaac91439
35HIGGINS, Henry90842
64QUICK, John88777
14COOK, Joseph88317
62PIESSE, Frederick86988
73SOLOMON, Vaiben86977
29GLYNN, Patrick83018
84WILKS, William81424
\n", "
" ], "text/plain": [ " speaker words\n", "2 BARTON, Edmund 201547\n", "65 REID, George 140732\n", "55 MCMILLAN, William 138382\n", "41 KINGSTON, Charles 132851\n", "74 SPEAKER, Mr 128840\n", "78 THOMSON, Dugald 112445\n", "18 DEAKIN, Alfred 104408\n", "82 WATSON, John Christian 99848\n", "49 MCCAY, James 98219\n", "12 CONROY, Alfred 97755\n", "80 TURNER, George 94780\n", "21 EDWARDS, George 93070\n", "39 ISAACS, Isaac 91439\n", "35 HIGGINS, Henry 90842\n", "64 QUICK, John 88777\n", "14 COOK, Joseph 88317\n", "62 PIESSE, Frederick 86988\n", "73 SOLOMON, Vaiben 86977\n", "29 GLYNN, Patrick 83018\n", "84 WILKS, William 81424" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(by='speaker')['words'].sum().to_frame().reset_index().sort_values('words', ascending=False)[:20]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Which debates generated the most words?\n", "\n", "Note that there's variation in the way debate titles were recorded, and in the OCR results, so this sort of grouping isn't always going to work. To get something more accurate, you'd have to do some normalisation of debate titles first." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
debate_titlewords
111QUESTION1084980
74MOTION OF CENSURE488836
96POST AND TELEGRAPH BILL334188
31CUSTOMS BILL303111
58IMMIGRATION RESTRICTION BILL301900
109PUBLIC SERVICE BILL260357
135TARIFF174766
35DEFENCE BILL136801
129SUPPLY BILL92487
81PACIFIC ISLANDS LABOURERS BILL86791
24COMMONWEALTH PUBLIC SERVICE BILL86225
38DISTILLATION BILL83656
5ADJOURNMENT79587
18BRITISH NEW GUINEA64175
60INTER-STATE COMMISSION BILL50156
55GOVERNOR-GENERAL'S SPEECH47909
125STATEMENT SHOWING THE AMOUNTS RECEIVE!) BV WHI...44751
19BUDGET43095
105PROPERTY FOR PUBLIC PURPOSES ACQUISITION BILL34402
0ACTS INTERPRETATION BILL33693
\n", "
" ], "text/plain": [ " debate_title words\n", "111 QUESTION 1084980\n", "74 MOTION OF CENSURE 488836\n", "96 POST AND TELEGRAPH BILL 334188\n", "31 CUSTOMS BILL 303111\n", "58 IMMIGRATION RESTRICTION BILL 301900\n", "109 PUBLIC SERVICE BILL 260357\n", "135 TARIFF 174766\n", "35 DEFENCE BILL 136801\n", "129 SUPPLY BILL 92487\n", "81 PACIFIC ISLANDS LABOURERS BILL 86791\n", "24 COMMONWEALTH PUBLIC SERVICE BILL 86225\n", "38 DISTILLATION BILL 83656\n", "5 ADJOURNMENT 79587\n", "18 BRITISH NEW GUINEA 64175\n", "60 INTER-STATE COMMISSION BILL 50156\n", "55 GOVERNOR-GENERAL'S SPEECH 47909\n", "125 STATEMENT SHOWING THE AMOUNTS RECEIVE!) BV WHI... 44751\n", "19 BUDGET 43095\n", "105 PROPERTY FOR PUBLIC PURPOSES ACQUISITION BILL 34402\n", "0 ACTS INTERPRETATION BILL 33693" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(by=['debate_title'])['words'].sum().to_frame().reset_index().sort_values('words', ascending=False)[:20]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## How many words were spoken each day of proceedings?\n", "\n", "I've only included words in speeches with identified speakers (including interjections), so some procedural content might not be included in the totals." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "words_per_day = df.groupby(by=['date'])['words'].sum().to_frame().reset_index()\n", "alt.Chart(words_per_day).mark_bar(size=2).encode(\n", " x='date:T',\n", " y='words:Q',\n", " tooltip=['date:T', 'words:Q']\n", ").properties(width=700)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Most popular topics of questions" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "TARIFF 1161\n", "THE TARIFF 887\n", "THE GOVERNOR-GENERAL'S SPEECH 415\n", "MOTION OFCENSURE 347\n", "G OVERNOR - GENERAL'S SPEECH 277\n", "WEAVERS' PRICES AT THE ANTIPODES 259\n", "SUPPLY 232\n", "GOVERNOR-GENERAL'S SPEECH 219\n", "EDMUND BARTON 193\n", "WESTERN AUSTRALIAN MONEYORDER OFFICERS 167\n", "EMOLUMENTS OF MINISTERS 166\n", "JOHN JOSEPH EA STICK 125\n", "THEGOVERNOR-GENERAL'S SPEECH 64\n", "OLD-AGE PENSIONS 63\n", "WAYS AND MEANS 59\n", "RATE OF WAGE : HOURS OF LABOUR 41\n", "FEDERAL CAPITAL SITE 33\n", "THIRD SCHEDULE 32\n", "ADDITIONAL SITTING DAY 27\n", "DEPARTMENT OF AGRICULTURE 24\n", "Name: subdebate_title, dtype: int64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[(df['debate_type'] == 'Questions') | (df['debate_title'] == 'QUESTION') | (df['type'] == 'question')]['subdebate_title'].value_counts()[:20]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----\n", "\n", "Created by [Tim Sherratt](https://timsherratt.org) for the [GLAM Workbench](https://glam-workbench.github.io/)." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }