{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create an index to the harvested files\n",
    "\n",
    "The XML files contain embedded metadata that includes the name of the prime minister, and the title and date of the transcript. This notebook extracts that metadata from the harvested files and creates a CSV formatted spreadsheet for easy analysis. It also demonstrates some ways of summarising and visualising the metadata."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataTransformerRegistry.enable('json')"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "from bs4 import BeautifulSoup\n",
    "import arrow\n",
    "import pandas as pd\n",
    "import altair as alt\n",
    "\n",
    "# Set up Altair\n",
    "#alt.renderers.enable('notebook')\n",
    "alt.renderers.enable('default')\n",
    "alt.data_transformers.enable('json')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract the metadata and save as a CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_tag(soup, tag):\n",
    "    '''\n",
    "    Given some Soup, find the first occurrence of the specified tag and\n",
    "    return its text with surrounding whitespace stripped.\n",
    "\n",
    "    Returns an empty string when the lookup raises AttributeError, i.e. when\n",
    "    find() yields None (no such tag) or the tag's .string is None.\n",
    "    '''\n",
    "    try:\n",
    "        value = soup.find(tag).string.strip()\n",
    "    except AttributeError:\n",
    "        value = ''\n",
    "    return value\n",
    "\n",
    "# Create a list to put the metadata in\n",
    "all_details = []\n",
    "\n",
    "# Get the file names of all the harvested files.\n",
    "# os.listdir() returns names in arbitrary order, so sort them to keep reruns reproducible.\n",
    "files = sorted(f for f in os.listdir('transcripts') if f.endswith('.xml'))\n",
    "\n",
    "# Loop through the harvested files\n",
    "for filename in files:\n",
    "\n",
    "    # Load the file contents into Soup. The parser is named explicitly so the results\n",
    "    # don't depend on which optional parsers happen to be installed.\n",
    "    with open(os.path.join('transcripts', filename), 'rb') as xml_file:\n",
    "        soup = BeautifulSoup(xml_file, 'html.parser')\n",
    "\n",
    "    # We're going to reformat the date into the ISO standard, so first get the value\n",
    "    release_date = get_tag(soup, 'release-date')\n",
    "    try:\n",
    "        # Then try to parse the date and reformat as ISO\n",
    "        iso_date = arrow.get(release_date, 'DD/MM/YYYY').format('YYYY-MM-DD')\n",
    "    except Exception:\n",
    "        # Missing or unparseable dates are left blank. Catching Exception (rather than\n",
    "        # using a bare except) means a kernel interrupt still stops the loop.\n",
    "        iso_date = ''\n",
    "\n",
    "    # Create a dict of this file's metadata (key order sets the column order)\n",
    "    details = {\n",
    "        'id': get_tag(soup, 'transcript-id'),\n",
    "        'title': get_tag(soup, 'title'),\n",
    "        'pm': get_tag(soup, 'prime-minister'),\n",
    "        'date': iso_date,\n",
    "        'release_type': get_tag(soup, 'release-type'),\n",
    "        'subjects': get_tag(soup, 'subjects'),\n",
    "        'pdf': get_tag(soup, 'document'),\n",
    "    }\n",
    "\n",
    "    # Add the metadata for this file to the list\n",
    "    all_details.append(details)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | date | \n", "id | \n", "pm | \n", "release_type | \n", "subjects | \n", "title | \n", "|
---|---|---|---|---|---|---|---|
0 | \n", "2014-08-24 | \n", "23765 | \n", "\n", " | Abbott, Tony | \n", "Media Release | \n", "\n", " | A message from the Prime Minister - Building a... | \n", "
1 | \n", "1968-03-14 | \n", "1797 | \n", "https://pmtranscripts.pmc.gov.au/sites/default... | \n", "Gorton, John | \n", "Statement in Parliament | \n", "\n", " | STATEMENT BY THE PRIME MINISTER THE RT.HON. JO... | \n", "
2 | \n", "2016-11-17 | \n", "40598 | \n", "\n", " | Turnbull, Malcolm | \n", "Transcript | \n", "\n", " | Press Conference at the launch of the Veterans... | \n", "
3 | \n", "1978-09-27 | \n", "4837 | \n", "https://pmtranscripts.pmc.gov.au/sites/default... | \n", "Fraser, Malcolm | \n", "Media Release | \n", "\n", " | GRANT TO WORLD WILDLIFE FUND AUSTRALIA | \n", "
4 | \n", "2004-03-24 | \n", "21172 | \n", "\n", " | Howard, John | \n", "Interview | \n", "\n", " | Doorstop Interview Great Hall, Parliament Hous... | \n", "