{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Aggregate transcripts\n", "\n", "Depending on how you want to analyse them, it can be useful to group the transcripts by prime minister.\n", "\n", "This notebook aggregates the transcripts in two ways: by extracting the text content of each XML file and combining them into one big text file, and by zipping up the original XML files." ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "from operator import itemgetter, attrgetter\n", "from bs4 import BeautifulSoup\n", "import os\n", "import re\n", "import pandas as pd\n", "import zipfile\n", "\n", "INDEX = \"index.csv\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Combine into one big text file for each PM" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "def combine_pm(pm, release_type=None):\n", " '''\n", " Extract text from the XML files for the specified PM and combine into one big text file.\n", " Can be filtered by 'release_type'.\n", " '''\n", " os.makedirs('pms', exist_ok=True)\n", " transcripts = []\n", " df = pd.read_csv(INDEX, keep_default_na=False)\n", " if release_type:\n", " rows = df.loc[(df['pm'] == pm) & (df['release_type'] == release_type)]\n", " else:\n", " rows = df.loc[(df['pm'] == pm)]\n", " transcript_ids = rows.sort_values(by='date')['id'].to_list()\n", " filename = pm.lower().replace(', ', '-')\n", " if release_type:\n", " filename = '{}-{}'.format(filename, release_type.lower())\n", " with open(os.path.join('pms', filename + '.txt'), 'w') as pm_file:\n", " for t_id in transcript_ids:\n", " with open(os.path.join('transcripts', 'transcript-{}.xml'.format(t_id)), 'rb') as xml_file:\n", " soup = BeautifulSoup(xml_file, 'xml')\n", " content = soup.find('content').string.replace('', '')\n", " clean_content = re.sub('<[^<]+?>', '', content)\n", " pm_file.write(clean_content + '\\n\\n')\n", "\n", "def combine_all_pms(type=None):\n", " df = pd.read_csv(INDEX, keep_default_na=False)\n", " pms = [pm for pm in pd.unique(df['pm']) if pm != '']\n", " for pm in pms:\n", " combine_pm(pm, type)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "combine_all_pms()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# Just get the speeches\n", "combine_all_pms('Speech')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Zip up the transcripts for each PM" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "def zip_pm(pm):\n", " os.makedirs('pms', exist_ok=True)\n", " filename = os.path.join('pms', '{}.zip'.format(pm.lower().replace(', ', '-')))\n", " transcript_ids = df.loc[(df['pm'] == pm)]['id'].to_list()\n", " zf = zipfile.ZipFile(filename, 'w', zipfile.ZIP_DEFLATED)\n", " for t_id in transcript_ids:\n", " t_file = 'transcript-{}.xml'.format(t_id)\n", " t_path = os.path.join('transcripts', t_file)\n", " zf.write(t_path, t_file)\n", " zf.close()\n", " \n", "def zip_all_pms():\n", " df = pd.read_csv(INDEX, keep_default_na=False)\n", " pms = [pm for pm in pd.unique(df['pm']) if pm != '']\n", " for pm in pms:\n", " zip_pm(pm)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/Cellar/python/3.7.3/Frameworks/Python.framework/Versions/3.7/lib/python3.7/zipfile.py:1470: UserWarning: Duplicate name: 'transcript-31836.xml'\n", " return self._open_to_write(zinfo, force_zip64=force_zip64)\n" ] } ], "source": [ "zip_all_pms()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }