{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Summarise series data\n", "\n", "In this notebook we'll summarise data from all the harvested series." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/vnd.plotly.v1+html": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import os\n", "import pandas as pd\n", "from IPython.core.display import display, HTML\n", "import series_details" ] }, { "cell_type": "code", "execution_count": 152, "metadata": {}, "outputs": [], "source": [ "# This is a list of all the series harvested as part of this repository\n", "series_list = ['B13', 'B6003', 'BP343/15', 'D2860', 'D5036', 'D596', 'E752', 'J2481', 'J2482', 'J2483', 'J3115', 'K1145', 'P437', 'P526', 'PP4/2', 'PP6/1', 'SP11/26', 'SP11/6', 'SP115/1', 'SP115/10', 'SP42/1', 'SP726/1', 'ST84/1']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Aggregate all the series data into a single dataframe\n", "\n", "Let's combine summaries of all the harvested series into a single dataframe so we can look at the big picture." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a list to store the summaries\n", "summaries = []\n", "\n", "# Loop through the list of series in this repo\n", "for series in series_list:\n", " # Open the CSV of each series harvest as a data frame\n", " df = pd.read_csv(os.path.join('data', '{}.csv'.format(series.replace('/', '-'))), parse_dates=['start_date', 'end_date'])\n", " # Extract a summary of each series and add it to the list of summaries\n", " summaries.append(series_details.make_summary(series, df, include_titles=False))\n", " \n", "# Convert the list of summaries into a DataFrame for easy manipulation\n", "df = pd.DataFrame(summaries)\n", "\n", "# Flatten the access count dictionaries and fill blanks with zero\n", "df = pd.concat([df, pd.DataFrame((d for idx, d in df['access_counts'].iteritems()))], axis=1).fillna(0)\n", "\n", "# Change access counts from floats to integers\n", "df[['Closed', 'Not yet examined', 'Open with exception', 'Open']] = df[['Closed', 'Not yet examined', 'Open with exception', 'Open']].astype(int)\n", "\n", "# Delete the old 'access_counts' column\n", "del df['access_counts']\n", "\n", "# For convenience acronymise 'Not yet examined' and 'Open with exception'\n", "df.rename({'Not yet examined': 'NYE', 'Open with exception': 'OWE'}, axis=1, inplace=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Display a summary table of all series\n", "\n", "Let's display a summary of each series in a nicely formatted table." ] }, { "cell_type": "code", "execution_count": 153, "metadata": {}, "outputs": [ { "data": { "text/html": [ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
seriestotal_itemsdate_fromdate_toOpenOWENYECloseddigitised_filesdigitised_pages% open% digitised
0B1320,1941800200519,786840003545,04397.98%1.75%
1B6003319041959300000100.00%0.00%
2BP343/152,571191619552,5660508517699.81%3.31%
3D28601190219570100000.00%0.00%
4D5036119061935100000100.00%0.00%
5D59611,395187119712,983318,38101853,03126.18%1.62%
6E752722190519417190307179,31099.58%99.31%
7J2481858189719038580008582,031100.00%100.00%
8J2482799190219127990007983,153100.00%99.87%
9J248314,4381903195614,43602014,43679,21099.99%99.99%
10J3115161189919281610001611,344100.00%100.00%
11K11454,816190019554,791025017587499.48%3.63%
12P4374,958190119404,94510211844299.74%0.36%
13P52621909191810100050.00%0.00%
14PP4/261319031947610030281,51299.51%4.57%
15PP6/16,010190619781,863334,10952456,46131.00%4.08%
16SP11/26271902190227000584100.00%18.52%
17SP11/6191190219471010900132352.88%0.52%
18SP115/11,787188419431,7870009285100.00%0.50%
19SP115/10618841888600000100.00%0.00%
20SP42/116,2561881196015,525073103,25345,86295.50%20.01%
21SP726/1619021959600000100.00%0.00%
22ST84/12,765185519752,75807043413,97999.75%15.70%
" ], "text/plain": [ "" ] }, "execution_count": 153, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get the columns into the order we want\n", "df = df[['series', 'total_items', 'date_from', 'date_to', 'Open', 'OWE', 'NYE', 'Closed', 'digitised_files', 'digitised_pages']]\n", "\n", "# Calculate and add a percentage open column\n", "df['% open'] = df['Open'] / df['total_items']\n", "\n", "# Calculate and add a percentage digitised column\n", "df['% digitised'] = df['digitised_files'] / df['total_items']\n", "\n", "# Add a link to the series name\n", "df['series'] = df['series'].apply(lambda x: '{}'.format(x.replace('/', '-'), x))\n", "\n", "# Style the output\n", "(df.style\n", " .set_properties(**{'font-size': '120%'})\n", " .set_properties(subset=['series'], **{'text-align': 'left', 'font-weight': 'bold'})\n", " .format('{:,}', ['total_items', 'Open', 'OWE', 'NYE', 'Closed', 'digitised_files', 'digitised_pages'])\n", " .format('{:.2%}', ['% open', '% digitised'])\n", " # Hide the index\n", " .set_table_styles([dict(selector=\"th\", props=[(\"font-size\", \"120%\"), (\"text-align\", \"center\")]),\n", " dict(selector='.row_heading, .blank', props=[('display', 'none')])])\n", " .background_gradient(cmap='Greens', subset=['% open', '% digitised'], high=0.5)\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Calculate some aggregate data\n", "\n", "Now let's calculate some aggregate data across the complete set of series harvests." ] }, { "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total items: 88,580\n" ] } ], "source": [ "total_items = df['total_items'].sum()\n", "print('Total items: {:,}'.format(total_items))" ] }, { "cell_type": "code", "execution_count": 156, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Earliest date: 1800\n" ] } ], "source": [ "date_from = df['date_from'].min()\n", "print('Earliest date: {}'.format(date_from))" ] }, { "cell_type": "code", "execution_count": 157, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Latest date: 2005\n" ] } ], "source": [ "date_to = df['date_to'].max()\n", "print('Latest date: {}'.format(date_to))" ] }, { "cell_type": "code", "execution_count": 161, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Access status:\n", "\n", " Open 74,732 (84.37%)\n", " OWE 83 (0.09%)\n", " NYE 13,759 (15.53%)\n", " Closed 6 (0.01%)\n" ] } ], "source": [ "access_status = {}\n", "total = df['total_items'].sum()\n", "print('Access status:\\n')\n", "for status in ['Open', 'OWE', 'NYE', 'Closed']:\n", " status_total = df[status].sum()\n", " access_status[status] = status_total\n", " print(' {: <10} {: <10,} ({:.2%})'.format(status, status_total, status_total/total))\n" ] }, { "cell_type": "code", "execution_count": 162, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total files digitised: 21,762\n" ] } ], "source": [ "digitised_files = df['digitised_files'].sum()\n", "print('Total files digitised: {:,}'.format(digitised_files))" ] }, { "cell_type": "code", "execution_count": 163, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total pages digitised: 173,120\n" ] } ], "source": [ "digitised_pages = df['digitised_pages'].sum()\n", "print('Total pages digitised: {:,}'.format(digitised_pages))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }