import os
import pandas as pd
# `display` and `HTML` come from the public IPython.display module; importing
# `display` from IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML
import series_details

# This is a list of all the series harvested as part of this repository.
# Identifiers may contain '/', which is replaced with '-' when the matching
# CSV file name is built in the aggregation cell.
series_list = [
    'B13', 'B6003', 'BP343/15', 'D2860', 'D5036', 'D596', 'E752',
    'J2481', 'J2482', 'J2483', 'J3115', 'K1145', 'P437', 'P526',
    'PP4/2', 'PP6/1', 'SP11/26', 'SP11/6', 'SP115/1', 'SP115/10',
    'SP42/1', 'SP726/1', 'ST84/1',
]
# Build one summary record per harvested series and combine them into a single
# DataFrame, `df`, with one row per series and one integer column per access status.

# Create a list to store the summaries
summaries = []

# Loop through the list of series in this repo
for series in series_list:
    # Harvest files are named after the series, with '/' replaced by '-'
    csv_path = os.path.join('data', '{}.csv'.format(series.replace('/', '-')))
    # Open the CSV of each series harvest as a data frame. A separate name is used
    # so the per-series frame is not confused with the summary frame built below.
    series_df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])
    # Extract a summary of each series and add it to the list of summaries
    summaries.append(series_details.make_summary(series, series_df, include_titles=False))

# Convert the list of summaries into a DataFrame for easy manipulation
df = pd.DataFrame(summaries)

# Flatten the access count dictionaries into one column per access status.
# Series.iteritems() was removed in pandas 2.0, so the dictionaries are pulled
# out with .tolist() instead; the index is passed through so rows stay aligned.
access_counts = pd.DataFrame(df['access_counts'].tolist(), index=df.index)

# Swap the old 'access_counts' column for the flattened columns and fill blanks with zero
df = pd.concat([df.drop(columns='access_counts'), access_counts], axis=1).fillna(0)

# Make sure every expected status has a column, even when no harvested series
# contains it, so the integer conversion below cannot fail with a KeyError.
access_statuses = ['Closed', 'Not yet examined', 'Open with exception', 'Open']
for status in access_statuses:
    if status not in df.columns:
        df[status] = 0

# Change access counts from floats to integers
df[access_statuses] = df[access_statuses].astype(int)

# For convenience acronymise 'Not yet examined' and 'Open with exception'
df = df.rename(columns={'Not yet examined': 'NYE', 'Open with exception': 'OWE'})
\n", " | series | \n", "total_items | \n", "date_from | \n", "date_to | \n", "Open | \n", "OWE | \n", "NYE | \n", "Closed | \n", "digitised_files | \n", "digitised_pages | \n", "% open | \n", "% digitised | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "B13 | \n", "20,194 | \n", "1800 | \n", "2005 | \n", "19,786 | \n", "8 | \n", "400 | \n", "0 | \n", "354 | \n", "5,043 | \n", "97.98% | \n", "1.75% | \n", "
1 | \n", "B6003 | \n", "3 | \n", "1904 | \n", "1959 | \n", "3 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "100.00% | \n", "0.00% | \n", "
2 | \n", "BP343/15 | \n", "2,571 | \n", "1916 | \n", "1955 | \n", "2,566 | \n", "0 | \n", "5 | \n", "0 | \n", "85 | \n", "176 | \n", "99.81% | \n", "3.31% | \n", "
3 | \n", "D2860 | \n", "1 | \n", "1902 | \n", "1957 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0.00% | \n", "0.00% | \n", "
4 | \n", "D5036 | \n", "1 | \n", "1906 | \n", "1935 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "100.00% | \n", "0.00% | \n", "
5 | \n", "D596 | \n", "11,395 | \n", "1871 | \n", "1971 | \n", "2,983 | \n", "31 | \n", "8,381 | \n", "0 | \n", "185 | \n", "3,031 | \n", "26.18% | \n", "1.62% | \n", "
6 | \n", "E752 | \n", "722 | \n", "1905 | \n", "1941 | \n", "719 | \n", "0 | \n", "3 | \n", "0 | \n", "717 | \n", "9,310 | \n", "99.58% | \n", "99.31% | \n", "
7 | \n", "J2481 | \n", "858 | \n", "1897 | \n", "1903 | \n", "858 | \n", "0 | \n", "0 | \n", "0 | \n", "858 | \n", "2,031 | \n", "100.00% | \n", "100.00% | \n", "
8 | \n", "J2482 | \n", "799 | \n", "1902 | \n", "1912 | \n", "799 | \n", "0 | \n", "0 | \n", "0 | \n", "798 | \n", "3,153 | \n", "100.00% | \n", "99.87% | \n", "
9 | \n", "J2483 | \n", "14,438 | \n", "1903 | \n", "1956 | \n", "14,436 | \n", "0 | \n", "2 | \n", "0 | \n", "14,436 | \n", "79,210 | \n", "99.99% | \n", "99.99% | \n", "
10 | \n", "J3115 | \n", "161 | \n", "1899 | \n", "1928 | \n", "161 | \n", "0 | \n", "0 | \n", "0 | \n", "161 | \n", "1,344 | \n", "100.00% | \n", "100.00% | \n", "
11 | \n", "K1145 | \n", "4,816 | \n", "1900 | \n", "1955 | \n", "4,791 | \n", "0 | \n", "25 | \n", "0 | \n", "175 | \n", "874 | \n", "99.48% | \n", "3.63% | \n", "
12 | \n", "P437 | \n", "4,958 | \n", "1901 | \n", "1940 | \n", "4,945 | \n", "10 | \n", "2 | \n", "1 | \n", "18 | \n", "442 | \n", "99.74% | \n", "0.36% | \n", "
13 | \n", "P526 | \n", "2 | \n", "1909 | \n", "1918 | \n", "1 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "50.00% | \n", "0.00% | \n", "
14 | \n", "PP4/2 | \n", "613 | \n", "1903 | \n", "1947 | \n", "610 | \n", "0 | \n", "3 | \n", "0 | \n", "28 | \n", "1,512 | \n", "99.51% | \n", "4.57% | \n", "
15 | \n", "PP6/1 | \n", "6,010 | \n", "1906 | \n", "1978 | \n", "1,863 | \n", "33 | \n", "4,109 | \n", "5 | \n", "245 | \n", "6,461 | \n", "31.00% | \n", "4.08% | \n", "
16 | \n", "SP11/26 | \n", "27 | \n", "1902 | \n", "1902 | \n", "27 | \n", "0 | \n", "0 | \n", "0 | \n", "5 | \n", "84 | \n", "100.00% | \n", "18.52% | \n", "
17 | \n", "SP11/6 | \n", "191 | \n", "1902 | \n", "1947 | \n", "101 | \n", "0 | \n", "90 | \n", "0 | \n", "1 | \n", "323 | \n", "52.88% | \n", "0.52% | \n", "
18 | \n", "SP115/1 | \n", "1,787 | \n", "1884 | \n", "1943 | \n", "1,787 | \n", "0 | \n", "0 | \n", "0 | \n", "9 | \n", "285 | \n", "100.00% | \n", "0.50% | \n", "
19 | \n", "SP115/10 | \n", "6 | \n", "1884 | \n", "1888 | \n", "6 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "100.00% | \n", "0.00% | \n", "
20 | \n", "SP42/1 | \n", "16,256 | \n", "1881 | \n", "1960 | \n", "15,525 | \n", "0 | \n", "731 | \n", "0 | \n", "3,253 | \n", "45,862 | \n", "95.50% | \n", "20.01% | \n", "
21 | \n", "SP726/1 | \n", "6 | \n", "1902 | \n", "1959 | \n", "6 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "100.00% | \n", "0.00% | \n", "
22 | \n", "ST84/1 | \n", "2,765 | \n", "1855 | \n", "1975 | \n", "2,758 | \n", "0 | \n", "7 | \n", "0 | \n", "434 | \n", "13,979 | \n", "99.75% | \n", "15.70% | \n", "