{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Summarise series data\n", "\n", "In this notebook we'll summarise data from all the harvested series." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "" ], "text/vnd.plotly.v1+html": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import os\n", "import pandas as pd\n", "from IPython.core.display import display, HTML\n", "import plotly.offline as py\n", "import plotly.graph_objs as go\n", "py.init_notebook_mode()\n", "import series_details" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# This is a list of all the series harvested as part of this repository.\n", "# Each series identifier must appear only once: the aggregation loop below builds one\n", "# summary row per entry, so a repeated identifier would be counted twice in the combined dataframe.\n", "series_list = [\n", "    'A6119', 'A6122', 'A6126', 'A9626', 'A6335', 'B2836',\n", "    'A8703', 'A13828', 'A6281', 'A6285', 'A6283', 'A6282',\n", "    'A9106', 'A9108', 'A9105', 'A12694', 'D1902', 'D1915',\n", "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Aggregate all the series data into a single dataframe\n", "\n", "Let's combine summaries of all the harvested series into a single dataframe so we can look at the big picture."
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Create a list to store one summary record per harvested series\n", "summaries = []\n", "\n", "# Loop through the list of series in this repo\n", "for series in series_list:\n", "    # Harvest files are named after the series, with any '/' in the identifier replaced by '-'\n", "    csv_path = os.path.join('data', '{}.csv'.format(series.replace('/', '-')))\n", "    # Open the CSV of each series harvest as a data frame.\n", "    # It gets its own name so it is not confused with the combined `df` built after the loop.\n", "    series_df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])\n", "    # Extract a summary of each series and add it to the list of summaries\n", "    summaries.append(series_details.make_summary(series, series_df, include_titles=False))\n", "\n", "# Convert the list of summaries into a DataFrame for easy manipulation\n", "df = pd.DataFrame(summaries)\n", "\n", "# Flatten the access count dictionaries into one column per access status.\n", "# Series.iteritems() was removed in pandas 2.0, so build the frame straight from the list of dictionaries.\n", "access_counts = pd.DataFrame(df['access_counts'].tolist(), index=df.index)\n", "\n", "# Add the flattened columns and fill blanks with zero (note that fillna applies to every column)\n", "df = pd.concat([df, access_counts], axis=1).fillna(0)\n", "\n", "# Change access counts from floats to integers\n", "access_statuses = ['Closed', 'Not yet examined', 'Open with exception', 'Open']\n", "df[access_statuses] = df[access_statuses].astype(int)\n", "\n", "# Drop the old 'access_counts' column and, for convenience, acronymise\n", "# 'Not yet examined' and 'Open with exception'\n", "df = df.drop(columns=['access_counts']).rename(columns={'Not yet examined': 'NYE', 'Open with exception': 'OWE'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Display a summary table of all series\n", "\n", "Let's display a summary of each series in a nicely formatted table." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ " \n", "
\n", " | series | \n", "total_items | \n", "date_from | \n", "date_to | \n", "Open | \n", "OWE | \n", "NYE | \n", "Closed | \n", "digitised_files | \n", "digitised_pages | \n", "% open | \n", "% digitised | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "A6119 | \n", "6,741 | \n", "1852 | \n", "2009 | \n", "43 | \n", "6,314 | \n", "363 | \n", "20 | \n", "2,320 | \n", "258,547 | \n", "0.64% | \n", "34.42% | \n", "
1 | \n", "A6122 | \n", "2,819 | \n", "1800 | \n", "1993 | \n", "162 | \n", "2,376 | \n", "137 | \n", "138 | \n", "565 | \n", "69,007 | \n", "5.75% | \n", "20.04% | \n", "
2 | \n", "A6126 | \n", "1,409 | \n", "1800 | \n", "1993 | \n", "83 | \n", "1,306 | \n", "8 | \n", "11 | \n", "364 | \n", "13,521 | \n", "5.89% | \n", "25.83% | \n", "
3 | \n", "A9626 | \n", "1,075 | \n", "1919 | \n", "1998 | \n", "792 | \n", "277 | \n", "6 | \n", "0 | \n", "570 | \n", "9,370 | \n", "73.67% | \n", "53.02% | \n", "
4 | \n", "A6335 | \n", "42 | \n", "1922 | \n", "1956 | \n", "38 | \n", "4 | \n", "0 | \n", "0 | \n", "25 | \n", "2,607 | \n", "90.48% | \n", "59.52% | \n", "
5 | \n", "B2836 | \n", "14 | \n", "1926 | \n", "1972 | \n", "14 | \n", "0 | \n", "0 | \n", "0 | \n", "3 | \n", "375 | \n", "100.00% | \n", "21.43% | \n", "
6 | \n", "A8703 | \n", "641 | \n", "1937 | \n", "1980 | \n", "328 | \n", "0 | \n", "313 | \n", "0 | \n", "0 | \n", "0 | \n", "51.17% | \n", "0.00% | \n", "
7 | \n", "A13828 | \n", "12 | \n", "1955 | \n", "1974 | \n", "3 | \n", "0 | \n", "9 | \n", "0 | \n", "0 | \n", "0 | \n", "25.00% | \n", "0.00% | \n", "
8 | \n", "A6281 | \n", "17 | \n", "0 | \n", "0 | \n", "11 | \n", "1 | \n", "5 | \n", "0 | \n", "0 | \n", "0 | \n", "64.71% | \n", "0.00% | \n", "
9 | \n", "A6285 | \n", "132 | \n", "1954 | \n", "1955 | \n", "83 | \n", "31 | \n", "17 | \n", "0 | \n", "110 | \n", "186 | \n", "62.88% | \n", "83.33% | \n", "
10 | \n", "A6283 | \n", "256 | \n", "1800 | \n", "1959 | \n", "21 | \n", "208 | \n", "24 | \n", "3 | \n", "23 | \n", "3,352 | \n", "8.20% | \n", "8.98% | \n", "
11 | \n", "A6282 | \n", "14 | \n", "1954 | \n", "1956 | \n", "13 | \n", "1 | \n", "0 | \n", "0 | \n", "2 | \n", "328 | \n", "92.86% | \n", "14.29% | \n", "
12 | \n", "A6126 | \n", "1,409 | \n", "1800 | \n", "1993 | \n", "83 | \n", "1,306 | \n", "8 | \n", "11 | \n", "364 | \n", "13,521 | \n", "5.89% | \n", "25.83% | \n", "
13 | \n", "A9106 | \n", "1 | \n", "1968 | \n", "1968 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "100.00% | \n", "0.00% | \n", "
14 | \n", "A9108 | \n", "691 | \n", "1920 | \n", "1967 | \n", "220 | \n", "465 | \n", "2 | \n", "4 | \n", "107 | \n", "9,810 | \n", "31.84% | \n", "15.48% | \n", "
15 | \n", "A9105 | \n", "1 | \n", "1991 | \n", "1991 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "100.00% | \n", "0.00% | \n", "
16 | \n", "A12694 | \n", "25 | \n", "1965 | \n", "1986 | \n", "5 | \n", "20 | \n", "0 | \n", "0 | \n", "8 | \n", "669 | \n", "20.00% | \n", "32.00% | \n", "
17 | \n", "D1902 | \n", "3 | \n", "1920 | \n", "1960 | \n", "3 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "100.00% | \n", "0.00% | \n", "
18 | \n", "D1915 | \n", "4,884 | \n", "1800 | \n", "1987 | \n", "2,703 | \n", "101 | \n", "2,007 | \n", "73 | \n", "203 | \n", "13,917 | \n", "55.34% | \n", "4.16% | \n", "