{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Visualise column detection results\n", "\n", "I ran my column detection script across all 199 volumes and 70,000+ pages of the Sydney Stock Exchange registers, generating a CSV file for each year from 1901 to 1950. See [this notebook](Column-detection-results.ipynb) for some more details\n", "\n", "This notebook combines the separate CSV files into a single dataframe and then visualises the column detection results using Altair." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DataTransformerRegistry.enable('json')" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "import os\n", "import altair as alt\n", "from IPython.display import display, HTML\n", "\n", "# Comment the next line out if using in JupyterLab\n", "alt.renderers.enable('notebook')\n", "\n", "alt.data_transformers.enable('json')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# We're going to combine all of the CSV files into one big dataframe\n", "\n", "# Create an empty dataframe\n", "combined_df = pd.DataFrame()\n", "\n", "# Loop through the range of years\n", "for year in range(1901, 1951):\n", " \n", " # Open the CSV file for that year as a dataframe\n", " year_df = pd.read_csv('{}.csv'.format(year))\n", " \n", " # Add the single year df to the combined df\n", " combined_df = combined_df.append(year_df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To visualise the results, we first need to move the data around a bit to get it into a form we can chart using Altair.\n", "\n", "Because this loops through dataframes for each volume calculating column widths, it is a bit slow..." 
] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Replace any NaNs with 0\n", "combined_df['column_positions'].fillna('0', inplace=True)\n", "\n", "# Group images by volume using the 'referenceCode' column\n", "vol_groups = combined_df.groupby('referenceCode')\n", "\n", "# Create an empty dataframe\n", "df_volumes = pd.DataFrame()\n", "\n", "# Just used to limit the number of volumes processed for testing\n", "max_rows = 200\n", "row = 0\n", "\n", "# Loop through the volume groups\n", "for vol, pages in vol_groups:\n", " if row < max_rows:\n", " \n", " # Pages is a dataframe with details of all the pages in a volume\n", " pages = pages.copy()\n", " \n", " # Convert the width to a string, so we concatenate with the column positions\n", " pages['width'] = pages['width'].astype('str')\n", " \n", " # Add the page width to the end of the column positions string\n", " pages['column_positions'] = pages[['column_positions', 'width']].apply(lambda x: ','.join(x), axis=1)\n", " \n", " # Create a new dataframe by exploding the column positions into separate rows, using the file name as the index\n", " new_df = pd.DataFrame(pages['column_positions'].str.split(',').tolist(), index=pages['name']).stack()\n", " \n", " # Change the file name from an index into a column\n", " new_df = new_df.reset_index([0, 'name'])\n", " \n", " # Rename the columns\n", " new_df.columns = ['name', 'col_pos']\n", " \n", " # Convert col_pos column to an integer\n", " new_df['col_pos'] = new_df['col_pos'].astype('int')\n", " \n", " # Extract the page number from the file name and save as a new field\n", " new_df['page_num'] = new_df['name'].str.extract(r'_(\\d+)\\.').astype('int')\n", " \n", " # Add the volume name\n", " new_df['volume'] = vol\n", " \n", " # Add the volume number\n", " new_df['volume_num'] = int(vol[-3:])\n", " \n", " # Sort pages by the volume number / page number\n", " new_df = new_df.sort_values(by=['volume_num','page_num'])\n", " \n", " # Number the columns in each row\n", " new_df['order'] = new_df.groupby('page_num').cumcount()\n", " \n", " # In this loop we're going to calculate the width of each column by subtracting\n", " # the previous col_pos from the current one\n", " # Loop through the dataframe\n", " for i in range(1, len(new_df)):\n", " # if it's the first column then the width is equal to the position\n", " if new_df.loc[i, 'order'] == 0:\n", " new_df.loc[i, 'col_width'] = new_df.loc[i, 'col_pos']\n", " \n", " # Otherwise we have to get the previous value and subtract\n", " else:\n", " new_df.loc[i, 'col_width'] = new_df.loc[i, 'col_pos'] - new_df.loc[i-1, 'col_pos']\n", " \n", " # Append to the combined volumes dataframe\n", " df_volumes = df_volumes.append(new_df)\n", " row += 1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's have a peek inside the new dataframe." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | name | \n", "col_pos | \n", "page_num | \n", "volume | \n", "volume_num | \n", "order | \n", "col_width | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "N193-001_0001.tif | \n", "0 | \n", "1 | \n", "N193-001 | \n", "1 | \n", "0 | \n", "NaN | \n", "
1 | \n", "N193-001_0001.tif | \n", "1811 | \n", "1 | \n", "N193-001 | \n", "1 | \n", "1 | \n", "1811.0 | \n", "
2 | \n", "N193-001_0001.tif | \n", "3222 | \n", "1 | \n", "N193-001 | \n", "1 | \n", "2 | \n", "1411.0 | \n", "
3 | \n", "N193-001_0001.tif | \n", "6237 | \n", "1 | \n", "N193-001 | \n", "1 | \n", "3 | \n", "3015.0 | \n", "
4 | \n", "N193-001_0002.tif | \n", "205 | \n", "2 | \n", "N193-001 | \n", "1 | \n", "0 | \n", "205.0 | \n", "