# %% [markdown]
# # Add dates to pages

# %%
import pandas as pd
import arrow
import re
from IPython.display import display, HTML
import altair as alt

# %%
# Note: the slice drops the last row of the series list.
df_series = pd.read_csv('series_list.csv')[:-1]
all_holidays = pd.read_csv('nsw_holidays_1900_1950.csv')
# The 'date' column is left exactly as read from the CSV; calculate_date() compares
# its values against 'YYYY-MM-DD' strings, so the datetime conversion stays disabled.
#all_holidays.loc[:, 'date'] = pd.to_datetime(all_holidays.loc[:, 'date'], errors='coerce')


def get_holidays(year):
    '''
    Return the values of the 'date' column of all_holidays for the given year, as a list.
    '''
    # Single .loc call (row mask + column) instead of chained indexing.
    return all_holidays.loc[all_holidays['year'] == year, 'date'].to_list()


def daterange(start_date, end_date):
    '''
    Yield every day from start_date up to, but not including, end_date.

    Both arguments are expected to be arrow dates (they must support subtraction
    and .shift(days=...)).
    '''
    for n in range((end_date - start_date).days):
        yield start_date.shift(days=+n)


# %%
# Dates whose page count differs from the default for that kind of day.
# calculate_date() uses the value as the number of pages to count for that date
# (0 means no pages at all), taking priority over the Sunday/holiday/Saturday rules.
missing = {
    '1901-01-07': 3,
    '1901-01-18': 4,
    '1901-01-23': 0,  # Death of the Queen business abandoned https://trove.nla.gov.au/newspaper/article/14371864/1343690
    '1901-02-25': 4,
    '1901-03-18': 0,
    '1901-03-29': 0,  # missing
    '1901-04-04': 3,  # No afternoon, day before Easter
    '1901-04-09': 0,  # Extra Easter Tuesday
    '1901-04-10': 0,  # Extra Easter Wednesday
    '1901-05-27': 0,  # Holiday Duke of Cornwall visiting
    '1901-05-28': 0,  # Holiday Duke of Cornwall visiting
    '1901-07-03': 0,  # Holiday for polling day
    '1901-09-16': 4,  # No morning
    '1901-10-10': 4,  # 1 Noon
    '1901-10-30': 4,  # 1 Noon
    '1901-12-16': 2,  # Noon only
    '1902-02-26': 0,  # ??
    '1902-04-02': 3,  # No afternoon
    '1902-06-26': 0,  # ??
    '1902-08-09': 0,  # ??
    '1902-10-17': 6,  # 008_0063 is a duplicate
    '1903-01-06': 4,  # 1 afternoon missing
    '1903-01-09': 4,  # morning missing
    '1903-04-09': 3,  # No afternoon, day before Easter
    '1903-04-14': 0,  # Easter Tuesday
    # 1903-09-02 has no morning, but 3 noons
    '1903-09-08': 4,  # no morning
    # 1903-09-16 has no morning, but 3 noons
    '1903-10-01': 3,  # no afternoon
    '1903-11-18': 3,  # no morning, 1 noon -- see 219 and 220!
    '1903-11-30': 7,  # 2 sheets from 1903-11-18 inserted
    '1903-12-16': 0,  # ??
    '1904-01-20': 3,  # no afternoon
    '1904-08-15': 3,  # no afternoon
    '1904-11-09': 6,  # 016_145 is a duplicate
    '1905-03-02': 6,  # 017_213 is a duplicate
    '1905-03-08': 6,  # 017_239 is a duplicate
    '1905-04-20': 3,  # No afternoon, day before Easter
    '1905-04-25': 0,  # Easter Tuesday
    '1905-04-26': 0,  # Easter Wednesday
    '1906-03-19': 6,  # extra page, 282 is from 1906-03-21
    '1906-03-21': 4,  # 1 page included in 1906-03-19
    '1906-04-02': 4,  # 1 afternoon missing
    '1906-04-06': 4,  # 1 afternoon missing
    '1906-04-09': 4,  # 1 afternoon missing
    '1906-04-10': 4,  # 1 afternoon missing
    '1906-04-11': 4,  # 1 afternoon missing
    '1906-04-12': 3,  # No afternoon, day before Easter
    '1906-04-17': 0,  # Easter Tuesday
    '1906-04-18': 0,  # Easter Wednesday
    '1906-04-25': 4,  # 1 afternoon missing
    '1906-05-02': 4,  # 1 afternoon missing
    '1906-05-03': 4,  # 1 afternoon missing
    '1906-07-12': 4,  # 1 afternoon missing
    '1906-07-16': 4,  # 1 afternoon missing
    '1906-10-25': 3,  # Afternoon missing
    '1907-02-02': 1,  # Saturday 1 page only
    '1907-03-08': 4,  # 1 afternoon missing
    '1907-04-29': 4,  # 1 afternoon missing
    '1907-06-27': 2,  # 2 pages only marked '11 o'clock'
    '1907-09-10': 3,  # No afternoon
    '1907-10-11': 4,  # 1 afternoon missing
    '1907-11-29': 4,  # 1 afternoon missing
    '1907-12-02': 4,  # 1 afternoon missing
    '1908-03-12': 4,  # 1 afternoon missing
    '1908-04-16': 3,  # No afternoon, day before Easter
    '1908-04-21': 0,  # Easter Tuesday
    '1908-08-20': 0,  # American Fleet visit!
    '1908-08-21': 3,  # No morning?
    '1908-08-24': 0,  # American Fleet visit!
    '1908-11-14': 1,  # Saturday 1 page only
    '1929-03-01': 4,
    '1929-03-12': 3,
    '1929-03-27': 3,
    '1930-02-26': 3,
    '1930-04-17': 3,
    '1930-04-22': 0,
    '1930-04-23': 0,
    '1930-04-24': 0,
    '1930-04-26': 0,
    '1930-05-09': 3,
    '1930-12-23': 3
}

# Page identifiers noted as duplicates (not referenced by the functions below).
duplicates = [
    '008_0063',
    '016_145',
    '017_213',
    '119_265'
]

# Identifier(s) flagged as 'backwards' (not referenced by the functions below).
backwards = [
    '120'
]

# %%
# Default number of pages per weekday / Saturday, keyed by an inclusive
# 'first_last' range of volume numbers.
pages_per_vol = {
    '1_134': {
        'weekday': 5,
        'saturday': 2
    },
    '135_145': {
        'weekday': 6,
        'saturday': 2
    },
    '146_164': {
        'weekday': 9,
        'saturday': 3
    },
    '165_190': {
        'weekday': 6,
        'saturday': 3
    },
    '191_199': {
        'weekday': 6,
        'saturday': 0
    },
}


def get_pages(vol_num):
    '''
    Look up the default page counts for a volume number.

    Returns the matching {'weekday': ..., 'saturday': ...} dict from pages_per_vol,
    or None when vol_num falls outside every configured range.
    '''
    for key, pages in pages_per_vol.items():
        bounds = [int(part) for part in key.split('_')]
        if len(bounds) == 2:
            # 'first_last' key: inclusive range of volume numbers.
            if vol_num in range(bounds[0], bounds[1] + 1):
                return pages
        elif vol_num in bounds:
            return pages
    return None


# %%
# Image names look like 'N193-001_0001.tif': volume reference, underscore, page number.
IMAGE_NAME_PATTERN = re.compile(r'(N193\-\d+)_(\d+)')


def _parse_image_name(image_name):
    '''
    Split an image name into its volume reference (str) and page number (int).
    '''
    match = IMAGE_NAME_PATTERN.search(image_name)
    if match is None:
        raise ValueError(f'Cannot find a volume and page number in image name: {image_name!r}')
    vol_ref, page_num = match.groups()
    return vol_ref, int(page_num)


def _get_volume_dates(vol_ref):
    '''
    Return (start_date, end_date) for a volume, with end_date shifted one day
    forward so that daterange() includes the volume's last day.
    '''
    # na=False keeps rows with a missing Item_number from breaking the boolean mask;
    # regex=False matches the reference as plain text.
    matches = df_series.loc[df_series['Item_number'].str.contains(vol_ref, regex=False, na=False)]
    if matches.empty:
        raise ValueError(f'Volume {vol_ref!r} not found in series_list.csv')
    volume = matches.iloc[0]
    start_date = arrow.get(volume['start_date'], 'YYYY-MM-DD')
    end_date = arrow.get(volume['end_date'], 'YYYY-MM-DD').shift(days=+1)
    return start_date, end_date


def _get_volume_pages(vol_ref):
    '''
    Return the page-count dict for a volume reference such as 'N193-001'.
    '''
    vol_num = int(vol_ref.split('-')[-1])
    vol_pages = get_pages(vol_num)
    if vol_pages is None:
        raise ValueError(f'No page counts configured in pages_per_vol for volume {vol_num}')
    return vol_pages


def calculate_date(image_name, start_date, end_date, holidays, weekday_pages=5, saturday_pages=2):
    '''
    Work out the date of a page from its position within a volume.

    Walks through the days from start_date up to (not including) end_date, adding up
    the number of pages expected for each day, and returns the day on which the
    running total reaches the page number embedded in image_name.

    Pages per day are decided in this order:
      1. an entry in `missing` (explicit page count for that date),
      2. Sundays and dates listed in `holidays` -> 0 pages,
      3. Saturdays -> saturday_pages,
      4. any other day -> weekday_pages.

    The defaults (5 and 2) are the values that were previously hard-coded.

    Returns the matching arrow date, or None if the page number is not reached
    within the date range.
    '''
    _, page_num = _parse_image_name(image_name)
    if page_num < 1:
        # Pages are counted from 1, so page 0 can never be reached.
        return None
    holiday_dates = set(holidays)
    pages = 0
    for single_date in daterange(start_date, end_date):
        date_str = single_date.format('YYYY-MM-DD')
        weekday = single_date.weekday()
        if date_str in missing:
            day_pages = missing[date_str]
        elif weekday == 6 or date_str in holiday_dates:
            day_pages = 0
        elif weekday == 5:
            day_pages = saturday_pages
        else:
            day_pages = weekday_pages
        pages += day_pages
        if pages >= page_num:
            return single_date
    return None


def calculate_single_date(image_name):
    '''
    Get the date of a single page.

    Returns an arrow date, or None if the page number is beyond the volume's date range.
    '''
    vol_ref, _ = _parse_image_name(image_name)
    vol_pages = _get_volume_pages(vol_ref)
    start_date, end_date = _get_volume_dates(vol_ref)
    # Holidays are looked up for the year in which the volume starts.
    holidays = get_holidays(start_date.year)
    return calculate_date(
        image_name, start_date, end_date, holidays,
        vol_pages['weekday'], vol_pages['saturday']
    )


def save_dates(year):
    '''
    Assign dates to pages from a year.

    Reads '{year}.csv', calculates a date for every row (using the 'name' and
    'referenceCode' columns) and returns the original data merged with a new
    'page_date' column ('YYYY-MM-DD' strings; None where no date could be calculated).
    '''
    page_dates = []
    df = pd.read_csv(f'{year}.csv')
    holidays = get_holidays(year)
    for vol_ref, rows in df.groupby(by='referenceCode'):
        vol_pages = _get_volume_pages(vol_ref)
        start_date, end_date = _get_volume_dates(vol_ref)
        for row in rows.itertuples():
            page_date = calculate_date(
                row.name, start_date, end_date, holidays,
                vol_pages['weekday'], vol_pages['saturday']
            )
            page_dates.append({
                'name': row.name,
                'page_date': page_date.format('YYYY-MM-DD') if page_date is not None else None
            })
    # Explicit columns keep the merge working even when no rows were processed.
    df_dates = pd.DataFrame(page_dates, columns=['name', 'page_date'])
    df_new = pd.merge(df, df_dates, on='name')
    return df_new


# %%
df_new = save_dates(1901)
\n", " | directory | \n", "name | \n", "path | \n", "referenceCode | \n", "startDate | \n", "endDate | \n", "year | \n", "width | \n", "height | \n", "columns | \n", "column_positions | \n", "page_date | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "AU NBAC N193-001/ | \n", "N193-001_0001.tif | \n", "Shared/ANU-Library/Sydney Stock Exchange 1901-... | \n", "N193-001 | \n", "1901-01-01 | \n", "1901-03-01 | \n", "1901 | \n", "6237 | \n", "5000 | \n", "3 | \n", "0,1811,3222 | \n", "1901-01-07 | \n", "
1 | \n", "AU NBAC N193-001/ | \n", "N193-001_0002.tif | \n", "Shared/ANU-Library/Sydney Stock Exchange 1901-... | \n", "N193-001 | \n", "1901-01-01 | \n", "1901-03-01 | \n", "1901 | \n", "6266 | \n", "5000 | \n", "3 | \n", "205,1840,3259 | \n", "1901-01-07 | \n", "
2 | \n", "AU NBAC N193-001/ | \n", "N193-001_0003.tif | \n", "Shared/ANU-Library/Sydney Stock Exchange 1901-... | \n", "N193-001 | \n", "1901-01-01 | \n", "1901-03-01 | \n", "1901 | \n", "6237 | \n", "5000 | \n", "2 | \n", "286,2068 | \n", "1901-01-07 | \n", "
3 | \n", "AU NBAC N193-001/ | \n", "N193-001_0004.tif | \n", "Shared/ANU-Library/Sydney Stock Exchange 1901-... | \n", "N193-001 | \n", "1901-01-01 | \n", "1901-03-01 | \n", "1901 | \n", "6236 | \n", "5000 | \n", "3 | \n", "9,1821,3219 | \n", "1901-01-08 | \n", "
4 | \n", "AU NBAC N193-001/ | \n", "N193-001_0005.tif | \n", "Shared/ANU-Library/Sydney Stock Exchange 1901-... | \n", "N193-001 | \n", "1901-01-01 | \n", "1901-03-01 | \n", "1901 | \n", "6236 | \n", "5000 | \n", "3 | \n", "288,1821,3220 | \n", "1901-01-08 | \n", "