# Comparing harvests of closed files

This notebook brings together annual harvests of files with an access status of 'closed', scraped from the NAA's RecordSearch database. The data files are here:

* [2015](data/closed-20160101.csv) (harvested 1 January 2016)
* [2016](data/closed-20170109.csv) (harvested 9 January 2017)
* [2017](data/closed-20180101.csv) (harvested 1 January 2018)
* [2018](data/closed-20190101.csv) (harvested 1 January 2019)
* [2019](data/closed-20200101.csv) (harvested 1 January 2020)
* [2020](data/closed-20210101.csv) (harvested 1 January 2021)

The current code used to harvest 'closed' files is in [this notebook](harvest_closed_files.ipynb). Previous versions can be found in [this repository](https://github.com/wragge/closed_access).

In [11]:
from pathlib import Path

import altair as alt
import pandas as pd

In [12]:
harvests = {
    "2015": "closed-20160101.csv",
    "2016": "closed-20170109.csv",
    "2017": "closed-20180101.csv",
    "2018": "closed-20190101.csv",
    "2019": "closed-20200101.csv",
    "2020": "closed-20210101.csv",
    "2021": "closed-20220101.csv",
}

In [13]:
# Load all the data into a single dataframe
dfs = []
for year, data_file in harvests.items():
    df_year = pd.read_csv(
        Path("data", data_file),
        parse_dates=[
            "contents_start_date",
            "contents_end_date",
            "access_decision_date",
        ],
        keep_default_na=False,
    )
    df_year["harvested_year"] = year
    dfs.append(df_year)
df = pd.concat(dfs)
df.head()

Unnamed: 0,identifier,series,control_symbol,title,series_title,contents_date_str,contents_start_date,contents_end_date,access_status,access_decision_date,reasons,harvested_year,location,access_decision_date_str,digitised_status,digitised_pages,access_decision_reasons,retrieved
0,12332,A1,1911/21007,Salvatore Pagano Naturalization Issued to Immi...,"Correspondence files, annual single number ser...",1961 - 1961,1961-01-01 00:00:00,1961-01-01 00:00:00,Closed,1981-07-28 00:00:00,Pre Access Recorder,2015,,,,,,
1,15403,A1,1913/6809,Meeting of Commonwealth Literary Fund (Missing...,"Correspondence files, annual single number ser...",1913 - 1913,1913-01-01 00:00:00,1913-01-01 00:00:00,Closed,1981-09-28 00:00:00,Pre Access Recorder,2015,,,,,,
2,33093,A1,1915/11532,Wilhelm CA Simonsen - Naturalization Issued to...,"Correspondence files, annual single number ser...",1961 - 1961,1961-01-01 00:00:00,1961-01-01 00:00:00,Closed,1981-12-03 00:00:00,Pre Access Recorder,2015,,,,,,
3,46663,A2,1907/554,Report of Conference of Statisticians (File Co...,"Correspondence files, annual single number series",1904 - 1920,1904-01-01 00:00:00,1920-01-01 00:00:00,Closed,1973-06-20 00:00:00,Pre Access Recorder,2015,,,,,,
4,47046,A2,1915/346,Rossino - Mario,"Correspondence files, annual single number series",1915 - 1915,1915-01-01 00:00:00,1915-01-01 00:00:00,Closed,1973-06-28 00:00:00,Pre Access Recorder,2015,,,,,,


## Number of closed files in each harvest

In [14]:
year_counts = df["harvested_year"].value_counts().to_frame().reset_index()
year_counts.columns = ["year", "count"]
year_counts.sort_values(by="year")

Unnamed: 0,year,count
0,2015,14370
6,2016,10750
4,2017,11189
1,2018,11953
2,2019,11867
5,2020,11140
3,2021,11377


In [15]:
alt.Chart(year_counts).mark_bar(point=True).encode(
    x=alt.X("year:O", title="Year end"),
    y=alt.Y("count:Q", title="Number of closed files"),
    color=alt.Color("year", legend=None),
    tooltip=["year:O", "count:Q"],
).properties(width=300)

## Find the number of times each reason is cited in the annual harvests

In [16]:
df_reasons = df.copy()
df_reasons["reason"] = df_reasons["reasons"].str.split("|")
df_reasons = df_reasons.explode("reason")
df_reasons["reason"].replace("", "No reason", inplace=True)

In [17]:
unique_reasons = sorted(list(df_reasons["reason"].unique()))
unique_reasons

['33(1)(a)',
 '33(1)(b)',
 '33(1)(c)',
 '33(1)(d)',
 '33(1)(e)(i)',
 '33(1)(e)(ii)',
 '33(1)(e)(iii)',
 '33(1)(f)(i)',
 '33(1)(f)(ii)',
 '33(1)(f)(iii)',
 '33(1)(g)',
 '33(1)(h)',
 '33(1)(j)',
 '33(2)(a)',
 '33(2)(b)',
 '33(3)(a)(i)',
 '33(3)(a)(ii)',
 '33(3)(b)',
 'Cabinet notebooks',
 'Closed period',
 'Court records',
 'Destroyed',
 'MAKE YOUR SELECTION',
 'NRF',
 'No reason',
 'Non Cwlth-depositor',
 'Non Cwlth-no appeal',
 'Parliament Class A',
 'Pre Access Recorder',
 'Withheld pending adv',
 "['33(1)(a)', '33(1)(b)', '33(1)(c)', 'Withheld pending adv']",
 "['33(1)(a)', '33(1)(b)', '33(1)(d)', '33(1)(g)', 'Withheld pending adv']",
 "['33(1)(a)', '33(1)(b)', '33(1)(d)', '33(1)(g)']",
 "['33(1)(a)', '33(1)(b)', '33(1)(d)', 'Withheld pending adv']",
 "['33(1)(a)', '33(1)(b)', '33(1)(d)']",
 "['33(1)(a)', '33(1)(b)', '33(1)(e)(ii)', '33(1)(g)']",
 "['33(1)(a)', '33(1)(b)', '33(1)(e)(ii)']",
 "['33(1)(a)', '33(1)(b)', '33(1)(e)(iii)']",
 "['33(1)(a)', '33(1)(b)', '33(1)(g)', 'Withheld

In [18]:
harvest_reasons_counts = (
    df_reasons.groupby(by=["harvested_year", "reason"]).size().reset_index()
)
harvest_reasons_counts.columns = ["year", "reason", "count"]

## Visualise the number of times each reason is cited

In [19]:
alt.Chart(harvest_reasons_counts).mark_bar().encode(
    x=alt.X("year:O", title=None),
    y=alt.Y("count:Q", title="Number of files"),
    color=alt.Color("year:N", legend=None),
    facet=alt.Facet(
        "reason:O", align="each", columns=5, title="Reason for being closed"
    ),
    tooltip=["year:O", "reason:N", "count:Q"],
).properties(height=200).resolve_scale(x="independent")

## Focus on a specific reason

Select a reason from the dropdown list to examine change over time.

In [20]:
input_dropdown = alt.binding_select(
    options=[None] + unique_reasons, labels=["All"] + unique_reasons
)
selection = alt.selection_single(fields=["reason"], bind=input_dropdown, name="Select")

alt.Chart(harvest_reasons_counts).mark_bar().encode(
    x=alt.X("year:O", title=None),
    y=alt.Y("count:Q", title="Number of files"),
    color=alt.Color("year:N", legend=None),
    column=alt.Column("reason:N", title="Reason for being closed"),
    tooltip=["year:O", "reason:N", "count:Q"],
).add_selection(selection).transform_filter(selection).properties(
    height=200
).resolve_scale(
    x="independent"
)