{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"series = 'J2482'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import os\n",
"import pandas as pd\n",
"import series_details\n",
"import plotly.offline as py\n",
"py.init_notebook_mode()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(os.path.join('data', '{}.csv'.format(series.replace('/', '-'))), parse_dates=['start_date', 'end_date'])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"
National Archives of Australia: Series J2482
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Certificates of Domicile issued under The Immigration Restriction Act 1901 and Regulations, annual single number series
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Total items | 799 |
---|
Access status | |
---|
Open | 799 (100.00%) |
Number of items digitised | 798 (99.87%) |
---|
Number of pages digitised | 3,153 |
---|
Date of earliest content | 1902 |
---|
Date of latest content | 1912 |
---|
Download the complete CSV file
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"series_details.display_summary(series, df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Content preview"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" identifier | \n",
" series | \n",
" control_symbol | \n",
" title | \n",
" contents_dates | \n",
" start_date | \n",
" end_date | \n",
" access_status | \n",
" location | \n",
" digitised_status | \n",
" digitised_pages | \n",
"
\n",
" \n",
" 0 | \n",
" 5049001 | \n",
" J2482 | \n",
" 1904/103 | \n",
" Sheong Fook of Geraldton [Innisfail], Qld - birthplace: Canton, China - departed Geraldton [Innisfail], Queensland on the Empire 7 September 1904 | \n",
" 1904 - 1904 | \n",
" 1904-01-01 00:00:00 | \n",
" 1904-01-01 00:00:00 | \n",
" Open | \n",
" Brisbane | \n",
" True | \n",
" 4 | \n",
"
\n",
" 1 | \n",
" 5049002 | \n",
" J2482 | \n",
" 1904/104 | \n",
" Ah Gee of Macnade near Dungeness, Qld - birthplace: Canton, China - departed Dungeness, Queensland on the Tsinan 26 June 1904 | \n",
" 1904 - 1904 | \n",
" 1904-01-01 00:00:00 | \n",
" 1904-01-01 00:00:00 | \n",
" Open | \n",
" Brisbane | \n",
" True | \n",
" 3 | \n",
"
\n",
" 2 | \n",
" 5049003 | \n",
" J2482 | \n",
" 1904/105 | \n",
" Ah Yeen of Johnstone near Geraldton [Innisfail] - birthplace: Canton, China - departed Geraldton [Innisfail], Queensland on the Tsinan 25 June 1904 | \n",
" 1904 - 1904 | \n",
" 1904-01-01 00:00:00 | \n",
" 1904-01-01 00:00:00 | \n",
" Open | \n",
" Brisbane | \n",
" True | \n",
" 4 | \n",
"
\n",
" 3 | \n",
" 5049004 | \n",
" J2482 | \n",
" 1904/106 | \n",
" Khardin of Hambleton, Cairns, Qld - birthplace: Punjaub, India - departed Cairns, Queensland 28 August 1908 | \n",
" 1904 - 1908 | \n",
" 1904-01-01 00:00:00 | \n",
" 1908-01-01 00:00:00 | \n",
" Open | \n",
" Brisbane | \n",
" True | \n",
" 5 | \n",
"
\n",
" 4 | \n",
" 5049005 | \n",
" J2482 | \n",
" 1904/108 | \n",
" Yep Fat of Junda, Qld - birthplace: Canton, China - departed Brisbane on the Tsinan 20 June 1904 | \n",
" 1904 - 1904 | \n",
" 1904-01-01 00:00:00 | \n",
" 1904-01-01 00:00:00 | \n",
" Open | \n",
" Brisbane | \n",
" True | \n",
" 4 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Change the number_of_rows value to see more\n",
"number_of_rows = 5\n",
"\n",
"# Display dataframe \n",
"df[:number_of_rows].style.set_properties(['title'], **{'text-align': 'left'}).set_table_styles([dict(selector=\"th\", props=[(\"text-align\", \"center\")]),\n",
" dict(selector='.row_heading, .blank', props=[('display', 'none')])])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Plot content dates"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"data": [
{
"name": "Digitised",
"type": "bar",
"x": [
1902,
1903,
1904,
1905,
1906,
1907,
1908,
1909,
1910,
1911,
1912
],
"y": [
53,
164,
272,
351,
113,
9,
7,
2,
1,
1,
1
]
},
{
"name": "Not digitised",
"type": "bar",
"x": [
1905
],
"y": [
1
]
}
],
"layout": {
"barmode": "stack",
"title": "Content dates",
"xaxis": {
"title": "Year"
},
"yaxis": {
"title": "Number of items"
}
}
},
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"fig = series_details.plot_dates(df)\n",
"py.iplot(fig, filename='series-dates-bar')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## View word frequencies"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Combine all of the file titles into a single string\n",
"title_text = a = df['title'].str.lower().str.cat(sep=' ')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" word | \n",
" count | \n",
"
\n",
" \n",
" 9 | \n",
" queensland | \n",
" 950 | \n",
"
\n",
" 5 | \n",
" birthplace | \n",
" 745 | \n",
"
\n",
" 8 | \n",
" departed | \n",
" 730 | \n",
"
\n",
" 7 | \n",
" china | \n",
" 722 | \n",
"
\n",
" 6 | \n",
" canton | \n",
" 636 | \n",
"
\n",
" 27 | \n",
" cairns | \n",
" 402 | \n",
"
\n",
" 4 | \n",
" qld | \n",
" 375 | \n",
"
\n",
" 66 | \n",
" 1905 | \n",
" 269 | \n",
"
\n",
" 14 | \n",
" ah | \n",
" 259 | \n",
"
\n",
" 13 | \n",
" 1904 | \n",
" 226 | \n",
"
\n",
" 36 | \n",
" brisbane | \n",
" 207 | \n",
"
\n",
" 46 | \n",
" townsville | \n",
" 173 | \n",
"
\n",
" 126 | \n",
" australian | \n",
" 163 | \n",
"
\n",
" 2 | \n",
" geraldton | \n",
" 156 | \n",
"
\n",
" 99 | \n",
" island | \n",
" 145 | \n",
"
\n",
" 106 | \n",
" thursday | \n",
" 140 | \n",
"
\n",
" 83 | \n",
" eastern | \n",
" 136 | \n",
"
\n",
" 10 | \n",
" empire | \n",
" 130 | \n",
"
\n",
" 635 | \n",
" 1903 | \n",
" 120 | \n",
"
\n",
" 85 | \n",
" november | \n",
" 100 | \n",
"
\n",
" 17 | \n",
" near | \n",
" 98 | \n",
"
\n",
" 58 | \n",
" 1906 | \n",
" 97 | \n",
"
\n",
" 74 | \n",
" december | \n",
" 97 | \n",
"
\n",
" 3 | \n",
" innisfail | \n",
" 96 | \n",
"
\n",
" 206 | \n",
" january | \n",
" 78 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"series_details.display_word_counts(title_text)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" ngram | \n",
" count | \n",
"
\n",
" \n",
" 0 | \n",
" china departed | \n",
" 690 | \n",
"
\n",
" 1 | \n",
" on the | \n",
" 678 | \n",
"
\n",
" 2 | \n",
" canton china | \n",
" 636 | \n",
"
\n",
" 3 | \n",
" birthplace canton | \n",
" 635 | \n",
"
\n",
" 4 | \n",
" queensland on | \n",
" 587 | \n",
"
\n",
" 5 | \n",
" qld birthplace | \n",
" 375 | \n",
"
\n",
" 6 | \n",
" queensland birthplace | \n",
" 318 | \n",
"
\n",
" 7 | \n",
" cairns queensland | \n",
" 258 | \n",
"
\n",
" 8 | \n",
" departed cairns | \n",
" 221 | \n",
"
\n",
" 9 | \n",
" the australian | \n",
" 163 | \n",
"
\n",
" 10 | \n",
" brisbane queensland | \n",
" 160 | \n",
"
\n",
" 11 | \n",
" townsville queensland | \n",
" 141 | \n",
"
\n",
" 12 | \n",
" thursday island | \n",
" 140 | \n",
"
\n",
" 13 | \n",
" the eastern | \n",
" 136 | \n",
"
\n",
" 14 | \n",
" departed brisbane | \n",
" 131 | \n",
"
\n",
" 15 | \n",
" the empire | \n",
" 130 | \n",
"
\n",
" 16 | \n",
" cairns qld | \n",
" 109 | \n",
"
\n",
" 17 | \n",
" departed townsville | \n",
" 108 | \n",
"
\n",
" 18 | \n",
" geraldton innisfail | \n",
" 96 | \n",
"
\n",
" 19 | \n",
" of cairns | \n",
" 82 | \n",
"
\n",
" 20 | \n",
" of geraldton | \n",
" 82 | \n",
"
\n",
" 21 | \n",
" near cairns | \n",
" 81 | \n",
"
\n",
" 22 | \n",
" departed geraldton | \n",
" 73 | \n",
"
\n",
" 23 | \n",
" departed thursday | \n",
" 72 | \n",
"
\n",
" 24 | \n",
" 1905 ah | \n",
" 68 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Change ngram_count for larger ngrams (trigrams etc)\n",
"ngram_count = 2\n",
"series_details.display_top_ngrams(title_text, ngram_count)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}