{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Identifier of the series to summarise; later cells use it to build the CSV filename and the summary\n",
"series = 'J3115'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import os\n",
"import pandas as pd\n",
"import series_details\n",
"import plotly.offline as py\n",
"py.init_notebook_mode()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(os.path.join('data', '{}.csv'.format(series.replace('/', '-'))), parse_dates=['start_date', 'end_date'])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"
National Archives of Australia: Series J3115
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Alien Immigration files relating to applications for Certificate of Domicile, Certificates of Exemption from the Chinese Immigration Restriction Act 1888 and Certificates of Exemption from the Dictation Test that includes photographs, birth certificates and other historical documents, imposed single number series
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Total items | 161 |
---|
Access status | |
---|
Open | 161 (100.00%) |
Number of items digitised | 161 (100.00%) |
---|
Number of pages digitised | 1,344 |
---|
Date of earliest content | 1899 |
---|
Date of latest content | 1928 |
---|
Download the complete CSV file
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Render the series heading, description and summary table (item totals, access status,\n",
"# digitisation counts, content date range) as shown in the saved output below\n",
"series_details.display_summary(series, df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Content preview"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" identifier | \n",
" series | \n",
" control_symbol | \n",
" title | \n",
" contents_dates | \n",
" start_date | \n",
" end_date | \n",
" access_status | \n",
" location | \n",
" digitised_status | \n",
" digitised_pages | \n",
"
\n",
" \n",
" 0 | \n",
" 5058001 | \n",
" J3115 | \n",
" 1 | \n",
" Certificate of Domicile for Mah Wah, a market gardener from Bundaberg - includes photographs | \n",
" 1902 - 1902 | \n",
" 1902-01-01 00:00:00 | \n",
" 1902-01-01 00:00:00 | \n",
" Open | \n",
" Brisbane | \n",
" True | \n",
" 1 | \n",
"
\n",
" 1 | \n",
" 5058002 | \n",
" J3115 | \n",
" 2 | \n",
" Certificate of Domicile for Sui Tim, a fruiterer and general merchant from Brisbane - includes photographs | \n",
" 1902 - 1903 | \n",
" 1902-01-01 00:00:00 | \n",
" 1903-01-01 00:00:00 | \n",
" Open | \n",
" Brisbane | \n",
" True | \n",
" 1 | \n",
"
\n",
" 2 | \n",
" 5058003 | \n",
" J3115 | \n",
" 50 | \n",
" Certificate of Domicile for Charlie Jock, a storekeeper from Clermont - includes photographs | \n",
" 1903 - 1905 | \n",
" 1903-01-01 00:00:00 | \n",
" 1905-01-01 00:00:00 | \n",
" Open | \n",
" Brisbane | \n",
" True | \n",
" 2 | \n",
"
\n",
" 3 | \n",
" 5058004 | \n",
" J3115 | \n",
" 3 | \n",
" Certificate of Domicile for Tommy Young Hopp, a cook from Brisbane - includes photographs | \n",
" 1902 - 1903 | \n",
" 1902-01-01 00:00:00 | \n",
" 1903-01-01 00:00:00 | \n",
" Open | \n",
" Brisbane | \n",
" True | \n",
" 1 | \n",
"
\n",
" 4 | \n",
" 5058005 | \n",
" J3115 | \n",
" 4 | \n",
" Certificate of Domicile for Jong Hee, the owner of a paper bag factory from Brisbane - includes photographs | \n",
" 1902 - 1904 | \n",
" 1902-01-01 00:00:00 | \n",
" 1904-01-01 00:00:00 | \n",
" Open | \n",
" Brisbane | \n",
" True | \n",
" 1 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Change the number_of_rows value to see more\n",
"number_of_rows = 5\n",
"\n",
"# Display dataframe \n",
"df[:number_of_rows].style.set_properties(['title'], **{'text-align': 'left'}).set_table_styles([dict(selector=\"th\", props=[(\"text-align\", \"center\")]),\n",
" dict(selector='.row_heading, .blank', props=[('display', 'none')])])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Plot content dates"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"data": [
{
"name": "Digitised",
"type": "bar",
"x": [
1899,
1900,
1901,
1902,
1903,
1904,
1905,
1906,
1907,
1908,
1909,
1910,
1911,
1912,
1913,
1914,
1915,
1916,
1917,
1918,
1919,
1920,
1921,
1922,
1923,
1924,
1925,
1926,
1927,
1928
],
"y": [
5,
14,
23,
92,
77,
39,
12,
5,
7,
7,
6,
7,
9,
9,
9,
8,
8,
8,
8,
9,
10,
7,
4,
3,
3,
1,
1,
1,
1,
1
]
}
],
"layout": {
"barmode": "stack",
"title": "Content dates",
"xaxis": {
"title": "Year"
},
"yaxis": {
"title": "Number of items"
}
}
},
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Build the 'Content dates' bar chart (number of items per year) from the loaded data\n",
"fig = series_details.plot_dates(df)\n",
"# Render the figure inline in the notebook using plotly's offline mode\n",
"py.iplot(fig, filename='series-dates-bar')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## View word frequencies"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Combine all of the file titles into a single string\n",
"title_text = a = df['title'].str.lower().str.cat(sep=' ')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" word | \n",
" count | \n",
"
\n",
" \n",
" 0 | \n",
" certificate | \n",
" 168 | \n",
"
\n",
" 7 | \n",
" includes | \n",
" 139 | \n",
"
\n",
" 1 | \n",
" domicile | \n",
" 109 | \n",
"
\n",
" 8 | \n",
" photographs | \n",
" 100 | \n",
"
\n",
" 45 | \n",
" ah | \n",
" 74 | \n",
"
\n",
" 106 | \n",
" application | \n",
" 55 | \n",
"
\n",
" 181 | \n",
" photograph | \n",
" 46 | \n",
"
\n",
" 104 | \n",
" correspondence | \n",
" 43 | \n",
"
\n",
" 17 | \n",
" storekeeper | \n",
" 41 | \n",
"
\n",
" 178 | \n",
" exemption | \n",
" 40 | \n",
"
\n",
" 5 | \n",
" gardener | \n",
" 36 | \n",
"
\n",
" 105 | \n",
" relating | \n",
" 35 | \n",
"
\n",
" 14 | \n",
" brisbane | \n",
" 17 | \n",
"
\n",
" 158 | \n",
" birth | \n",
" 15 | \n",
"
\n",
" 391 | \n",
" pages | \n",
" 14 | \n",
"
\n",
" 174 | \n",
" born | \n",
" 13 | \n",
"
\n",
" 194 | \n",
" queensland | \n",
" 13 | \n",
"
\n",
" 39 | \n",
" island | \n",
" 13 | \n",
"
\n",
" 44 | \n",
" townsville | \n",
" 13 | \n",
"
\n",
" 115 | \n",
" also | \n",
" 13 | \n",
"
\n",
" 13 | \n",
" merchant | \n",
" 12 | \n",
"
\n",
" 38 | \n",
" thursday | \n",
" 12 | \n",
"
\n",
" 154 | \n",
" father | \n",
" 12 | \n",
"
\n",
" 48 | \n",
" lee | \n",
" 11 | \n",
"
\n",
" 65 | \n",
" hing | \n",
" 11 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show a table of words and how often each occurs in the combined titles\n",
"series_details.display_word_counts(title_text)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" ngram | \n",
" count | \n",
"
\n",
" \n",
" 0 | \n",
" certificate of | \n",
" 150 | \n",
"
\n",
" 1 | \n",
" of domicile | \n",
" 109 | \n",
"
\n",
" 2 | \n",
" includes photographs | \n",
" 94 | \n",
"
\n",
" 3 | \n",
" domicile for | \n",
" 93 | \n",
"
\n",
" 4 | \n",
" photographs certificate | \n",
" 51 | \n",
"
\n",
" 5 | \n",
" application for | \n",
" 48 | \n",
"
\n",
" 6 | \n",
" of exemption | \n",
" 40 | \n",
"
\n",
" 7 | \n",
" for certificate | \n",
" 38 | \n",
"
\n",
" 8 | \n",
" includes photograph | \n",
" 36 | \n",
"
\n",
" 9 | \n",
" relating to | \n",
" 35 | \n",
"
\n",
" 10 | \n",
" for ah | \n",
" 35 | \n",
"
\n",
" 11 | \n",
" a storekeeper | \n",
" 33 | \n",
"
\n",
" 12 | \n",
" correspondence relating | \n",
" 33 | \n",
"
\n",
" 13 | \n",
" storekeeper from | \n",
" 29 | \n",
"
\n",
" 14 | \n",
" exemption for | \n",
" 23 | \n",
"
\n",
" 15 | \n",
" a gardener | \n",
" 23 | \n",
"
\n",
" 16 | \n",
" gardener from | \n",
" 22 | \n",
"
\n",
" 17 | \n",
" to the | \n",
" 21 | \n",
"
\n",
" 18 | \n",
" photographs application | \n",
" 15 | \n",
"
\n",
" 19 | \n",
" a certificate | \n",
" 15 | \n",
"
\n",
" 20 | \n",
" photograph certificate | \n",
" 15 | \n",
"
\n",
" 21 | \n",
" for a | \n",
" 14 | \n",
"
\n",
" 22 | \n",
" birth certificate | \n",
" 14 | \n",
"
\n",
" 23 | \n",
" from the | \n",
" 13 | \n",
"
\n",
" 24 | \n",
" and correspondence | \n",
" 13 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Number of words per ngram: 2 gives the two-word phrases shown below;\n",
"# change ngram_count for larger ngrams (trigrams etc)\n",
"ngram_count = 2\n",
"series_details.display_top_ngrams(title_text, ngram_count)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}