{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"series = 'BP343/15'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import os\n",
"import pandas as pd\n",
"import series_details\n",
"import plotly.offline as py\n",
"py.init_notebook_mode()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(os.path.join('data', '{}.csv'.format(series.replace('/', '-'))), parse_dates=['start_date', 'end_date'])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"
National Archives of Australia: Series BP343/15
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Registers of aliens departing from the Port of Townsville who were granted a certificate exempting from dictation test [CEDT]
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Total items | 2,571 |
---|
Access status | |
---|
Open | 2,566 (99.81%) |
Not yet examined | 5 (0.19%) |
Number of items digitised | 85 (3.31%) |
---|
Number of pages digitised | 176 |
---|
Date of earliest content | 1916 |
---|
Date of latest content | 1955 |
---|
Download the complete CSV file
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"series_details.display_summary(series, df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Content preview"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" identifier | \n",
" series | \n",
" control_symbol | \n",
" title | \n",
" contents_dates | \n",
" start_date | \n",
" end_date | \n",
" access_status | \n",
" location | \n",
" digitised_status | \n",
" digitised_pages | \n",
"
\n",
" \n",
" 0 | \n",
" 9103820 | \n",
" BP343/15 | \n",
" 14/1013 | \n",
" Name: Lum Yee - Nationality: Chinese - Birthplace: Canton - Certificate of Exemption from the Dictation Test (CEDT) number: 466/21 | \n",
" 1929 - 1932 | \n",
" 1929-01-01 00:00:00 | \n",
" 1932-01-01 00:00:00 | \n",
" Open | \n",
" Brisbane | \n",
" False | \n",
" 0 | \n",
"
\n",
" 1 | \n",
" 9108210 | \n",
" BP343/15 | \n",
" 13/824 | \n",
" Name: Hoo Wah (of Townsville) - Nationality: Chinese - Birthplace: Canton - Certificate of Exemption from the Dictation Test (CEDT) number: 439/23 | \n",
" 1928 - 1929 | \n",
" 1928-01-01 00:00:00 | \n",
" 1929-01-01 00:00:00 | \n",
" Open | \n",
" Brisbane | \n",
" False | \n",
" 0 | \n",
"
\n",
" 2 | \n",
" 9108211 | \n",
" BP343/15 | \n",
" 13/823 | \n",
" Name: Ah Cow (of Charters Towers) - Nationality: Chinese - Birthplace: Canton - Certificate of Exemption from the Dictation Test (CEDT) number: 439/19 | \n",
" 1928 - 1928 | \n",
" 1928-01-01 00:00:00 | \n",
" 1928-01-01 00:00:00 | \n",
" Open | \n",
" Brisbane | \n",
" False | \n",
" 0 | \n",
"
\n",
" 3 | \n",
" 9108212 | \n",
" BP343/15 | \n",
" 13/822 | \n",
" Name: Bon Kan [Bu Conn] (of Townsville) - Nationality: Chinese - Birthplace: Canton - Certificate of Exemption from the Dictation Test (CEDT) number: 439/28 | \n",
" 1928 - 1928 | \n",
" 1928-01-01 00:00:00 | \n",
" 1928-01-01 00:00:00 | \n",
" Open | \n",
" Brisbane | \n",
" False | \n",
" 0 | \n",
"
\n",
" 4 | \n",
" 9108213 | \n",
" BP343/15 | \n",
" 13/821 | \n",
" Name: Ah Hat - Nationality: Chinese - Birthplace: Canton - Certificate of Exemption from the Dictation Test (CEDT) number: 439/17 | \n",
" 1928 - 1928 | \n",
" 1928-01-01 00:00:00 | \n",
" 1928-01-01 00:00:00 | \n",
" Open | \n",
" Brisbane | \n",
" False | \n",
" 0 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Change the number_of_rows value to see more\n",
"number_of_rows = 5\n",
"\n",
"# Display dataframe \n",
"df[:number_of_rows].style.set_properties(['title'], **{'text-align': 'left'}).set_table_styles([dict(selector=\"th\", props=[(\"text-align\", \"center\")]),\n",
" dict(selector='.row_heading, .blank', props=[('display', 'none')])])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Plot content dates"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"data": [
{
"name": "Digitised",
"type": "bar",
"x": [
1916,
1917,
1918,
1919,
1920,
1921,
1922,
1923,
1924,
1925,
1926,
1927,
1928,
1929,
1930,
1931,
1932,
1933,
1934,
1935,
1936,
1937,
1938,
1947,
1948,
1954
],
"y": [
3,
7,
12,
3,
4,
4,
7,
12,
14,
12,
9,
11,
16,
10,
9,
4,
8,
2,
2,
2,
3,
7,
1,
1,
1,
1
]
},
{
"name": "Not digitised",
"type": "bar",
"x": [
1916,
1917,
1918,
1919,
1920,
1921,
1922,
1923,
1924,
1925,
1926,
1927,
1928,
1929,
1930,
1931,
1932,
1933,
1934,
1935,
1936,
1937,
1938,
1939,
1940,
1941,
1942,
1943,
1944,
1945,
1946,
1947,
1948,
1949,
1950,
1955
],
"y": [
1,
64,
242,
244,
271,
294,
311,
303,
279,
260,
315,
355,
336,
305,
286,
212,
180,
133,
129,
116,
111,
109,
97,
72,
54,
34,
3,
3,
3,
3,
24,
38,
5,
2,
1,
3
]
}
],
"layout": {
"barmode": "stack",
"title": "Content dates",
"xaxis": {
"title": "Year"
},
"yaxis": {
"title": "Number of items"
}
}
},
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"fig = series_details.plot_dates(df)\n",
"py.iplot(fig, filename='series-dates-bar')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## View word frequencies"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Combine all of the file titles into a single string\n",
"title_text = a = df['title'].str.lower().str.cat(sep=' ')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" word | \n",
" count | \n",
"
\n",
" \n",
" 0 | \n",
" name | \n",
" 2,565 | \n",
"
\n",
" 3 | \n",
" nationality | \n",
" 2,542 | \n",
"
\n",
" 5 | \n",
" birthplace | \n",
" 2,460 | \n",
"
\n",
" 12 | \n",
" number | \n",
" 2,323 | \n",
"
\n",
" 7 | \n",
" certificate | \n",
" 2,322 | \n",
"
\n",
" 11 | \n",
" cedt | \n",
" 2,315 | \n",
"
\n",
" 9 | \n",
" dictation | \n",
" 2,313 | \n",
"
\n",
" 10 | \n",
" test | \n",
" 2,313 | \n",
"
\n",
" 8 | \n",
" exemption | \n",
" 2,312 | \n",
"
\n",
" 4 | \n",
" chinese | \n",
" 2,189 | \n",
"
\n",
" 6 | \n",
" canton | \n",
" 1,950 | \n",
"
\n",
" 16 | \n",
" townsville | \n",
" 852 | \n",
"
\n",
" 18 | \n",
" ah | \n",
" 447 | \n",
"
\n",
" 73 | \n",
" lee | \n",
" 242 | \n",
"
\n",
" 174 | \n",
" japanese | \n",
" 195 | \n",
"
\n",
" 175 | \n",
" japan | \n",
" 177 | \n",
"
\n",
" 36 | \n",
" chong | \n",
" 129 | \n",
"
\n",
" 89 | \n",
" indian | \n",
" 122 | \n",
"
\n",
" 93 | \n",
" sing | \n",
" 121 | \n",
"
\n",
" 145 | \n",
" wong | \n",
" 112 | \n",
"
\n",
" 77 | \n",
" leong | \n",
" 112 | \n",
"
\n",
" 2 | \n",
" yee | \n",
" 110 | \n",
"
\n",
" 1 | \n",
" lum | \n",
" 109 | \n",
"
\n",
" 15 | \n",
" wah | \n",
" 104 | \n",
"
\n",
" 171 | \n",
" india | \n",
" 97 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"series_details.display_word_counts(title_text)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" ngram | \n",
" count | \n",
"
\n",
" \n",
" 0 | \n",
" cedt number | \n",
" 2,315 | \n",
"
\n",
" 1 | \n",
" dictation test | \n",
" 2,313 | \n",
"
\n",
" 2 | \n",
" the dictation | \n",
" 2,312 | \n",
"
\n",
" 3 | \n",
" certificate of | \n",
" 2,312 | \n",
"
\n",
" 4 | \n",
" of exemption | \n",
" 2,312 | \n",
"
\n",
" 5 | \n",
" from the | \n",
" 2,312 | \n",
"
\n",
" 6 | \n",
" exemption from | \n",
" 2,312 | \n",
"
\n",
" 7 | \n",
" test cedt | \n",
" 2,312 | \n",
"
\n",
" 8 | \n",
" nationality chinese | \n",
" 2,171 | \n",
"
\n",
" 9 | \n",
" chinese birthplace | \n",
" 2,109 | \n",
"
\n",
" 10 | \n",
" birthplace canton | \n",
" 1,949 | \n",
"
\n",
" 11 | \n",
" canton certificate | \n",
" 1,854 | \n",
"
\n",
" 12 | \n",
" of townsville | \n",
" 832 | \n",
"
\n",
" 13 | \n",
" townsville nationality | \n",
" 830 | \n",
"
\n",
" 14 | \n",
" name ah | \n",
" 322 | \n",
"
\n",
" 15 | \n",
" nationality japanese | \n",
" 195 | \n",
"
\n",
" 16 | \n",
" japanese birthplace | \n",
" 182 | \n",
"
\n",
" 17 | \n",
" japan certificate | \n",
" 166 | \n",
"
\n",
" 18 | \n",
" birthplace japan | \n",
" 165 | \n",
"
\n",
" 19 | \n",
" name lee | \n",
" 149 | \n",
"
\n",
" 20 | \n",
" nationality indian | \n",
" 117 | \n",
"
\n",
" 21 | \n",
" indian birthplace | \n",
" 111 | \n",
"
\n",
" 22 | \n",
" canton name | \n",
" 93 | \n",
"
\n",
" 23 | \n",
" india certificate | \n",
" 91 | \n",
"
\n",
" 24 | \n",
" name leong | \n",
" 87 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Change ngram_count for larger ngrams (trigrams etc)\n",
"ngram_count = 2\n",
"series_details.display_top_ngrams(title_text, ngram_count)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}