{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"series = 'ST84/1'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import os\n",
"import pandas as pd\n",
"import series_details\n",
"import plotly.offline as py\n",
"py.init_notebook_mode()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Build the path to this series' CSV; '/' in the identifier is replaced by '-' in the filename\n",
"csv_name = '{}.csv'.format(series.replace('/', '-'))\n",
"csv_path = os.path.join('data', csv_name)\n",
"\n",
"# Load the item records, parsing the two date columns as datetimes\n",
"df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"
National Archives of Australia: Series ST84/1
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Certificates of Domicile and Certificates of Exemption from Dictation Test, chronological series
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Total items | 2,765 |
---|
Access status | |
---|
Open | 2,758 (99.75%) |
Not yet examined | 7 (0.25%) |
Number of items digitised | 434 (15.70%) |
---|
Number of pages digitised | 13,979 |
---|
Date of earliest content | 1855 |
---|
Date of latest content | 1975 |
---|
Download the complete CSV file
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"series_details.display_summary(series, df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Content preview"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" identifier | \n",
" series | \n",
" control_symbol | \n",
" title | \n",
" contents_dates | \n",
" start_date | \n",
" end_date | \n",
" access_status | \n",
" location | \n",
" digitised_status | \n",
" digitised_pages | \n",
"
\n",
" \n",
" 0 | \n",
" 1731871 | \n",
" ST84/1 | \n",
" 1907/391-400 | \n",
" James Lee Chong, Way Sing, Walter Hing Hee, Ah See, Charlie Joy, Nicholas Saseen, Foo Jun, Hop Sing, Sun Sing Lee and Jack Hoy [Certificate Exempting from Dictation Test - includes left hand impression and photographs] [box 16] | \n",
" 1907 - 1907 | \n",
" 1907-01-01 00:00:00 | \n",
" 1907-01-01 00:00:00 | \n",
" Open | \n",
" Sydney | \n",
" True | \n",
" 34 | \n",
"
\n",
" 1 | \n",
" 7288001 | \n",
" ST84/1 | \n",
" 1919/270/81-90 | \n",
" Jong Say, Wong Kwong, Lee You Wing, Foo Gun, Mar Kum, Gock Buck, Ah Get, Jeong Keong, Percy Zuinn and Ah Yum [Certificate Exempting from Dictation Test - includes left hand impression and photographs] [box 122] | \n",
" 1919 - 1919 | \n",
" 1919-01-01 00:00:00 | \n",
" 1919-01-01 00:00:00 | \n",
" Open | \n",
" Sydney | \n",
" False | \n",
" 0 | \n",
"
\n",
" 2 | \n",
" 7288002 | \n",
" ST84/1 | \n",
" 1919/270/91-100 | \n",
" Ming Gar, Ah Loong, Lun Soy, Gung Sun, Ah Lock, John Nop or Jan Nap, Peter Sing, Louie Wee, Sue Hoo and Lee Yuen [Certificate Exempting from Dictation Test - includes left hand impression and photographs] [box 122] | \n",
" 1919 - 1919 | \n",
" 1919-01-01 00:00:00 | \n",
" 1919-01-01 00:00:00 | \n",
" Open | \n",
" Sydney | \n",
" False | \n",
" 0 | \n",
"
\n",
" 3 | \n",
" 7288003 | \n",
" ST84/1 | \n",
" 1919/271/1-10 | \n",
" Charley Eip, Chew Bun, Lee Chut, Lum Gow, Tommy Low, Low Zuai, Charlie Gong or Charlie Kwong, Lee So, Chi Wort and Go Foo [Certificate Exempting from Dictation Test - includes left hand impression and photographs] [box 123] | \n",
" 1919 - 1919 | \n",
" 1919-01-01 00:00:00 | \n",
" 1919-01-01 00:00:00 | \n",
" Open | \n",
" Sydney | \n",
" False | \n",
" 0 | \n",
"
\n",
" 4 | \n",
" 7288004 | \n",
" ST84/1 | \n",
" 1919/271/11-20 | \n",
" Dewan Singh, Joseph Sequiera, Lee Gum Sue, Ah Suey, Fong Foon, Ah Seck, Man Duck, Lee Tim and Ah Moon [Certificate Exempting from Dictation Test - includes left hand impression and photographs] [box 123] | \n",
" 1919 - 1919 | \n",
" 1919-01-01 00:00:00 | \n",
" 1919-01-01 00:00:00 | \n",
" Open | \n",
" Sydney | \n",
" False | \n",
" 0 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Change the number_of_rows value to see more\n",
"number_of_rows = 5\n",
"\n",
"# Presentation rules: left-align the long titles, centre the column headers,\n",
"# and hide the index column (row headings plus the blank corner cell)\n",
"title_alignment = {'text-align': 'left'}\n",
"table_styles = [\n",
"    dict(selector=\"th\", props=[(\"text-align\", \"center\")]),\n",
"    dict(selector='.row_heading, .blank', props=[('display', 'none')]),\n",
"]\n",
"\n",
"# Display dataframe (the final expression is what the cell renders)\n",
"(\n",
"    df[:number_of_rows]\n",
"    .style\n",
"    .set_properties(subset=['title'], **title_alignment)\n",
"    .set_table_styles(table_styles)\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Plot content dates"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"data": [
{
"name": "Digitised",
"type": "bar",
"x": [
1855,
1856,
1857,
1858,
1859,
1860,
1861,
1862,
1863,
1864,
1865,
1866,
1867,
1868,
1869,
1870,
1871,
1872,
1873,
1874,
1875,
1876,
1877,
1878,
1879,
1880,
1881,
1882,
1883,
1884,
1885,
1886,
1887,
1888,
1889,
1890,
1891,
1892,
1893,
1894,
1895,
1896,
1897,
1898,
1899,
1900,
1901,
1902,
1903,
1904,
1905,
1906,
1907,
1908,
1909,
1910,
1911,
1912,
1913,
1914,
1915,
1916,
1917,
1918,
1919,
1920,
1921,
1922,
1923,
1924,
1925,
1926,
1927,
1928,
1929,
1930,
1931,
1932,
1933,
1934,
1935,
1936,
1937,
1938,
1939,
1940,
1941,
1942,
1943,
1944,
1945,
1946,
1947,
1948,
1949,
1950,
1951,
1952,
1953
],
"y": [
1,
1,
1,
2,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
3,
5,
6,
6,
13,
13,
14,
18,
24,
27,
35,
38,
38,
38,
38,
38,
38,
38,
38,
38,
38,
39,
40,
40,
40,
40,
40,
40,
40,
40,
40,
29,
37,
46,
58,
72,
54,
17,
7,
10,
4,
6,
5,
7,
4,
6,
5,
7,
4,
5,
3,
2,
1,
3,
5,
3,
7,
4,
4,
1,
2,
2,
3,
3,
4,
3,
3,
3,
2,
2,
1,
1,
1,
6,
1,
2,
1,
1,
1,
1,
1
]
},
{
"name": "Not digitised",
"type": "bar",
"x": [
1881,
1882,
1883,
1884,
1885,
1886,
1887,
1888,
1889,
1890,
1891,
1892,
1893,
1894,
1895,
1896,
1897,
1898,
1899,
1900,
1901,
1902,
1903,
1904,
1905,
1906,
1907,
1908,
1909,
1910,
1911,
1912,
1913,
1914,
1915,
1916,
1917,
1918,
1919,
1920,
1921,
1922,
1923,
1924,
1925,
1926,
1927,
1928,
1929,
1930,
1931,
1932,
1933,
1934,
1935,
1936,
1937,
1938,
1939,
1940,
1941,
1942,
1943,
1944,
1945,
1946,
1947,
1948,
1949,
1950,
1951,
1952,
1953,
1975
],
"y": [
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
2,
2,
2,
2,
2,
5,
2,
2,
4,
3,
26,
74,
83,
97,
111,
103,
111,
95,
73,
63,
83,
83,
113,
85,
100,
98,
77,
99,
102,
85,
74,
71,
38,
38,
38,
39,
38,
42,
36,
29,
29,
23,
22,
2,
2,
2,
2,
36,
27,
9,
9,
11,
6,
3,
2,
1
]
}
],
"layout": {
"barmode": "stack",
"title": "Content dates",
"xaxis": {
"title": "Year"
},
"yaxis": {
"title": "Number of items"
}
}
},
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Build the content-dates bar chart from the item records, then render it inline\n",
"dates_fig = series_details.plot_dates(df)\n",
"py.iplot(dates_fig, filename='series-dates-bar')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## View word frequencies"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Combine all of the file titles into a single lowercase string, separated by spaces.\n",
"# (The former chained assignment also bound the same text to an unused global named a.)\n",
"title_text = df['title'].str.lower().str.cat(sep=' ')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" word | \n",
" count | \n",
"
\n",
" \n",
" 8 | \n",
" ah | \n",
" 6,070 | \n",
"
\n",
" 29 | \n",
" box | \n",
" 2,830 | \n",
"
\n",
" 20 | \n",
" certificate | \n",
" 2,747 | \n",
"
\n",
" 1 | \n",
" lee | \n",
" 2,737 | \n",
"
\n",
" 24 | \n",
" includes | \n",
" 2,735 | \n",
"
\n",
" 26 | \n",
" hand | \n",
" 2,732 | \n",
"
\n",
" 25 | \n",
" left | \n",
" 2,732 | \n",
"
\n",
" 27 | \n",
" impression | \n",
" 2,732 | \n",
"
\n",
" 28 | \n",
" photographs | \n",
" 2,715 | \n",
"
\n",
" 21 | \n",
" exempting | \n",
" 2,646 | \n",
"
\n",
" 23 | \n",
" test | \n",
" 2,646 | \n",
"
\n",
" 22 | \n",
" dictation | \n",
" 2,646 | \n",
"
\n",
" 2 | \n",
" chong | \n",
" 1,228 | \n",
"
\n",
" 4 | \n",
" sing | \n",
" 1,171 | \n",
"
\n",
" 33 | \n",
" wong | \n",
" 1,060 | \n",
"
\n",
" 107 | \n",
" young | \n",
" 1,000 | \n",
"
\n",
" 108 | \n",
" yee | \n",
" 934 | \n",
"
\n",
" 110 | \n",
" george | \n",
" 702 | \n",
"
\n",
" 170 | \n",
" choy | \n",
" 679 | \n",
"
\n",
" 6 | \n",
" hing | \n",
" 653 | \n",
"
\n",
" 73 | \n",
" low | \n",
" 650 | \n",
"
\n",
" 10 | \n",
" charlie | \n",
" 612 | \n",
"
\n",
" 70 | \n",
" lum | \n",
" 609 | \n",
"
\n",
" 86 | \n",
" fong | \n",
" 580 | \n",
"
\n",
" 95 | \n",
" gee | \n",
" 575 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"series_details.display_word_counts(title_text)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" ngram | \n",
" count | \n",
"
\n",
" \n",
" 0 | \n",
" includes left | \n",
" 2,732 | \n",
"
\n",
" 1 | \n",
" hand impression | \n",
" 2,732 | \n",
"
\n",
" 2 | \n",
" left hand | \n",
" 2,732 | \n",
"
\n",
" 3 | \n",
" photographs box | \n",
" 2,715 | \n",
"
\n",
" 4 | \n",
" impression and | \n",
" 2,712 | \n",
"
\n",
" 5 | \n",
" and photographs | \n",
" 2,712 | \n",
"
\n",
" 6 | \n",
" dictation test | \n",
" 2,646 | \n",
"
\n",
" 7 | \n",
" exempting from | \n",
" 2,646 | \n",
"
\n",
" 8 | \n",
" from dictation | \n",
" 2,646 | \n",
"
\n",
" 9 | \n",
" certificate exempting | \n",
" 2,646 | \n",
"
\n",
" 10 | \n",
" test includes | \n",
" 2,639 | \n",
"
\n",
" 11 | \n",
" and ah | \n",
" 518 | \n",
"
\n",
" 12 | \n",
" or ah | \n",
" 326 | \n",
"
\n",
" 13 | \n",
" lee ah | \n",
" 199 | \n",
"
\n",
" 14 | \n",
" ah chong | \n",
" 173 | \n",
"
\n",
" 15 | \n",
" ah sing | \n",
" 172 | \n",
"
\n",
" 16 | \n",
" sing ah | \n",
" 166 | \n",
"
\n",
" 17 | \n",
" ah sam | \n",
" 165 | \n",
"
\n",
" 18 | \n",
" and lee | \n",
" 152 | \n",
"
\n",
" 19 | \n",
" chong ah | \n",
" 145 | \n",
"
\n",
" 20 | \n",
" sydney nsw | \n",
" 128 | \n",
"
\n",
" 21 | \n",
" lee and | \n",
" 128 | \n",
"
\n",
" 22 | \n",
" lee certificate | \n",
" 125 | \n",
"
\n",
" 23 | \n",
" ah tong | \n",
" 110 | \n",
"
\n",
" 24 | \n",
" sing and | \n",
" 104 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Change ngram_count for larger ngrams (trigrams etc)\n",
"ngram_count = 2\n",
"series_details.display_top_ngrams(title_text, ngram_count)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}