{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"series = 'SP42/1'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import os\n",
"import pandas as pd\n",
"import series_details\n",
"import plotly.offline as py\n",
"py.init_notebook_mode()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(os.path.join('data', '{}.csv'.format(series.replace('/', '-'))), parse_dates=['start_date', 'end_date'])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"
National Archives of Australia: Series SP42/1
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Correspondence of the Collector of Customs relating to Immigration Restriction and Passports
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Total items | 16,256 |
---|
Access status | |
---|
Open | 15,525 (95.50%) |
Not yet examined | 731 (4.50%) |
Number of items digitised | 3,253 (20.01%) |
---|
Number of pages digitised | 45,862 |
---|
Date of earliest content | 1881 |
---|
Date of latest content | 1960 |
---|
Download the complete CSV file
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"series_details.display_summary(series, df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Content preview"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" identifier | \n",
" series | \n",
" control_symbol | \n",
" title | \n",
" contents_dates | \n",
" start_date | \n",
" end_date | \n",
" access_status | \n",
" location | \n",
" digitised_status | \n",
" digitised_pages | \n",
"
\n",
" \n",
" 0 | \n",
" 1053878 | \n",
" SP42/1 | \n",
" B1906/694 | \n",
" AH KIM [correspondence of the Collector of Customs relating to immigration restrictions] [6 pages] [box 15] | \n",
" 1906 - 1906 | \n",
" 1906-01-01 00:00:00 | \n",
" 1906-01-01 00:00:00 | \n",
" Open | \n",
" Sydney | \n",
" True | \n",
" 6 | \n",
"
\n",
" 1 | \n",
" 1563661 | \n",
" SP42/1 | \n",
" B1905/1553 | \n",
" Ah Kong, includes photographs | \n",
" 1905 - circa1905 | \n",
" 1905-01-01 00:00:00 | \n",
" NaT | \n",
" Open | \n",
" Sydney | \n",
" True | \n",
" 10 | \n",
"
\n",
" 2 | \n",
" 1563665 | \n",
" SP42/1 | \n",
" B1905/1557 | \n",
" Ah Yet, includes photographs | \n",
" 1905 - 1905 | \n",
" 1905-01-01 00:00:00 | \n",
" 1905-01-01 00:00:00 | \n",
" Open | \n",
" Sydney | \n",
" True | \n",
" 12 | \n",
"
\n",
" 3 | \n",
" 1563670 | \n",
" SP42/1 | \n",
" B1905/1561 | \n",
" You Gee | \n",
" 1905 - 1905 | \n",
" 1905-01-01 00:00:00 | \n",
" 1905-01-01 00:00:00 | \n",
" Open | \n",
" Sydney | \n",
" True | \n",
" 8 | \n",
"
\n",
" 4 | \n",
" 1563675 | \n",
" SP42/1 | \n",
" B1905/1565 | \n",
" Deserters from the RMS INDIA; Meer Afzul and Ackbar Carrandad | \n",
" 1905 - 1905 | \n",
" 1905-01-01 00:00:00 | \n",
" 1905-01-01 00:00:00 | \n",
" Open | \n",
" Sydney | \n",
" True | \n",
" 17 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Change the number_of_rows value to see more\n",
"number_of_rows = 5\n",
"\n",
"# Display dataframe \n",
"df[:number_of_rows].style.set_properties(['title'], **{'text-align': 'left'}).set_table_styles([dict(selector=\"th\", props=[(\"text-align\", \"center\")]),\n",
" dict(selector='.row_heading, .blank', props=[('display', 'none')])])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Plot content dates"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"data": [
{
"name": "Digitised",
"type": "bar",
"x": [
1881,
1882,
1883,
1884,
1885,
1886,
1887,
1888,
1889,
1890,
1891,
1892,
1893,
1894,
1895,
1896,
1897,
1898,
1899,
1900,
1901,
1902,
1903,
1904,
1905,
1906,
1907,
1908,
1909,
1910,
1911,
1912,
1913,
1914,
1915,
1916,
1917,
1918,
1919,
1920,
1921,
1922,
1923,
1924,
1925,
1926,
1927,
1928,
1929,
1930,
1931,
1932,
1933,
1934,
1935,
1936,
1937,
1938,
1939,
1940,
1941,
1942,
1943,
1944,
1945,
1946,
1947,
1953
],
"y": [
1,
2,
7,
12,
14,
14,
14,
15,
16,
16,
16,
19,
19,
19,
19,
19,
20,
21,
30,
41,
57,
88,
296,
283,
384,
319,
465,
518,
630,
762,
708,
670,
556,
607,
675,
741,
657,
544,
451,
247,
98,
91,
87,
82,
84,
78,
71,
63,
61,
56,
51,
48,
43,
37,
34,
34,
37,
32,
33,
18,
11,
10,
7,
8,
4,
3,
1,
1
]
},
{
"name": "Not digitised",
"type": "bar",
"x": [
1882,
1883,
1884,
1885,
1886,
1887,
1888,
1889,
1890,
1891,
1892,
1893,
1894,
1895,
1896,
1897,
1898,
1899,
1900,
1901,
1902,
1903,
1904,
1905,
1906,
1907,
1908,
1909,
1910,
1911,
1912,
1913,
1914,
1915,
1916,
1917,
1918,
1919,
1920,
1921,
1922,
1923,
1924,
1925,
1926,
1927,
1928,
1929,
1930,
1931,
1932,
1933,
1934,
1935,
1936,
1937,
1938,
1939,
1940,
1941,
1942,
1943,
1944,
1945,
1946,
1947,
1948,
1949,
1950,
1951,
1952,
1953,
1954,
1955,
1956,
1957,
1960
],
"y": [
1,
1,
2,
2,
2,
3,
3,
3,
3,
3,
3,
3,
4,
5,
6,
6,
8,
10,
14,
19,
34,
137,
265,
429,
691,
967,
1260,
1605,
1888,
2244,
2616,
3060,
3236,
3358,
3393,
3542,
3679,
3853,
4021,
4254,
4344,
4200,
3940,
3823,
3709,
3403,
3099,
2753,
2417,
2120,
1932,
1731,
1565,
1486,
1617,
1729,
1533,
1365,
996,
713,
473,
773,
1276,
479,
129,
26,
14,
11,
10,
10,
10,
9,
12,
7,
4,
4,
1
]
}
],
"layout": {
"barmode": "stack",
"title": "Content dates",
"xaxis": {
"title": "Year"
},
"yaxis": {
"title": "Number of items"
}
}
},
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"fig = series_details.plot_dates(df)\n",
"py.iplot(fig, filename='series-dates-bar')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## View word frequencies"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Combine all of the file titles into a single string\n",
"title_text = a = df['title'].str.lower().str.cat(sep=' ')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" word | \n",
" count | \n",
"
\n",
" \n",
" 10 | \n",
" box | \n",
" 14,077 | \n",
"
\n",
" 13 | \n",
" includes | \n",
" 9,891 | \n",
"
\n",
" 43 | \n",
" left | \n",
" 9,844 | \n",
"
\n",
" 77 | \n",
" prints | \n",
" 7,827 | \n",
"
\n",
" 32 | \n",
" showing | \n",
" 7,181 | \n",
"
\n",
" 33 | \n",
" front | \n",
" 7,144 | \n",
"
\n",
" 203 | \n",
" ex | \n",
" 6,808 | \n",
"
\n",
" 34 | \n",
" side | \n",
" 6,695 | \n",
"
\n",
" 35 | \n",
" views | \n",
" 6,590 | \n",
"
\n",
" 14 | \n",
" photographs | \n",
" 6,521 | \n",
"
\n",
" 1055 | \n",
" thumb | \n",
" 6,171 | \n",
"
\n",
" 235 | \n",
" right | \n",
" 6,149 | \n",
"
\n",
" 158 | \n",
" sydney | \n",
" 4,292 | \n",
"
\n",
" 1047 | \n",
" subject | \n",
" 3,745 | \n",
"
\n",
" 0 | \n",
" ah | \n",
" 3,585 | \n",
"
\n",
" 39 | \n",
" also | \n",
" 3,409 | \n",
"
\n",
" 40 | \n",
" known | \n",
" 3,318 | \n",
"
\n",
" 1052 | \n",
" arrived | \n",
" 2,935 | \n",
"
\n",
" 1063 | \n",
" issue | \n",
" 2,843 | \n",
"
\n",
" 1064 | \n",
" favour | \n",
" 2,795 | \n",
"
\n",
" 63 | \n",
" certificate | \n",
" 2,737 | \n",
"
\n",
" 44 | \n",
" hand | \n",
" 2,635 | \n",
"
\n",
" 31 | \n",
" 2 | \n",
" 2,503 | \n",
"
\n",
" 240 | \n",
" exemption | \n",
" 2,371 | \n",
"
\n",
" 1061 | \n",
" finger | \n",
" 1,978 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"series_details.display_word_counts(title_text)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" ngram | \n",
" count | \n",
"
\n",
" \n",
" 0 | \n",
" showing front | \n",
" 7,109 | \n",
"
\n",
" 1 | \n",
" and side | \n",
" 6,660 | \n",
"
\n",
" 2 | \n",
" front and | \n",
" 6,637 | \n",
"
\n",
" 3 | \n",
" side views | \n",
" 6,554 | \n",
"
\n",
" 4 | \n",
" and left | \n",
" 6,514 | \n",
"
\n",
" 5 | \n",
" and right | \n",
" 6,139 | \n",
"
\n",
" 6 | \n",
" left and | \n",
" 6,082 | \n",
"
\n",
" 7 | \n",
" photographs showing | \n",
" 6,072 | \n",
"
\n",
" 8 | \n",
" thumb prints | \n",
" 6,049 | \n",
"
\n",
" 9 | \n",
" right thumb | \n",
" 6,005 | \n",
"
\n",
" 10 | \n",
" known as | \n",
" 3,296 | \n",
"
\n",
" 11 | \n",
" also known | \n",
" 3,293 | \n",
"
\n",
" 12 | \n",
" views and | \n",
" 3,244 | \n",
"
\n",
" 13 | \n",
" sydney on | \n",
" 3,098 | \n",
"
\n",
" 14 | \n",
" prints box | \n",
" 2,994 | \n",
"
\n",
" 15 | \n",
" of subject | \n",
" 2,898 | \n",
"
\n",
" 16 | \n",
" issue of | \n",
" 2,836 | \n",
"
\n",
" 17 | \n",
" in favour | \n",
" 2,790 | \n",
"
\n",
" 18 | \n",
" favour of | \n",
" 2,778 | \n",
"
\n",
" 19 | \n",
" in sydney | \n",
" 2,741 | \n",
"
\n",
" 20 | \n",
" arrived ex | \n",
" 2,694 | \n",
"
\n",
" 21 | \n",
" subject box | \n",
" 2,596 | \n",
"
\n",
" 22 | \n",
" left hand | \n",
" 2,585 | \n",
"
\n",
" 23 | \n",
" prints and | \n",
" 2,262 | \n",
"
\n",
" 24 | \n",
" of exemption | \n",
" 2,052 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Change ngram_count for larger ngrams (trigrams etc)\n",
"ngram_count = 2\n",
"series_details.display_top_ngrams(title_text, ngram_count)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}