{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"series = 'B13'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import os\n",
"import pandas as pd\n",
"import series_details\n",
"import plotly.offline as py\n",
"py.init_notebook_mode()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(os.path.join('data', '{}.csv'.format(series.replace('/', '-'))), parse_dates=['start_date', 'end_date'])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"
National Archives of Australia: Series B13
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"General and classified correspondence, annual single number series
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Total items | 20,194 |
---|
Access status | |
---|
Open | 19,786 (97.98%) |
Not yet examined | 400 (1.98%) |
Open with exception | 8 (0.04%) |
Number of items digitised | 354 (1.75%) |
---|
Number of pages digitised | 5,043 |
---|
Date of earliest content | 1800 |
---|
Date of latest content | 2005 |
---|
Download the complete CSV file
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"series_details.display_summary(series, df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Content preview"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" identifier | \n",
" series | \n",
" control_symbol | \n",
" title | \n",
" contents_dates | \n",
" start_date | \n",
" end_date | \n",
" access_status | \n",
" location | \n",
" digitised_status | \n",
" digitised_pages | \n",
"
\n",
" \n",
" 0 | \n",
" 787258 | \n",
" B13 | \n",
" 1924/7516 | \n",
" Charlie Lam Sun (Charlie Shack Mayberry) - Arrived Sydney per \"Taiyuan\" 15.3.1924 | \n",
" 1924 - circa1924 | \n",
" 1924-01-01 00:00:00 | \n",
" NaT | \n",
" Open | \n",
" Melbourne | \n",
" False | \n",
" 0 | \n",
"
\n",
" 1 | \n",
" 790335 | \n",
" B13 | \n",
" 1926/6755 | \n",
" Edward Traynor - permission to enter Australia - arrived per \"Beltana\" 13.5.1926 | \n",
" 1926 - 1926 | \n",
" 1926-01-01 00:00:00 | \n",
" 1926-01-01 00:00:00 | \n",
" Open | \n",
" Melbourne | \n",
" False | \n",
" 0 | \n",
"
\n",
" 2 | \n",
" 3280504 | \n",
" B13 | \n",
" V1960/14261 | \n",
" Tabacco sales in Victoria [1.00 cms] | \n",
" 1960 - 1962 | \n",
" 1960-01-01 00:00:00 | \n",
" 1962-01-01 00:00:00 | \n",
" Open | \n",
" Melbourne | \n",
" False | \n",
" 0 | \n",
"
\n",
" 3 | \n",
" 3280538 | \n",
" B13 | \n",
" V1979/4475 | \n",
" James Richardson Co Pty Ltd, Licensed Warehouse, Richardsons Bond [Contains plans of Richardssons Bons] [4.00 cms] | \n",
" 1963 - 1984 | \n",
" 1963-01-01 00:00:00 | \n",
" 1984-01-01 00:00:00 | \n",
" Open with exception | \n",
" Melbourne | \n",
" False | \n",
" 0 | \n",
"
\n",
" 4 | \n",
" 3283801 | \n",
" B13 | \n",
" V1953/12491 | \n",
" Tobacco & Cigarettes - Duty Free issues to Ships Crews [2cm] | \n",
" 1945 - 1957 | \n",
" 1945-01-01 00:00:00 | \n",
" 1957-01-01 00:00:00 | \n",
" Open | \n",
" Melbourne | \n",
" False | \n",
" 0 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Change the number_of_rows value to see more\n",
"number_of_rows = 5\n",
"\n",
"# Display dataframe \n",
"df[:number_of_rows].style.set_properties(['title'], **{'text-align': 'left'}).set_table_styles([dict(selector=\"th\", props=[(\"text-align\", \"center\")]),\n",
" dict(selector='.row_heading, .blank', props=[('display', 'none')])])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Plot content dates"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"data": [
{
"name": "Digitised",
"type": "bar",
"x": [
1907,
1908,
1909,
1910,
1911,
1912,
1913,
1914,
1915,
1916,
1917,
1918,
1919,
1920,
1921,
1922,
1923,
1924,
1925,
1926,
1927,
1928,
1929,
1930,
1931,
1932,
1933,
1934,
1935,
1936,
1937,
1938,
1939,
1940,
1941,
1942,
1943,
1944,
1945,
1946,
1947,
1948,
1949,
1950,
1951,
1952,
1953,
1954,
1955,
1956,
1957,
1958,
1959,
1960,
1961,
1962,
1963,
1964,
1965,
1966,
1967,
1968,
1969,
1970,
1971,
1972
],
"y": [
1,
1,
1,
3,
2,
7,
6,
6,
12,
9,
7,
15,
11,
12,
11,
29,
19,
21,
25,
28,
30,
26,
31,
22,
31,
19,
21,
24,
30,
19,
17,
4,
5,
9,
2,
2,
2,
2,
3,
2,
1,
1,
1,
1,
1,
1,
1,
1,
2,
3,
3,
3,
3,
3,
3,
3,
3,
3,
1,
1,
1,
1,
1,
1,
1,
1
]
},
{
"name": "Not digitised",
"type": "bar",
"x": [
1800,
1898,
1899,
1900,
1901,
1902,
1903,
1904,
1905,
1906,
1907,
1908,
1909,
1910,
1911,
1912,
1913,
1914,
1915,
1916,
1917,
1918,
1919,
1920,
1921,
1922,
1923,
1924,
1925,
1926,
1927,
1928,
1929,
1930,
1931,
1932,
1933,
1934,
1935,
1936,
1937,
1938,
1939,
1940,
1941,
1942,
1943,
1944,
1945,
1946,
1947,
1948,
1949,
1950,
1951,
1952,
1953,
1954,
1955,
1956,
1957,
1958,
1959,
1960,
1961,
1962,
1963,
1964,
1965,
1966,
1967,
1968,
1969,
1970,
1971,
1972,
1973,
1974,
1975,
1976,
1977,
1978,
1979,
1980,
1981,
1982,
1983,
1984,
1985,
1986,
1987,
1988,
1989,
1990,
1991,
1992,
1993,
1994,
1995,
1996,
1997,
1998,
1999,
2000,
2001,
2002,
2003,
2004,
2005
],
"y": [
1,
2,
2,
3,
4,
34,
20,
12,
13,
16,
16,
18,
32,
44,
48,
106,
100,
90,
118,
82,
79,
73,
102,
124,
137,
748,
901,
1023,
1129,
1470,
1401,
1302,
1182,
1106,
967,
829,
918,
925,
943,
991,
1059,
151,
976,
1563,
272,
209,
160,
172,
172,
249,
116,
120,
133,
145,
156,
165,
179,
193,
198,
218,
235,
248,
263,
267,
270,
275,
277,
277,
284,
285,
288,
287,
291,
295,
295,
279,
275,
255,
244,
225,
205,
187,
172,
171,
151,
134,
122,
116,
92,
76,
62,
53,
41,
33,
23,
20,
22,
19,
24,
14,
10,
7,
6,
4,
3,
2,
1,
1,
1
]
}
],
"layout": {
"barmode": "stack",
"title": "Content dates",
"xaxis": {
"title": "Year"
},
"yaxis": {
"title": "Number of items"
}
}
},
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"fig = series_details.plot_dates(df)\n",
"py.iplot(fig, filename='series-dates-bar')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## View word frequencies"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Combine all of the file titles into a single string\n",
"title_text = a = df['title'].str.lower().str.cat(sep=' ')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" word | \n",
" count | \n",
"
\n",
" \n",
" 7 | \n",
" per | \n",
" 5,172 | \n",
"
\n",
" 314 | \n",
" ex | \n",
" 4,363 | \n",
"
\n",
" 902 | \n",
" exemption | \n",
" 3,688 | \n",
"
\n",
" 618 | \n",
" certificate | \n",
" 3,577 | \n",
"
\n",
" 1655 | \n",
" dictation | \n",
" 3,577 | \n",
"
\n",
" 1581 | \n",
" test | \n",
" 3,553 | \n",
"
\n",
" 71 | \n",
" melbourne | \n",
" 3,168 | \n",
"
\n",
" 538 | \n",
" application | \n",
" 2,442 | \n",
"
\n",
" 174 | \n",
" departure | \n",
" 2,006 | \n",
"
\n",
" 14 | \n",
" australia | \n",
" 1,977 | \n",
"
\n",
" 1543 | \n",
" ah | \n",
" 1,796 | \n",
"
\n",
" 949 | \n",
" passengers | \n",
" 1,620 | \n",
"
\n",
" 173 | \n",
" arrival | \n",
" 1,560 | \n",
"
\n",
" 26 | \n",
" ltd | \n",
" 1,446 | \n",
"
\n",
" 104 | \n",
" act | \n",
" 1,180 | \n",
"
\n",
" 1482 | \n",
" mrs | \n",
" 1,175 | \n",
"
\n",
" 6 | \n",
" sydney | \n",
" 1,075 | \n",
"
\n",
" 1861 | \n",
" s.s | \n",
" 1,074 | \n",
"
\n",
" 12 | \n",
" permission | \n",
" 1,050 | \n",
"
\n",
" 830 | \n",
" crew | \n",
" 1,015 | \n",
"
\n",
" 25 | \n",
" pty | \n",
" 950 | \n",
"
\n",
" 1621 | \n",
" applied | \n",
" 927 | \n",
"
\n",
" 1583 | \n",
" chinese | \n",
" 862 | \n",
"
\n",
" 2049 | \n",
" enemy | \n",
" 858 | \n",
"
\n",
" 24 | \n",
" co | \n",
" 835 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"series_details.display_word_counts(title_text)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" ngram | \n",
" count | \n",
"
\n",
" \n",
" 0 | \n",
" exemption from | \n",
" 3,550 | \n",
"
\n",
" 1 | \n",
" from dictation | \n",
" 3,540 | \n",
"
\n",
" 2 | \n",
" dictation test | \n",
" 3,533 | \n",
"
\n",
" 3 | \n",
" for exemption | \n",
" 3,004 | \n",
"
\n",
" 4 | \n",
" certificate for | \n",
" 2,872 | \n",
"
\n",
" 5 | \n",
" for certificate | \n",
" 2,660 | \n",
"
\n",
" 6 | \n",
" application for | \n",
" 2,236 | \n",
"
\n",
" 7 | \n",
" melbourne per | \n",
" 1,054 | \n",
"
\n",
" 8 | \n",
" departure per | \n",
" 1,009 | \n",
"
\n",
" 9 | \n",
" pty ltd | \n",
" 927 | \n",
"
\n",
" 10 | \n",
" applied for | \n",
" 923 | \n",
"
\n",
" 11 | \n",
" to australia | \n",
" 859 | \n",
"
\n",
" 12 | \n",
" trading with | \n",
" 782 | \n",
"
\n",
" 13 | \n",
" enemy act | \n",
" 765 | \n",
"
\n",
" 14 | \n",
" with enemy | \n",
" 764 | \n",
"
\n",
" 15 | \n",
" test ah | \n",
" 735 | \n",
"
\n",
" 16 | \n",
" permission to | \n",
" 698 | \n",
"
\n",
" 17 | \n",
" act 1939 | \n",
" 691 | \n",
"
\n",
" 18 | \n",
" of exemption | \n",
" 636 | \n",
"
\n",
" 19 | \n",
" certificate of | \n",
" 625 | \n",
"
\n",
" 20 | \n",
" crew member | \n",
" 623 | \n",
"
\n",
" 21 | \n",
" arrival per | \n",
" 458 | \n",
"
\n",
" 22 | \n",
" to enter | \n",
" 437 | \n",
"
\n",
" 23 | \n",
" of certificate | \n",
" 419 | \n",
"
\n",
" 24 | \n",
" passengers melbourne | \n",
" 390 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Change ngram_count for larger ngrams (trigrams etc)\n",
"ngram_count = 2\n",
"series_details.display_top_ngrams(title_text, ngram_count)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}