{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"series = 'A6122'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import os\n",
"import pandas as pd\n",
"import series_details\n",
"import plotly.offline as py\n",
"py.init_notebook_mode()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Slashes in the series identifier are swapped for hyphens to form the file name\n",
"csv_name = '{}.csv'.format(series.replace('/', '-'))\n",
"csv_path = os.path.join('data', csv_name)\n",
"\n",
"# Parse the start and end date columns as datetimes while loading\n",
"df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"
National Archives of Australia: Series A6122
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Subject files, multiple number series
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Total items | 2,819 |
---|
Access status | |
---|
Open with exception | 2,376 (84.29%) |
Open | 162 (5.75%) |
Closed | 138 (4.90%) |
Not yet examined | 137 (4.86%) |
Withheld pending agency advice | 6 (0.21%) |
Number of items digitised | 565 (20.04%) |
---|
Number of pages digitised | 69,007 |
---|
Date of earliest content | 1800 |
---|
Date of latest content | 1993 |
---|
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"series_details.display_summary(series, df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Content preview"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" identifier | \n",
" series | \n",
" control_symbol | \n",
" title | \n",
" contents_dates | \n",
" start_date | \n",
" end_date | \n",
" access_status | \n",
" location | \n",
" digitised_status | \n",
" digitised_pages | \n",
"
\n",
" \n",
" 0 | \n",
" 217109 | \n",
" A6122 | \n",
" 124 | \n",
" Comintern [Communist International] [includes papers on The League against Cruelties and Oppression in the Colonies] File ends 1966. | \n",
" 1927 - 1966 | \n",
" 1927-01-01 00:00:00 | \n",
" 1966-01-01 00:00:00 | \n",
" Open with exception | \n",
" Canberra | \n",
" False | \n",
" 0 | \n",
"
\n",
" 1 | \n",
" 217110 | \n",
" A6122 | \n",
" 125 | \n",
" Left Book Club | \n",
" 1938 - 1941 | \n",
" 1938-01-01 00:00:00 | \n",
" 1941-01-01 00:00:00 | \n",
" Open | \n",
" Canberra | \n",
" False | \n",
" 0 | \n",
"
\n",
" 2 | \n",
" 217111 | \n",
" A6122 | \n",
" 126 | \n",
" Industrial Workers of the World Volume 1 Part 1 | \n",
" 1916 - 1948 | \n",
" 1916-01-01 00:00:00 | \n",
" 1948-01-01 00:00:00 | \n",
" Open with exception | \n",
" Canberra | \n",
" True | \n",
" 253 | \n",
"
\n",
" 3 | \n",
" 217113 | \n",
" A6122 | \n",
" 127 | \n",
" Textile Workers Militant Committee | \n",
" 1944 - 1944 | \n",
" 1944-01-01 00:00:00 | \n",
" 1944-01-01 00:00:00 | \n",
" Open | \n",
" Canberra | \n",
" False | \n",
" 0 | \n",
"
\n",
" 4 | \n",
" 217114 | \n",
" A6122 | \n",
" 128 | \n",
" Irish Republican Army | \n",
" 1940 - 1944 | \n",
" 1940-01-01 00:00:00 | \n",
" 1944-01-01 00:00:00 | \n",
" Open | \n",
" Canberra | \n",
" True | \n",
" 63 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Change the number_of_rows value to see more\n",
"number_of_rows = 5\n",
"\n",
"# Table-wide CSS rules: centre the header cells and hide the\n",
"# row-heading / blank cells\n",
"table_styles = [\n",
"    dict(selector=\"th\", props=[(\"text-align\", \"center\")]),\n",
"    dict(selector='.row_heading, .blank', props=[('display', 'none')]),\n",
"]\n",
"\n",
"# Left-align the title column, then apply the table-wide rules;\n",
"# the styled table is the last expression so the cell displays it\n",
"preview = df[:number_of_rows].style.set_properties(['title'], **{'text-align': 'left'})\n",
"preview.set_table_styles(table_styles)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Plot content dates"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"data": [
{
"name": "Digitised",
"type": "bar",
"x": [
1800,
1916,
1917,
1918,
1919,
1920,
1921,
1922,
1923,
1924,
1925,
1926,
1927,
1928,
1929,
1930,
1931,
1932,
1933,
1934,
1935,
1936,
1937,
1938,
1939,
1940,
1941,
1942,
1943,
1944,
1945,
1946,
1947,
1948,
1949,
1950,
1951,
1952,
1953,
1954,
1955,
1956,
1957,
1958,
1959,
1960,
1961,
1962,
1963,
1964,
1965,
1966,
1967,
1968,
1969,
1970,
1971,
1972,
1973,
1974,
1975,
1976,
1977,
1978,
1979,
1980,
1981,
1982,
1983,
1984,
1985,
1986,
1987,
1988,
1989
],
"y": [
1,
2,
2,
5,
6,
5,
5,
6,
7,
5,
5,
5,
6,
8,
8,
8,
10,
11,
11,
12,
25,
27,
28,
31,
39,
43,
51,
53,
63,
66,
74,
82,
85,
85,
102,
98,
107,
114,
131,
125,
111,
118,
111,
126,
105,
104,
108,
95,
87,
83,
84,
88,
82,
69,
77,
77,
77,
85,
64,
48,
38,
44,
42,
48,
40,
33,
28,
22,
12,
10,
9,
6,
4,
4,
4
]
},
{
"name": "Not digitised",
"type": "bar",
"x": [
1800,
1901,
1902,
1903,
1904,
1905,
1906,
1907,
1908,
1909,
1910,
1911,
1912,
1913,
1914,
1915,
1916,
1917,
1918,
1919,
1920,
1921,
1922,
1923,
1924,
1925,
1926,
1927,
1928,
1929,
1930,
1931,
1932,
1933,
1934,
1935,
1936,
1937,
1938,
1939,
1940,
1941,
1942,
1943,
1944,
1945,
1946,
1947,
1948,
1949,
1950,
1951,
1952,
1953,
1954,
1955,
1956,
1957,
1958,
1959,
1960,
1961,
1962,
1963,
1964,
1965,
1966,
1967,
1968,
1969,
1970,
1971,
1972,
1973,
1974,
1975,
1976,
1977,
1978,
1979,
1980,
1981,
1982,
1983,
1984,
1985,
1986,
1987,
1988,
1989,
1990,
1991,
1992,
1993
],
"y": [
7,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
2,
10,
10,
10,
10,
11,
11,
11,
12,
11,
13,
16,
17,
17,
19,
21,
23,
25,
45,
50,
53,
55,
76,
102,
118,
127,
159,
176,
181,
201,
218,
240,
422,
415,
493,
607,
724,
740,
677,
482,
404,
390,
389,
359,
362,
338,
296,
251,
242,
181,
183,
179,
173,
172,
142,
111,
98,
86,
92,
94,
72,
53,
43,
42,
36,
33,
33,
23,
20,
15,
15,
9,
9,
7,
5,
3,
3
]
}
],
"layout": {
"barmode": "stack",
"title": "Content dates",
"xaxis": {
"title": "Year"
},
"yaxis": {
"title": "Number of items"
}
}
},
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Build the content dates chart, then render it in the notebook\n",
"dates_fig = series_details.plot_dates(df)\n",
"py.iplot(dates_fig, filename='series-dates-bar')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## View word frequencies"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Combine all of the file titles into a single lower-cased string,\n",
"# with a space between consecutive titles\n",
"title_text = df['title'].str.lower().str.cat(sep=' ')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" word | \n",
" count | \n",
"
\n",
" \n",
" 42 | \n",
" australia | \n",
" 1,339 | \n",
"
\n",
" 18 | \n",
" volume | \n",
" 1,277 | \n",
"
\n",
" 45 | \n",
" party | \n",
" 1,078 | \n",
"
\n",
" 1 | \n",
" communist | \n",
" 1,061 | \n",
"
\n",
" 19 | \n",
" 1 | \n",
" 456 | \n",
"
\n",
" 44 | \n",
" cpa | \n",
" 418 | \n",
"
\n",
" 102 | \n",
" australian | \n",
" 359 | \n",
"
\n",
" 279 | \n",
" branch | \n",
" 334 | \n",
"
\n",
" 36 | \n",
" south | \n",
" 308 | \n",
"
\n",
" 64 | \n",
" 2 | \n",
" 278 | \n",
"
\n",
" 35 | \n",
" new | \n",
" 275 | \n",
"
\n",
" 226 | \n",
" nsw | \n",
" 231 | \n",
"
\n",
" 204 | \n",
" interest | \n",
" 222 | \n",
"
\n",
" 202 | \n",
" victoria | \n",
" 195 | \n",
"
\n",
" 37 | \n",
" wales | \n",
" 182 | \n",
"
\n",
" 46 | \n",
" queensland | \n",
" 167 | \n",
"
\n",
" 193 | \n",
" 3 | \n",
" 155 | \n",
"
\n",
" 114 | \n",
" asio | \n",
" 153 | \n",
"
\n",
" 317 | \n",
" associations | \n",
" 146 | \n",
"
\n",
" 268 | \n",
" cp | \n",
" 131 | \n",
"
\n",
" 339 | \n",
" 4 | \n",
" 117 | \n",
"
\n",
" 34 | \n",
" council | \n",
" 114 | \n",
"
\n",
" 39 | \n",
" union | \n",
" 107 | \n",
"
\n",
" 47 | \n",
" association | \n",
" 106 | \n",
"
\n",
" 43 | \n",
" general | \n",
" 104 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"series_details.display_word_counts(title_text)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" ngram | \n",
" count | \n",
"
\n",
" \n",
" 0 | \n",
" communist party | \n",
" 1,019 | \n",
"
\n",
" 1 | \n",
" of australia | \n",
" 1,016 | \n",
"
\n",
" 2 | \n",
" party of | \n",
" 973 | \n",
"
\n",
" 3 | \n",
" volume 1 | \n",
" 398 | \n",
"
\n",
" 4 | \n",
" cpa communist | \n",
" 250 | \n",
"
\n",
" 5 | \n",
" volume 2 | \n",
" 244 | \n",
"
\n",
" 6 | \n",
" interest in | \n",
" 211 | \n",
"
\n",
" 7 | \n",
" new south | \n",
" 182 | \n",
"
\n",
" 8 | \n",
" south wales | \n",
" 182 | \n",
"
\n",
" 9 | \n",
" of a | \n",
" 160 | \n",
"
\n",
" 10 | \n",
" a communist | \n",
" 145 | \n",
"
\n",
" 11 | \n",
" volume 3 | \n",
" 141 | \n",
"
\n",
" 12 | \n",
" cp of | \n",
" 131 | \n",
"
\n",
" 13 | \n",
" branch communist | \n",
" 118 | \n",
"
\n",
" 14 | \n",
" volume 4 | \n",
" 109 | \n",
"
\n",
" 15 | \n",
" associations individual | \n",
" 95 | \n",
"
\n",
" 16 | \n",
" australia interest | \n",
" 88 | \n",
"
\n",
" 17 | \n",
" south australia | \n",
" 87 | \n",
"
\n",
" 18 | \n",
" australia new | \n",
" 83 | \n",
"
\n",
" 19 | \n",
" australia queensland | \n",
" 79 | \n",
"
\n",
" 20 | \n",
" australia nsw | \n",
" 77 | \n",
"
\n",
" 21 | \n",
" of the | \n",
" 76 | \n",
"
\n",
" 22 | \n",
" western australia | \n",
" 76 | \n",
"
\n",
" 23 | \n",
" australia victoria | \n",
" 72 | \n",
"
\n",
" 24 | \n",
" nsw volume | \n",
" 71 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Change ngram_count for larger ngrams (trigrams etc)\n",
"ngram_count = 2\n",
"series_details.display_top_ngrams(title_text, ngram_count)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}