{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"series = 'A6119'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import os\n",
"import pandas as pd\n",
"import series_details\n",
"import plotly.offline as py\n",
"py.init_notebook_mode()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Build the path to this series' CSV; any '/' in the series id becomes '-' in the file name\n",
"csv_name = '{}.csv'.format(series.replace('/', '-'))\n",
"csv_path = os.path.join('data', csv_name)\n",
"\n",
"# Load the item records, parsing the two date columns as datetimes\n",
"df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"
National Archives of Australia: Series A6119
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Personal files, alpha-numeric series
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Total items | 6,741 |
---|
Access status | |
---|
Open with exception | 6,314 (93.67%) |
Not yet examined | 363 (5.38%) |
Open | 43 (0.64%) |
Closed | 20 (0.30%) |
Withheld pending agency advice | 1 (0.01%) |
Number of items digitised | 2,320 (34.42%) |
---|
Number of pages digitised | 258,547 |
---|
Date of earliest content | 1852 |
---|
Date of latest content | 2009 |
---|
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"series_details.display_summary(series, df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Content preview"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" identifier | \n",
" series | \n",
" control_symbol | \n",
" title | \n",
" contents_dates | \n",
" start_date | \n",
" end_date | \n",
" access_status | \n",
" location | \n",
" digitised_status | \n",
" digitised_pages | \n",
"
\n",
" \n",
" 0 | \n",
" 240734 | \n",
" A6119 | \n",
" 13 ATTACHMENT | \n",
" BURCHETT, Wilfred Graham - Monitoring reports ex UNC. | \n",
" 1951 - 1953 | \n",
" 1951-01-01 00:00:00 | \n",
" 1953-01-01 00:00:00 | \n",
" Open | \n",
" Canberra | \n",
" True | \n",
" 209 | \n",
"
\n",
" 1 | \n",
" 273730 | \n",
" A6119 | \n",
" 42 | \n",
" THROSSELL, Katharine [Katherine Susannah] nee PRICHARD [PRITCHARD] - Volume 1 | \n",
" 1919 - 1940 | \n",
" 1919-01-01 00:00:00 | \n",
" 1940-01-01 00:00:00 | \n",
" Open with exception | \n",
" Canberra | \n",
" True | \n",
" 127 | \n",
"
\n",
" 2 | \n",
" 276650 | \n",
" A6119 | \n",
" 44 PART 1 | \n",
" THROSSELL, Katharine [Katherine] Susannah nee PRICHARD [PRITCHARD] - Volume 3 Part 1 (folios 1 to 20) | \n",
" 1945 - 1948 | \n",
" 1945-01-01 00:00:00 | \n",
" 1948-01-01 00:00:00 | \n",
" Open with exception | \n",
" Canberra | \n",
" True | \n",
" 27 | \n",
"
\n",
" 3 | \n",
" 279177 | \n",
" A6119 | \n",
" 43 | \n",
" THROSSELL, Katharine [Katherine] Susannah nee PRICHARD [PRITCHARD] - Volume 2 | \n",
" 1941 - 1952 | \n",
" 1941-01-01 00:00:00 | \n",
" 1952-01-01 00:00:00 | \n",
" Open with exception | \n",
" Canberra | \n",
" True | \n",
" 170 | \n",
"
\n",
" 4 | \n",
" 332779 | \n",
" A6119 | \n",
" 260 | \n",
" TERKES Ferdo Formerlu Pijevic Joso | \n",
" 1954 - 1955 | \n",
" 1954-01-01 00:00:00 | \n",
" 1955-01-01 00:00:00 | \n",
" Open with exception | \n",
" Canberra | \n",
" False | \n",
" 0 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Change the number_of_rows value to see more\n",
"number_of_rows = 5\n",
"\n",
"# CSS rules for the rendered table: centre the header cells and hide the row-heading (index) cells\n",
"table_styles = [\n",
"    {'selector': 'th', 'props': [('text-align', 'center')]},\n",
"    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},\n",
"]\n",
"\n",
"# Style the first number_of_rows rows, left-aligning the title column\n",
"preview = df.iloc[:number_of_rows].style\n",
"preview = preview.set_properties(subset=['title'], **{'text-align': 'left'})\n",
"preview.set_table_styles(table_styles)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Plot content dates"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"data": [
{
"name": "Digitised",
"type": "bar",
"x": [
1852,
1853,
1854,
1855,
1856,
1857,
1858,
1859,
1860,
1861,
1862,
1863,
1864,
1865,
1866,
1867,
1868,
1869,
1870,
1871,
1872,
1873,
1874,
1875,
1876,
1877,
1878,
1879,
1880,
1881,
1882,
1883,
1884,
1885,
1886,
1887,
1888,
1889,
1890,
1891,
1892,
1893,
1894,
1895,
1896,
1897,
1898,
1899,
1900,
1901,
1902,
1903,
1904,
1905,
1906,
1907,
1908,
1909,
1910,
1911,
1912,
1913,
1914,
1915,
1916,
1917,
1918,
1919,
1920,
1921,
1922,
1923,
1924,
1925,
1926,
1927,
1928,
1929,
1930,
1931,
1932,
1933,
1934,
1935,
1936,
1937,
1938,
1939,
1940,
1941,
1942,
1943,
1944,
1945,
1946,
1947,
1948,
1949,
1950,
1951,
1952,
1953,
1954,
1955,
1956,
1957,
1958,
1959,
1960,
1961,
1962,
1963,
1964,
1965,
1966,
1967,
1968,
1969,
1970,
1971,
1972,
1973,
1974,
1975,
1976,
1977,
1978,
1979,
1980,
1981,
1982,
1983,
1984,
1985,
1986,
1987,
1988,
1989,
1990,
1991,
1992,
1993
],
"y": [
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
2,
3,
3,
4,
5,
5,
8,
9,
78,
80,
81,
85,
92,
94,
103,
117,
122,
126,
132,
145,
152,
175,
213,
233,
238,
251,
259,
269,
281,
307,
398,
475,
524,
549,
598,
630,
675,
688,
653,
670,
679,
692,
694,
676,
672,
586,
597,
588,
585,
574,
576,
603,
569,
581,
565,
470,
421,
386,
340,
317,
303,
267,
227,
199,
171,
139,
85,
72,
49,
41,
25,
21,
5,
6,
1,
1
]
},
{
"name": "Not digitised",
"type": "bar",
"x": [
1923,
1924,
1925,
1926,
1927,
1928,
1929,
1930,
1931,
1932,
1933,
1934,
1935,
1936,
1937,
1938,
1939,
1940,
1941,
1942,
1943,
1944,
1945,
1946,
1947,
1948,
1949,
1950,
1951,
1952,
1953,
1954,
1955,
1956,
1957,
1958,
1959,
1960,
1961,
1962,
1963,
1964,
1965,
1966,
1967,
1968,
1969,
1970,
1971,
1972,
1973,
1974,
1975,
1976,
1977,
1978,
1979,
1980,
1981,
1982,
1983,
1984,
1985,
1986,
1987,
1988,
1989,
1990,
1991,
1992,
1993,
1994,
1995,
1996,
1997,
1998,
1999,
2000,
2001,
2002,
2003,
2004,
2005,
2006,
2007,
2008,
2009
],
"y": [
1,
5,
6,
104,
109,
112,
118,
124,
124,
130,
144,
152,
160,
174,
193,
205,
237,
308,
343,
372,
406,
433,
451,
477,
526,
668,
838,
916,
982,
1062,
1142,
1237,
1245,
1243,
1271,
1266,
1265,
1247,
1231,
1190,
1034,
988,
921,
839,
851,
836,
874,
889,
833,
774,
608,
515,
458,
414,
364,
327,
298,
257,
227,
219,
181,
125,
104,
76,
64,
52,
36,
22,
13,
4,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
1,
1,
1,
1,
1
]
}
],
"layout": {
"barmode": "stack",
"title": "Content dates",
"xaxis": {
"title": "Year"
},
"yaxis": {
"title": "Number of items"
}
}
},
"text/html": [
""
],
"text/vnd.plotly.v1+html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"fig = series_details.plot_dates(df)\n",
"py.iplot(fig, filename='series-dates-bar')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## View word frequencies"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Combine all of the file titles into a single lower-cased string, separated by spaces.\n",
"# (The stray chained assignment to an unused variable `a` has been removed.)\n",
"title_text = df['title'].str.lower().str.cat(sep=' ')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" word | \n",
" count | \n",
"
\n",
" \n",
" 14 | \n",
" volume | \n",
" 4,768 | \n",
"
\n",
" 15 | \n",
" 1 | \n",
" 1,590 | \n",
"
\n",
" 20 | \n",
" 2 | \n",
" 937 | \n",
"
\n",
" 49 | \n",
" aka | \n",
" 752 | \n",
"
\n",
" 32 | \n",
" john | \n",
" 729 | \n",
"
\n",
" 16 | \n",
" 3 | \n",
" 532 | \n",
"
\n",
" 112 | \n",
" 4 | \n",
" 376 | \n",
"
\n",
" 259 | \n",
" papers | \n",
" 350 | \n",
"
\n",
" 168 | \n",
" william | \n",
" 343 | \n",
"
\n",
" 258 | \n",
" miscellaneous | \n",
" 327 | \n",
"
\n",
" 493 | \n",
" 5 | \n",
" 278 | \n",
"
\n",
" 137 | \n",
" james | \n",
" 277 | \n",
"
\n",
" 30 | \n",
" george | \n",
" 275 | \n",
"
\n",
" 11 | \n",
" nee | \n",
" 228 | \n",
"
\n",
" 705 | \n",
" aarons | \n",
" 219 | \n",
"
\n",
" 488 | \n",
" 6 | \n",
" 214 | \n",
"
\n",
" 233 | \n",
" robert | \n",
" 206 | \n",
"
\n",
" 129 | \n",
" francis | \n",
" 182 | \n",
"
\n",
" 236 | \n",
" alexander | \n",
" 170 | \n",
"
\n",
" 487 | \n",
" 7 | \n",
" 164 | \n",
"
\n",
" 288 | \n",
" joseph | \n",
" 159 | \n",
"
\n",
" 171 | \n",
" david | \n",
" 158 | \n",
"
\n",
" 385 | \n",
" michael | \n",
" 154 | \n",
"
\n",
" 130 | \n",
" edward | \n",
" 151 | \n",
"
\n",
" 152 | \n",
" charles | \n",
" 139 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"series_details.display_word_counts(title_text)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" \n",
" \n",
" | \n",
" ngram | \n",
" count | \n",
"
\n",
" \n",
" 0 | \n",
" volume 1 | \n",
" 1,483 | \n",
"
\n",
" 1 | \n",
" volume 2 | \n",
" 848 | \n",
"
\n",
" 2 | \n",
" volume 3 | \n",
" 496 | \n",
"
\n",
" 3 | \n",
" volume 4 | \n",
" 354 | \n",
"
\n",
" 4 | \n",
" miscellaneous papers | \n",
" 320 | \n",
"
\n",
" 5 | \n",
" volume 5 | \n",
" 265 | \n",
"
\n",
" 6 | \n",
" john volume | \n",
" 236 | \n",
"
\n",
" 7 | \n",
" volume 6 | \n",
" 200 | \n",
"
\n",
" 8 | \n",
" volume 7 | \n",
" 157 | \n",
"
\n",
" 9 | \n",
" volume 8 | \n",
" 122 | \n",
"
\n",
" 10 | \n",
" volume 9 | \n",
" 102 | \n",
"
\n",
" 11 | \n",
" asio file | \n",
" 96 | \n",
"
\n",
" 12 | \n",
" laurence volume | \n",
" 90 | \n",
"
\n",
" 13 | \n",
" francis volume | \n",
" 89 | \n",
"
\n",
" 14 | \n",
" volume 10 | \n",
" 88 | \n",
"
\n",
" 15 | \n",
" aarons laurence | \n",
" 87 | \n",
"
\n",
" 16 | \n",
" george volume | \n",
" 79 | \n",
"
\n",
" 17 | \n",
" albert volume | \n",
" 77 | \n",
"
\n",
" 18 | \n",
" william volume | \n",
" 76 | \n",
"
\n",
" 19 | \n",
" volume 11 | \n",
" 72 | \n",
"
\n",
" 20 | \n",
" volume 12 | \n",
" 65 | \n",
"
\n",
" 21 | \n",
" david volume | \n",
" 62 | \n",
"
\n",
" 22 | \n",
" james volume | \n",
" 61 | \n",
"
\n",
" 23 | \n",
" robert volume | \n",
" 57 | \n",
"
\n",
" 24 | \n",
" wilton john | \n",
" 55 | \n",
"
\n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Change ngram_count for larger ngrams (trigrams etc)\n",
"ngram_count = 2\n",
"series_details.display_top_ngrams(title_text, ngram_count)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}