{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Get details of indexes\n", "\n", "This notebook scrapes details of available indexes from the [NSW State Archives A to Z list of online indexes](https://www.records.nsw.gov.au/archives/collections-and-research/guides-and-indexes/indexes-a-z). It saves the results as a CSV formatted file.\n", "\n", "Once you've harvested the index details, you can use them to [harvest the content](harvest-indexes.ipynb) of all the individual indexes.\n", "\n", "Here's the [indexes.csv](indexes.csv) I harvested in July 2019.\n", "\n", "The fields in the CSV file are:\n", "\n", "* `id` – numeric index identifier\n", "* `title` – index title (this is taken from the index search page, some indexes have different titles in the category listings)\n", "* `url` – a search url that returns all the results in the index\n", "* `status` – Not digitised / Fully digitised\n", "* `more_info_url` – link with more information about the index" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import what we need" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "import re\n", "from urllib.parse import urljoin\n", "from tqdm import tqdm_notebook\n", "import pandas as pd\n", "from requests.adapters import HTTPAdapter\n", "from requests.packages.urllib3.util.retry import Retry\n", "\n", "s = requests.Session()\n", "retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])\n", "s.mount('http://', HTTPAdapter(max_retries=retries))\n", "s.mount('https://', HTTPAdapter(max_retries=retries))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Define our functions" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "def process_category(category):\n", " '''\n", " Process a category page, scraping the links to indexes & extracting basic data.\n", " \n", " Parameters:\n", " category - the index category to process\n", " \n", " Returns:\n", " A list of indexes.\n", " '''\n", " indexes = []\n", " \n", " # Construct a url to the category\n", " url = 'https://www.records.nsw.gov.au' + category\n", " \n", " # Get the category page and soupify\n", " response = s.get(url)\n", " soup = BeautifulSoup(response.text)\n", " \n", " # Find the rows containing index info and loop through them\n", " for div in soup.find_all('div', class_='container'):\n", " \n", " # Get the index search link\n", " index = div.find('a', class_='form-submit')\n", " \n", " # Try to extract the numeric index id from the link\n", " try:\n", " index_id = re.search('\\?id=(\\d+)', index['href']).group(1)\n", " except (AttributeError, TypeError):\n", " pass\n", " else:\n", " # If we find an id, then grab some other data\n", " # Get the digitisation status\n", " status = div.find('a', href=re.compile(r'record-status')).string\n", " \n", " # Get the link to more information\n", " more_info = div.find('a', string=re.compile(r'More about the Index'))\n", " \n", " # If there's no more info link, just use the category page\n", " if more_info is None:\n", " more_info_url = url\n", " \n", " # If there is a more info link, turn it into a url\n", " else:\n", " more_info_url = urljoin('https://www.records.nsw.gov.au', more_info['href'])\n", " \n", " # Add this index to the list\n", " indexes.append({'id': index_id, 'status': status, 'more_info_url': more_info_url})\n", " return indexes\n", "\n", "def get_indexes():\n", " '''\n", 
" Process each of the categories on the A-Z page, scraping the links & extracting the index data.\n", " \n", " Returns:\n", " A list of indexes.\n", " '''\n", " indexes = []\n", " \n", " # Some indexes appear in more than one category, so we'll keep track of what we've seen.\n", " seen = []\n", " \n", " # Get the A-Z page & turn it into soup\n", " response = s.get('https://www.records.nsw.gov.au/archives/collections-and-research/guides-and-indexes/indexes-a-z')\n", " soup = BeautifulSoup(response.text)\n", " \n", " # Get all the links that go to an index category\n", " links = soup.find_all('a', href=re.compile('/archives/collections-and-research/guides-and-indexes/[a-z\\-]+/indexes'))\n", " \n", " # Loop through the links\n", " for link in tqdm_notebook(links, desc='Links:'):\n", " \n", " # If we haven't seen this url before, we'll add it to the seen list\n", " if link['href'] not in seen:\n", " seen.append(link['href'])\n", " \n", " # Get all the indexes from the category link\n", " indexes += process_category(link['href'])\n", " \n", " # Make sure we have no duplicates\n", " indexes = [i for n, i in enumerate(indexes) if i not in indexes[n + 1:]]\n", " return indexes\n", "\n", "\n", "def make_index_list():\n", " '''\n", " Get the title and search url for each index.\n", " \n", " Returns:\n", " A list of all indexes with the following columns\n", " - id\n", " - title\n", " - url (search url)\n", " - status (is it digitised?)\n", " - more_info_url (link to more info)\n", " ''' \n", " # Get all the indexes from the A-Z & category pages\n", " indexes = get_indexes()\n", " \n", " # Loop through the indexes)\n", " for index in tqdm_notebook(indexes, desc='Indexes:'):\n", " \n", " # What we're doing here is trying to formulate the url we'll need to harvest all the data from an index\n", " # First we get the index page (which includes the search form)\n", " response = s.get('https://www.records.nsw.gov.au/search_form', params={'id': index['id']})\n", " soup = BeautifulSoup(response.text)\n", " \n", " # Get the title of the index\n", " index['title'] = soup.find('h1').string\n", " \n", " # Find the search form\n", " form = soup.find(id='records-online-index-search-form')\n", " \n", " # Get all the input fields from the form\n", " inputs = form.find_all('input')\n", " \n", " # This is the payload that we'll save the form parameters to\n", " data = {}\n", " \n", " # Loop through the input fields\n", " for i, field in enumerate(inputs):\n", " \n", " # To get all the records in an index, we search fpor '%'\n", " # If this is the first field, set its value to %\n", " if i == 0:\n", " data[field['name']] = '%'\n", " \n", " # Otherwise just keep default values\n", " else:\n", " data[field['name']] = field['value']\n", " \n", " # Submit the form data\n", " form_response = s.post('https://www.records.nsw.gov.au/search_form', params={'id': index['id']}, data=data)\n", " \n", " # Save the form submission url\n", " index['url'] = form_response.url\n", " return indexes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Harvest the index details" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "# Harvest index details\n", "indexes = make_index_list()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Convert to a dataframe and save as a CSV" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | id | \n", "more_info_url | \n", "status | \n", "title | \n", "url | \n", "
|---|---|---|---|---|---|
| 0 | \n", "47 | \n", "https://www.records.nsw.gov.au/archives/collec... | \n", "Not digitised | \n", "Index on Occupants on Aboriginal Reserves, 187... | \n", "https://www.records.nsw.gov.au/searchhits_noco... | \n", "
| 1 | \n", "91 | \n", "https://www.records.nsw.gov.au/archives/collec... | \n", "Not digitised | \n", "Botanic Gardens and Government Domains Employe... | \n", "https://www.records.nsw.gov.au/searchhits_noco... | \n", "
| 2 | \n", "9 | \n", "https://www.records.nsw.gov.au/archives/collec... | \n", "Fully digitised | \n", "Assisted Immigrants | \n", "https://www.records.nsw.gov.au/searchhits_noco... | \n", "
| 3 | \n", "55 | \n", "https://www.records.nsw.gov.au/archives/collec... | \n", "Not digitised | \n", "Index to Miscellaneous Immigrants | \n", "https://www.records.nsw.gov.au/searchhits_noco... | \n", "
| 4 | \n", "43 | \n", "https://www.records.nsw.gov.au/archives/collec... | \n", "Not digitised | \n", "Index to the Unassisted Arrivals NSW 1842-1855 | \n", "https://www.records.nsw.gov.au/searchhits_noco... | \n", "