{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Harvest indexes\n",
    "\n",
    "This notebook harvests data from all of [NSW State Archives online indexes](https://www.records.nsw.gov.au/archives/collections-and-research/guides-and-indexes/indexes-a-z), saving the data as a collection of easily downloadable CSV files.\n",
    "\n",
    "Before you harvest the data, you need to [get the details of all the indexes](get-list-of-indexes.ipynb).\n",
    "\n",
    "If you just want the data, my latest harvest of the indexes is [available from this repository](https://github.com/wragge/srnsw-indexes).\n",
    "\n",
    "If you'd like to explore the harvested data, try the [Index Explorer](/apps/index-explorer.ipynb)!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import what we need"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import re\n",
    "from tqdm import tqdm_notebook\n",
    "import pandas as pd\n",
    "from slugify import slugify\n",
    "from urllib.parse import urlparse, parse_qs, urljoin\n",
    "import time\n",
    "import string\n",
    "import os\n",
    "from requests.adapters import HTTPAdapter\n",
    "from requests.packages.urllib3.util.retry import Retry\n",
    "\n",
    "# By using requests_cache we don't have to redownload pages if a harvest fails part way through\n",
    "import requests_cache\n",
    "\n",
    "s = requests_cache.CachedSession()\n",
    "retries = Retry(total=10, backoff_factor=1, status_forcelist=[ 502, 503, 504, 524 ])\n",
    "s.mount('http://', HTTPAdapter(max_retries=retries))\n",
    "s.mount('https://', HTTPAdapter(max_retries=retries))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define some functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_values(row, tag):\n",
    "    '''\n",
    "    Extracts values from the row of an index.\n",
    "    Parameters:\n",
    "        row - a row of an html table\n",
    "        tag - html table cell tag, either td or th\n",
    "    Returns:\n",
    "        A list of values\n",
    "    '''\n",
    "    values = []\n",
    "    \n",
    "    # Look though all the cells with the specified tag\n",
    "    for cell in row.find_all(tag):\n",
    "        \n",
    "        # Get the value of a cell, replacing any non-breaking spaces\n",
    "        try:\n",
    "            value = cell.string.replace(u'\\xa0', '')\n",
    "        except AttributeError:\n",
    "            value = cell.string\n",
    "            \n",
    "        # Add the value to the list\n",
    "        values.append(value)\n",
    "    return values\n",
    "\n",
    "\n",
    "def get_urls(row, columns):\n",
    "    '''\n",
    "    Checks to see if any of the values in a row have links.\n",
    "    If they do, it adds the urls to a new column.\n",
    "    '''\n",
    "    urls = {}\n",
    "    \n",
    "    # Loop through cells\n",
    "    for i, cell in enumerate(row.find_all('td')):\n",
    "        \n",
    "        # Look for a link\n",
    "        link = cell.find('a')\n",
    "        \n",
    "        # If there's a link and the corresponding column has a name\n",
    "        if link and columns[i] is not None:\n",
    "            \n",
    "            # Get the path\n",
    "            path = link['href']\n",
    "            \n",
    "            # Check that it's a full path\n",
    "            if path != '/index_image/':\n",
    "                \n",
    "                # Turn the relative path into and absolute url\n",
    "                url = urljoin('https://www.records.nsw.gov.au', path)\n",
    "                \n",
    "                # Create a new column name by appending 'url' to the current column\n",
    "                column = '{} url'.format(columns[i])\n",
    "                \n",
    "                # Add to the dictionary\n",
    "                urls[column] = url\n",
    "    return urls\n",
    "            \n",
    "\n",
    "\n",
    "def get_total_pages(url):\n",
    "    '''\n",
    "    Get the total number of pages in an index.\n",
    "    '''\n",
    "    # Get the first page\n",
    "    response = s.get(url, timeout=60)\n",
    "    \n",
    "    # Soupify\n",
    "    soup = BeautifulSoup(response.text, 'lxml')\n",
    "    \n",
    "    # Find the link to the last page\n",
    "    last_link = soup.find('a', title='Go to last page')\n",
    "    \n",
    "    # Get the last page number from the link\n",
    "    last_page = int(re.search('page=(\\d+)', last_link['href']).group(1))\n",
    "    return last_page\n",
    "\n",
    "\n",
    "def harvest_index(index, start_page=0):\n",
    "    '''\n",
    "    Harvest all the available data from an online index.\n",
    "    Parameters:\n",
    "        index - a dictionary with 'title' and 'url' parameters\n",
    "    Returns:\n",
    "        A list of harvested rows, the actual fields depends on the index being harvested.\n",
    "    '''\n",
    "    title = index['title']\n",
    "    url = index['url']\n",
    "    rows = []\n",
    "    \n",
    "    # Get the total number of pages in the index\n",
    "    total_pages = get_total_pages(url)\n",
    "    \n",
    "    # Split the index search url into the base url and query parameters\n",
    "    base_url, query = url.split('?')\n",
    "    \n",
    "    # Parse the query parameters into dictionary form\n",
    "    params = parse_qs(query, keep_blank_values=True)\n",
    "    \n",
    "    # Set the start page\n",
    "    page = start_page\n",
    "    \n",
    "    # Column headings\n",
    "    columns = None\n",
    "    \n",
    "    # Loop through all the pages in the index\n",
    "    with tqdm_notebook(total=(total_pages - start_page), leave=False, desc='Pages:') as pbar:\n",
    "        while page <= total_pages:\n",
    "            \n",
    "            # Set the page parameter to the current page\n",
    "            params['page'] = page\n",
    "            \n",
    "            # Get a page of index search results & soupify\n",
    "            response = s.get(base_url, params=params, timeout=120)\n",
    "            soup = BeautifulSoup(response.text, 'lxml')\n",
    "            \n",
    "            # If the columns list is empty, then we'll get the column headings from the first row\n",
    "            if columns is None:\n",
    "                header = soup.find('thead').find('tr')\n",
    "                columns = get_values(header, 'th')\n",
    "                \n",
    "            # Get the rows\n",
    "            table = soup.find('tbody')\n",
    "            \n",
    "            try:\n",
    "                # Loop through all the rows\n",
    "                for row in table.find_all('tr', recursive=False):\n",
    "                    \n",
    "                    # Get the values of a row and zip them up with the column headers to make a dictionary\n",
    "                    values = dict(zip(columns, get_values(row, 'td')))\n",
    "                    \n",
    "                    # Get any urls as new columns and add them to the values dictionary\n",
    "                    values.update(get_urls(row, columns))\n",
    "                    \n",
    "                    # Save the row values\n",
    "                    rows.append(values)\n",
    "                    \n",
    "            # Sometimes this fails with an Attribute Error\n",
    "            # I assume we're getting a page with no results, but no explicit error\n",
    "            # In this case we'll try and trigger the retry mechanism\n",
    "            except AttributeError:\n",
    "                raise requests.HTTPError\n",
    "            else:\n",
    "                \n",
    "                # Rest for a bit\n",
    "                time.sleep(1)\n",
    "                \n",
    "                # Increment the page\n",
    "                page += 1\n",
    "                pbar.update(1)\n",
    "    return rows\n",
    "\n",
    "\n",
    "def save_csv(rows, title):\n",
    "    '''\n",
    "    Saves harvested index as a CSV formatted file.\n",
    "    \n",
    "    Parameters:\n",
    "    \n",
    "        rows – the data\n",
    "        title – the name of the index\n",
    "    '''\n",
    "    \n",
    "    # Convert the data into a DataFrame\n",
    "    df = pd.DataFrame(rows)\n",
    "    \n",
    "    # Remove empty rows and columns\n",
    "    df.dropna(axis=1, how='all', inplace=True)\n",
    "    df.dropna(axis=0, how='all', inplace=True)\n",
    "    \n",
    "    # Save as a CSV, using the slugified index title as a file name\n",
    "    df.to_csv(os.path.join('csv', '{}.csv'.format(slugify(title))), index=False)\n",
    "\n",
    "\n",
    "def harvest_all_indexes(start=0):\n",
    "    '''\n",
    "    Works through the list of indexes, harvesting the data for each index and saving it as a CSV file.\n",
    "    '''\n",
    "    \n",
    "    # Open up the CSV file containing the index details\n",
    "    indexes = pd.read_csv('indexes.csv')\n",
    "    \n",
    "    # Loop through the list of indexes \n",
    "    for index in tqdm_notebook(indexes.to_dict('records')[start:], desc='Indexes:'):\n",
    "        \n",
    "        # GET ALL THE DATA!\n",
    "        rows = harvest_index(index)\n",
    "        \n",
    "        # Save the harvested data as a CSV, using the index name as a file title\n",
    "        save_csv(rows, index['title'])\n",
    "        \n",
    "        # Clear the requests cache after each index\n",
    "        s.cache.clear()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get all the data!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "harvest_all_indexes()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Harvest a single index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, description='Pages:', max=9584, style=ProgressStyle(description_width='ini…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "index = {'title': 'Assisted Immigrants', 'url': 'https://www.records.nsw.gov.au/searchhits_nocopy?id=9&Surname=%25&Firstname=&Ship&Year=&Arriving&Remarks='}\n",
    "rows = harvest_index(index)\n",
    "df = pd.DataFrame(rows)\n",
    "save_csv(rows, index['title'])\n",
    "s.cache.clear()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "----\n",
    "\n",
    "Created by [Tim Sherratt](https://timsherratt.org/).\n",
    "\n",
    "Part of the [GLAM Workbench](https://glam-workbench.github.io/) project."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}