{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Harvest indexes\n", "\n", "This notebook harvests data from all of [NSW State Archives online indexes](https://www.records.nsw.gov.au/archives/collections-and-research/guides-and-indexes/indexes-a-z), saving the data as a collection of easily downloadable CSV files.\n", "\n", "Before you harvest the data, you need to [get the details of all the indexes](get-list-of-indexes.ipynb).\n", "\n", "If you just want the data, my latest harvest of the indexes is [available from this repository](https://github.com/wragge/srnsw-indexes).\n", "\n", "If you'd like to explore the harvested data, try the [Index Explorer](/apps/index-explorer.ipynb)!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import what we need" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "import re\n", "from tqdm import tqdm_notebook\n", "import pandas as pd\n", "from slugify import slugify\n", "from urllib.parse import urlparse, parse_qs, urljoin\n", "import time\n", "import string\n", "import os\n", "from requests.adapters import HTTPAdapter\n", "from requests.packages.urllib3.util.retry import Retry\n", "\n", "# By using requests_cache we don't have to redownload pages if a harvest fails part way through\n", "import requests_cache\n", "\n", "s = requests_cache.CachedSession()\n", "retries = Retry(total=10, backoff_factor=1, status_forcelist=[ 502, 503, 504, 524 ])\n", "s.mount('http://', HTTPAdapter(max_retries=retries))\n", "s.mount('https://', HTTPAdapter(max_retries=retries))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Define some functions" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [], "source": [ "def get_values(row, tag):\n", " '''\n", " Extracts values from the row of an index.\n", " Parameters:\n", " row - a row of an html table\n", " tag - html table cell tag, either td or th\n", " Returns:\n", " A list of values\n", " '''\n", " values = []\n", " \n", " # Look though all the cells with the specified tag\n", " for cell in row.find_all(tag):\n", " \n", " # Get the value of a cell, replacing any non-breaking spaces\n", " try:\n", " value = cell.string.replace(u'\\xa0', '')\n", " except AttributeError:\n", " value = cell.string\n", " \n", " # Add the value to the list\n", " values.append(value)\n", " return values\n", "\n", "\n", "def get_urls(row, columns):\n", " '''\n", " Checks to see if any of the values in a row have links.\n", " If they do, it adds the urls to a new column.\n", " '''\n", " urls = {}\n", " \n", " # Loop through cells\n", " for i, cell in enumerate(row.find_all('td')):\n", " \n", " # Look for a link\n", " link = cell.find('a')\n", " \n", " # If there's a link and the corresponding column has a name\n", " if link and columns[i] is not None:\n", " \n", " # Get the path\n", " path = link['href']\n", " \n", " # Check that it's a full path\n", " if path != '/index_image/':\n", " \n", " # Turn the relative path into and absolute url\n", " url = urljoin('https://www.records.nsw.gov.au', path)\n", " \n", " # Create a new column name by appending 'url' to the current column\n", " column = '{} url'.format(columns[i])\n", " \n", " # Add to the dictionary\n", " urls[column] = url\n", " return urls\n", " \n", "\n", "\n", "def get_total_pages(url):\n", " '''\n", " Get the total number of pages in an index.\n", " '''\n", " # Get the first page\n", " response = s.get(url, timeout=60)\n", " \n", " # Soupify\n", " soup = BeautifulSoup(response.text, 'lxml')\n", " \n", " # Find the link to the last page\n", " last_link = soup.find('a', title='Go to last page')\n", " \n", " # Get the last page number from the link\n", " last_page = int(re.search('page=(\\d+)', last_link['href']).group(1))\n", " return last_page\n", "\n", "\n", "def harvest_index(index, start_page=0):\n", " '''\n", " Harvest all the available data from an online index.\n", " Parameters:\n", " index - a dictionary with 'title' and 'url' parameters\n", " Returns:\n", " A list of harvested rows, the actual fields depends on the index being harvested.\n", " '''\n", " title = index['title']\n", " url = index['url']\n", " rows = []\n", " \n", " # Get the total number of pages in the index\n", " total_pages = get_total_pages(url)\n", " \n", " # Split the index search url into the base url and query parameters\n", " base_url, query = url.split('?')\n", " \n", " # Parse the query parameters into dictionary form\n", " params = parse_qs(query, keep_blank_values=True)\n", " \n", " # Set the start page\n", " page = start_page\n", " \n", " # Column headings\n", " columns = None\n", " \n", " # Loop through all the pages in the index\n", " with tqdm_notebook(total=(total_pages - start_page), leave=False, desc='Pages:') as pbar:\n", " while page <= total_pages:\n", " \n", " # Set the page parameter to the current page\n", " params['page'] = page\n", " \n", " # Get a page of index search results & soupify\n", " response = s.get(base_url, params=params, timeout=120)\n", " soup = BeautifulSoup(response.text, 'lxml')\n", " \n", " # If the columns list is empty, then we'll get the column headings from the first row\n", " if columns is None:\n", " header = soup.find('thead').find('tr')\n", " columns = get_values(header, 'th')\n", " \n", " # Get the rows\n", " table = soup.find('tbody')\n", " \n", " try:\n", " # Loop through all the rows\n", " for row in table.find_all('tr', recursive=False):\n", " \n", " # Get the values of a row and zip them up with the column headers to make a dictionary\n", " values = dict(zip(columns, get_values(row, 'td')))\n", " \n", " # Get any urls as new columns and add them to the values dictionary\n", " values.update(get_urls(row, columns))\n", " \n", " # Save the row values\n", " rows.append(values)\n", " \n", " # Sometimes this fails with an Attribute Error\n", " # I assume we're getting a page with no results, but no explicit error\n", " # In this case we'll try and trigger the retry mechanism\n", " except AttributeError:\n", " raise requests.HTTPError\n", " else:\n", " \n", " # Rest for a bit\n", " time.sleep(1)\n", " \n", " # Increment the page\n", " page += 1\n", " pbar.update(1)\n", " return rows\n", "\n", "\n", "def save_csv(rows, title):\n", " '''\n", " Saves harvested index as a CSV formatted file.\n", " \n", " Parameters:\n", " \n", " rows – the data\n", " title – the name of the index\n", " '''\n", " \n", " # Convert the data into a DataFrame\n", " df = pd.DataFrame(rows)\n", " \n", " # Remove empty rows and columns\n", " df.dropna(axis=1, how='all', inplace=True)\n", " df.dropna(axis=0, how='all', inplace=True)\n", " \n", " # Save as a CSV, using the slugified index title as a file name\n", " df.to_csv(os.path.join('csv', '{}.csv'.format(slugify(title))), index=False)\n", "\n", "\n", "def harvest_all_indexes(start=0):\n", " '''\n", " Works through the list of indexes, harvesting the data for each index and saving it as a CSV file.\n", " '''\n", " \n", " # Open up the CSV file containing the index details\n", " indexes = pd.read_csv('indexes.csv')\n", " \n", " # Loop through the list of indexes \n", " for index in tqdm_notebook(indexes.to_dict('records')[start:], desc='Indexes:'):\n", " \n", " # GET ALL THE DATA!\n", " rows = harvest_index(index)\n", " \n", " # Save the harvested data as a CSV, using the index name as a file title\n", " save_csv(rows, index['title'])\n", " \n", " # Clear the requests cache after each index\n", " s.cache.clear()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Get all the data!" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "harvest_all_indexes()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Harvest a single index" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Pages:', max=9584, style=ProgressStyle(description_width='ini…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "index = {'title': 'Assisted Immigrants', 'url': 'https://www.records.nsw.gov.au/searchhits_nocopy?id=9&Surname=%25&Firstname=&Ship&Year=&Arriving&Remarks='}\n", "rows = harvest_index(index)\n", "df = pd.DataFrame(rows)\n", "save_csv(rows, index['title'])\n", "s.cache.clear()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----\n", "\n", "Created by [Tim Sherratt](https://timsherratt.org/).\n", "\n", "Part of the [GLAM Workbench](https://glam-workbench.github.io/) project." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }