{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Harvesting a series" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "THIS NOTEBOOK IS OUTDATED! \n", "\n", "This notebook has been superseded by brand new, super duper, notebook that explores in much more detail how to harvest items from a search in RecordSearch. I've left it here so as not to break any links, but please don't use it.\n", "
" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import time\n", "import csv\n", "import os\n", "import math\n", "import string\n", "import requests\n", "import pandas as pd\n", "from slugify import slugify\n", "# from PIL import Image, ImageOps\n", "from requests import ConnectionError\n", "from recordsearch_tools.utilities import retry\n", "from recordsearch_tools.client import RSSearchClient, RSSeriesClient\n", "from tinydb import TinyDB, Query\n", "try:\n", " from io import BytesIO\n", "except ImportError:\n", " from StringIO import StringIO\n", "from IPython.display import Image as DImage\n", "from IPython.core.display import HTML\n", "\n", "# Make sure data directory exists\n", "os.makedirs('data/images', exist_ok=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# What series do you want to harvest?\n", "# Insert the series id between the quotes.\n", "series = 'A821'\n", "output_dir = 'data'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## The harvesting code" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class SeriesHarvester():\n", " def __init__(self, series, control=None):\n", " self.series = series\n", " self.control = control\n", " self.total_pages = None\n", " self.pages_complete = 0\n", " self.client = RSSearchClient()\n", " self.prepare_harvest()\n", " self.db = TinyDB('data/db-{}.json'.format(self.series.replace('/', '-')))\n", " self.items = self.db.table('items')\n", " self.images = self.db.table('images')\n", "\n", " def get_total(self):\n", " return self.client.total_results\n", "\n", " def prepare_harvest(self):\n", " if self.control:\n", " self.client.search(series=self.series, control=self.control)\n", " else:\n", " self.client.search(series=self.series)\n", " total_results = self.client.total_results\n", " print('{} items'.format(total_results))\n", " self.total_pages = math.floor(int(total_results) / self.client.results_per_page) + 1\n", " print(self.total_pages)\n", "\n", " @retry(ConnectionError, tries=20, delay=10, backoff=1)\n", " def start_harvest(self, page=None):\n", " Record = Query()\n", " if not page:\n", " page = self.pages_complete + 1\n", " while self.pages_complete < self.total_pages:\n", " if self.control:\n", " response = self.client.search(series=self.series, page=page, control=self.control, sort='9')\n", " else:\n", " response = self.client.search(series=self.series, page=page, sort='9')\n", " for result in response['results']:\n", " self.items.upsert(result, Record.identifier == result['identifier'])\n", " self.pages_complete += 1\n", " page += 1\n", " print('{} pages complete'.format(self.pages_complete))\n", " time.sleep(1)\n", " \n", " @retry(ConnectionError, tries=20, delay=10, backoff=1)\n", " def harvest_images(self):\n", " Record = Query()\n", " items = self.items.search(Record.digitised_status == True)\n", " headers = {'User-Agent': 'Mozilla/5.0'}\n", " for item in items:\n", " directory = os.path.join('data', 'images', '{}/{}-[{}]'.format(self.series.replace('/', '-'), item['control_symbol'].replace('/', '-').replace(' ', '-'), item['identifier']))\n", " if not os.path.exists(directory):\n", " os.makedirs(directory)\n", " for page in range(1, item['digitised_pages'] + 1):\n", " filename = '{}/{}-p{}.jpg'.format(directory, item['identifier'], page)\n", " print('{}, p. {}'.format(item['identifier'], page))\n", " if not os.path.exists(filename):\n", " img_url = 'http://recordsearch.naa.gov.au/NaaMedia/ShowImage.asp?B={}&S={}&T=P'.format(item['identifier'], page)\n", " response = requests.get(img_url, headers=headers, stream=True, verify=False)\n", " response.raise_for_status()\n", " try:\n", " image = Image.open(BytesIO(response.content))\n", " except IOError:\n", " print('Not an image')\n", " else:\n", " width, height = image.size\n", " image.save(filename)\n", " del response\n", " image_meta = {\n", " 'image_id': '{}-{}'.format(item['identifier'], page),\n", " 'identifier': item['identifier'],\n", " 'page': page,\n", " 'width': width,\n", " 'height': height\n", " }\n", " self.images.upsert(image_meta, Record.image_id == image_meta['image_id'])\n", " print('Image saved')\n", " time.sleep(1)\n", " \n", "def harvest_series(series):\n", " h = SeriesHarvester(series=series)\n", " h.start_harvest()\n", " \n", "def harvest_images(series):\n", " h = SeriesHarvester(series=series)\n", " h.harvest_images()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Harvest the metadata!\n", "harvest_series(series)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Harvest digitised pages\n", "harvest_images(series)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Working with the harvested data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Let's see how many items we've harvested\n", "db = TinyDB('data/db-{}.json'.format(series))\n", "items = db.table('items')\n", "len(items)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def convert_to_df(series):\n", " '''\n", " Get the series data from TinyDB and save as a Pandas dataframe.\n", " Also flattens the date dictionary, and does a bit of ordering.\n", " '''\n", " \n", " # Load the series db\n", " db = TinyDB('data/db-{}.json'.format(series))\n", " items = db.table('items')\n", " \n", " # Let's convert the database into a simple list\n", " item_list = [i for i in items]\n", " \n", " # Now let's turm that list into a Pandas Dataframe\n", " df = pd.DataFrame(item_list)\n", " \n", " # The 'contents_date' column is a dictionary, we need to flatten this out so we can easily work with the values\n", " df = pd.concat([df, pd.DataFrame((d for idx, d in df['contents_dates'].iteritems()))], axis=1)\n", "\n", " # Delete the old date field\n", " del df['contents_dates']\n", "\n", " # Rename column\n", " df.rename({'date_str': 'contents_dates'}, axis=1, inplace=True)\n", " \n", " # Put columns in preferred order\n", " df = df[['identifier', 'series', 'control_symbol', 'title', 'contents_dates', 'start_date', 'end_date', 'access_status', 'location', 'digitised_status', 'digitised_pages']]\n", " df.sort_values(['identifier'])\n", " \n", " return df " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def save_as_csv(series):\n", " '''\n", " Converts harvested data in TinyDB to a CSV file, via a Pandas dataframe.\n", " '''\n", " df = convert_to_df(series)\n", " df.to_csv('data/{}.csv'.format(series.replace('/', '-')), index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Save the harvested metadata as a CSV file\n", "save_as_csv(series)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Once you've saved a harvest as a CSV file, you can download it from the workbench [data directory](data/)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Harvesting large series" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def harvest_large_series(series, control_range=None):\n", " '''\n", " RecordSearch will not return more than 20,000 results.\n", " If a series has more than 20,000 items you'll need to break it up.\n", " The easiest way to do this is to add a param for control_symbol.\n", " This function will break break a series harvest down into a series of harvests --\n", " using each letter and number with a wildcard as the control_symbol parameter.\n", " This should be enough to harvest most large series, but in some cases you might need to supply a custom list of control_symbol prefixes.\n", " '''\n", " if not control_range:\n", " control_range = [letter + '*' for letter in string.ascii_uppercase] + [str(number) + '*' for number in range(0, 10)]\n", " for control in control_range:\n", " print(control)\n", " h = SeriesHarvester(series=series, control=control)\n", " h.start_harvest()\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Harvest a large series using the default control range\n", "harvest_large_series('B13')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# For series like A1 that use the year as the control symbol prefix, this range should work.\n", "control_range = [str(num) + '*' for num in range(2,10)] + ['1{}*'.format(num2) for num2 in [str(num) for num in range(0,9)]] + ['19{}*'.format(num2) for num2 in [str(num) for num in range(1,10)]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Use custom range to harvest a large series\n", "harvest_large_series('A1', control_range)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Harvest multiple series" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "series_list = ['A6119', 'A6122', 'A6126', 'A9626', 'A6335', 'B2836', 'A8703', 'A13828', 'A6281', 'A6285', 'A6283', 'A6282', 'A6126', 'A9106', 'A9108', 'A9105', 'A12694', 'D1902', 'D1915']\n", "for series in series_list:\n", " harvest_series(series)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }