\n",
"THIS NOTEBOOK IS OUTDATED! \n",
"\n",
"This notebook has been superseded by brand new, super duper, notebook that explores in much more detail how to harvest items from a search in RecordSearch. I've left it here so as not to break any links, but please don't use it.\n",
"
"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import csv\n",
"import os\n",
"import math\n",
"import string\n",
"import requests\n",
"import pandas as pd\n",
"from slugify import slugify\n",
"# from PIL import Image, ImageOps\n",
"from requests import ConnectionError\n",
"from recordsearch_tools.utilities import retry\n",
"from recordsearch_tools.client import RSSearchClient, RSSeriesClient\n",
"from tinydb import TinyDB, Query\n",
"try:\n",
" from io import BytesIO\n",
"except ImportError:\n",
" from StringIO import StringIO\n",
"from IPython.display import Image as DImage\n",
"from IPython.core.display import HTML\n",
"\n",
"# Make sure data directory exists\n",
"os.makedirs('data/images', exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# What series do you want to harvest?\n",
"# Insert the series id between the quotes.\n",
"series = 'A821'\n",
"output_dir = 'data'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## The harvesting code"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class SeriesHarvester():\n",
" def __init__(self, series, control=None):\n",
" self.series = series\n",
" self.control = control\n",
" self.total_pages = None\n",
" self.pages_complete = 0\n",
" self.client = RSSearchClient()\n",
" self.prepare_harvest()\n",
" self.db = TinyDB('data/db-{}.json'.format(self.series.replace('/', '-')))\n",
" self.items = self.db.table('items')\n",
" self.images = self.db.table('images')\n",
"\n",
" def get_total(self):\n",
" return self.client.total_results\n",
"\n",
" def prepare_harvest(self):\n",
" if self.control:\n",
" self.client.search(series=self.series, control=self.control)\n",
" else:\n",
" self.client.search(series=self.series)\n",
" total_results = self.client.total_results\n",
" print('{} items'.format(total_results))\n",
" self.total_pages = math.floor(int(total_results) / self.client.results_per_page) + 1\n",
" print(self.total_pages)\n",
"\n",
" @retry(ConnectionError, tries=20, delay=10, backoff=1)\n",
" def start_harvest(self, page=None):\n",
" Record = Query()\n",
" if not page:\n",
" page = self.pages_complete + 1\n",
" while self.pages_complete < self.total_pages:\n",
" if self.control:\n",
" response = self.client.search(series=self.series, page=page, control=self.control, sort='9')\n",
" else:\n",
" response = self.client.search(series=self.series, page=page, sort='9')\n",
" for result in response['results']:\n",
" self.items.upsert(result, Record.identifier == result['identifier'])\n",
" self.pages_complete += 1\n",
" page += 1\n",
" print('{} pages complete'.format(self.pages_complete))\n",
" time.sleep(1)\n",
" \n",
" @retry(ConnectionError, tries=20, delay=10, backoff=1)\n",
" def harvest_images(self):\n",
" Record = Query()\n",
" items = self.items.search(Record.digitised_status == True)\n",
" headers = {'User-Agent': 'Mozilla/5.0'}\n",
" for item in items:\n",
" directory = os.path.join('data', 'images', '{}/{}-[{}]'.format(self.series.replace('/', '-'), item['control_symbol'].replace('/', '-').replace(' ', '-'), item['identifier']))\n",
" if not os.path.exists(directory):\n",
" os.makedirs(directory)\n",
" for page in range(1, item['digitised_pages'] + 1):\n",
" filename = '{}/{}-p{}.jpg'.format(directory, item['identifier'], page)\n",
" print('{}, p. {}'.format(item['identifier'], page))\n",
" if not os.path.exists(filename):\n",
" img_url = 'http://recordsearch.naa.gov.au/NaaMedia/ShowImage.asp?B={}&S={}&T=P'.format(item['identifier'], page)\n",
" response = requests.get(img_url, headers=headers, stream=True, verify=False)\n",
" response.raise_for_status()\n",
" try:\n",
" image = Image.open(BytesIO(response.content))\n",
" except IOError:\n",
" print('Not an image')\n",
" else:\n",
" width, height = image.size\n",
" image.save(filename)\n",
" del response\n",
" image_meta = {\n",
" 'image_id': '{}-{}'.format(item['identifier'], page),\n",
" 'identifier': item['identifier'],\n",
" 'page': page,\n",
" 'width': width,\n",
" 'height': height\n",
" }\n",
" self.images.upsert(image_meta, Record.image_id == image_meta['image_id'])\n",
" print('Image saved')\n",
" time.sleep(1)\n",
" \n",
"def harvest_series(series):\n",
" h = SeriesHarvester(series=series)\n",
" h.start_harvest()\n",
" \n",
"def harvest_images(series):\n",
" h = SeriesHarvester(series=series)\n",
" h.harvest_images()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Harvest the metadata!\n",
"harvest_series(series)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Harvest digitised pages\n",
"harvest_images(series)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Working with the harvested data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Let's see how many items we've harvested\n",
"db = TinyDB('data/db-{}.json'.format(series))\n",
"items = db.table('items')\n",
"len(items)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def convert_to_df(series):\n",
" '''\n",
" Get the series data from TinyDB and save as a Pandas dataframe.\n",
" Also flattens the date dictionary, and does a bit of ordering.\n",
" '''\n",
" \n",
" # Load the series db\n",
" db = TinyDB('data/db-{}.json'.format(series))\n",
" items = db.table('items')\n",
" \n",
" # Let's convert the database into a simple list\n",
" item_list = [i for i in items]\n",
" \n",
" # Now let's turm that list into a Pandas Dataframe\n",
" df = pd.DataFrame(item_list)\n",
" \n",
" # The 'contents_date' column is a dictionary, we need to flatten this out so we can easily work with the values\n",
" df = pd.concat([df, pd.DataFrame((d for idx, d in df['contents_dates'].iteritems()))], axis=1)\n",
"\n",
" # Delete the old date field\n",
" del df['contents_dates']\n",
"\n",
" # Rename column\n",
" df.rename({'date_str': 'contents_dates'}, axis=1, inplace=True)\n",
" \n",
" # Put columns in preferred order\n",
" df = df[['identifier', 'series', 'control_symbol', 'title', 'contents_dates', 'start_date', 'end_date', 'access_status', 'location', 'digitised_status', 'digitised_pages']]\n",
" df.sort_values(['identifier'])\n",
" \n",
" return df "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def save_as_csv(series):\n",
" '''\n",
" Converts harvested data in TinyDB to a CSV file, via a Pandas dataframe.\n",
" '''\n",
" df = convert_to_df(series)\n",
" df.to_csv('data/{}.csv'.format(series.replace('/', '-')), index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Save the harvested metadata as a CSV file\n",
"save_as_csv(series)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once you've saved a harvest as a CSV file, you can download it from the workbench [data directory](data/)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Harvesting large series"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def harvest_large_series(series, control_range=None):\n",
" '''\n",
" RecordSearch will not return more than 20,000 results.\n",
" If a series has more than 20,000 items you'll need to break it up.\n",
" The easiest way to do this is to add a param for control_symbol.\n",
" This function will break break a series harvest down into a series of harvests --\n",
" using each letter and number with a wildcard as the control_symbol parameter.\n",
" This should be enough to harvest most large series, but in some cases you might need to supply a custom list of control_symbol prefixes.\n",
" '''\n",
" if not control_range:\n",
" control_range = [letter + '*' for letter in string.ascii_uppercase] + [str(number) + '*' for number in range(0, 10)]\n",
" for control in control_range:\n",
" print(control)\n",
" h = SeriesHarvester(series=series, control=control)\n",
" h.start_harvest()\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Harvest a large series using the default control range\n",
"harvest_large_series('B13')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# For series like A1 that use the year as the control symbol prefix, this range should work.\n",
"control_range = [str(num) + '*' for num in range(2,10)] + ['1{}*'.format(num2) for num2 in [str(num) for num in range(0,9)]] + ['19{}*'.format(num2) for num2 in [str(num) for num in range(1,10)]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use custom range to harvest a large series\n",
"harvest_large_series('A1', control_range)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Harvest multiple series"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"series_list = ['A6119', 'A6122', 'A6126', 'A9626', 'A6335', 'B2836', 'A8703', 'A13828', 'A6281', 'A6285', 'A6283', 'A6282', 'A6126', 'A9106', 'A9108', 'A9105', 'A12694', 'D1902', 'D1915']\n",
"for series in series_list:\n",
" harvest_series(series)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}