{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Harvesting a series"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"alert alert-block alert-warning\">\n",
    "<b>THIS NOTEBOOK IS OUTDATED!</b> \n",
    "\n",
    "This notebook has been superseded by <a href=\"harvesting_items_from_a_search.ipynb\">brand new, super duper, notebook</a> that explores in much more detail how to harvest items from a search in RecordSearch. I've left it here so as not to break any links, but please don't use it.\n",
    "</div>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import csv\n",
    "import os\n",
    "import math\n",
    "import string\n",
    "import requests\n",
    "import pandas as pd\n",
    "from slugify import slugify\n",
    "# from PIL import Image, ImageOps\n",
    "from requests import ConnectionError\n",
    "from recordsearch_tools.utilities import retry\n",
    "from recordsearch_tools.client import RSSearchClient, RSSeriesClient\n",
    "from tinydb import TinyDB, Query\n",
    "try:\n",
    "    from io import BytesIO\n",
    "except ImportError:\n",
    "    from StringIO import StringIO\n",
    "from IPython.display import Image as DImage\n",
    "from IPython.core.display import HTML\n",
    "\n",
    "# Make sure data directory exists\n",
    "os.makedirs('data/images', exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# What series do you want to harvest?\n",
    "# Insert the series id between the quotes.\n",
    "series = 'A821'\n",
    "output_dir = 'data'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The harvesting code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class SeriesHarvester():\n",
    "    def __init__(self, series, control=None):\n",
    "        self.series = series\n",
    "        self.control = control\n",
    "        self.total_pages = None\n",
    "        self.pages_complete = 0\n",
    "        self.client = RSSearchClient()\n",
    "        self.prepare_harvest()\n",
    "        self.db = TinyDB('data/db-{}.json'.format(self.series.replace('/', '-')))\n",
    "        self.items = self.db.table('items')\n",
    "        self.images = self.db.table('images')\n",
    "\n",
    "    def get_total(self):\n",
    "        return self.client.total_results\n",
    "\n",
    "    def prepare_harvest(self):\n",
    "        if self.control:\n",
    "            self.client.search(series=self.series, control=self.control)\n",
    "        else:\n",
    "            self.client.search(series=self.series)\n",
    "        total_results = self.client.total_results\n",
    "        print('{} items'.format(total_results))\n",
    "        self.total_pages = math.floor(int(total_results) / self.client.results_per_page) + 1\n",
    "        print(self.total_pages)\n",
    "\n",
    "    @retry(ConnectionError, tries=20, delay=10, backoff=1)\n",
    "    def start_harvest(self, page=None):\n",
    "        Record = Query()\n",
    "        if not page:\n",
    "            page = self.pages_complete + 1\n",
    "        while self.pages_complete < self.total_pages:\n",
    "            if self.control:\n",
    "                response = self.client.search(series=self.series, page=page, control=self.control, sort='9')\n",
    "            else:\n",
    "                response = self.client.search(series=self.series, page=page, sort='9')\n",
    "            for result in response['results']:\n",
    "                self.items.upsert(result, Record.identifier == result['identifier'])\n",
    "            self.pages_complete += 1\n",
    "            page += 1\n",
    "            print('{} pages complete'.format(self.pages_complete))\n",
    "            time.sleep(1)\n",
    "        \n",
    "    @retry(ConnectionError, tries=20, delay=10, backoff=1)\n",
    "    def harvest_images(self):\n",
    "        Record = Query()\n",
    "        items = self.items.search(Record.digitised_status == True)\n",
    "        headers = {'User-Agent': 'Mozilla/5.0'}\n",
    "        for item in items:\n",
    "            directory = os.path.join('data', 'images', '{}/{}-[{}]'.format(self.series.replace('/', '-'), item['control_symbol'].replace('/', '-').replace(' ', '-'), item['identifier']))\n",
    "            if not os.path.exists(directory):\n",
    "                os.makedirs(directory)\n",
    "            for page in range(1, item['digitised_pages'] + 1):\n",
    "                filename = '{}/{}-p{}.jpg'.format(directory, item['identifier'], page)\n",
    "                print('{}, p. {}'.format(item['identifier'], page))\n",
    "                if not os.path.exists(filename):\n",
    "                    img_url = 'http://recordsearch.naa.gov.au/NaaMedia/ShowImage.asp?B={}&S={}&T=P'.format(item['identifier'], page)\n",
    "                    response = requests.get(img_url, headers=headers, stream=True, verify=False)\n",
    "                    response.raise_for_status()\n",
    "                    try:\n",
    "                        image = Image.open(BytesIO(response.content))\n",
    "                    except IOError:\n",
    "                        print('Not an image')\n",
    "                    else:\n",
    "                        width, height = image.size\n",
    "                        image.save(filename)\n",
    "                        del response\n",
    "                        image_meta = {\n",
    "                            'image_id': '{}-{}'.format(item['identifier'], page),\n",
    "                            'identifier': item['identifier'],\n",
    "                            'page': page,\n",
    "                            'width': width,\n",
    "                            'height': height\n",
    "                        }\n",
    "                        self.images.upsert(image_meta, Record.image_id == image_meta['image_id'])\n",
    "                        print('Image saved')\n",
    "            time.sleep(1)\n",
    "            \n",
    "def harvest_series(series):\n",
    "    h = SeriesHarvester(series=series)\n",
    "    h.start_harvest()\n",
    "    \n",
    "def harvest_images(series):\n",
    "    h = SeriesHarvester(series=series)\n",
    "    h.harvest_images()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harvest the metadata!\n",
    "harvest_series(series)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harvest digitised pages\n",
    "harvest_images(series)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Working with the harvested data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's see how many items we've harvested\n",
    "db = TinyDB('data/db-{}.json'.format(series))\n",
    "items = db.table('items')\n",
    "len(items)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_to_df(series):\n",
    "    '''\n",
    "    Get the series data from TinyDB and save as a Pandas dataframe.\n",
    "    Also flattens the date dictionary, and does a bit of ordering.\n",
    "    '''\n",
    "    \n",
    "    # Load the series db\n",
    "    db = TinyDB('data/db-{}.json'.format(series))\n",
    "    items = db.table('items')\n",
    "    \n",
    "    # Let's convert the database into a simple list\n",
    "    item_list = [i for i in items]\n",
    "    \n",
    "    # Now let's turm that list into a Pandas Dataframe\n",
    "    df = pd.DataFrame(item_list)\n",
    "    \n",
    "    # The 'contents_date' column is a dictionary, we need to flatten this out so we can easily work with the values\n",
    "    df = pd.concat([df, pd.DataFrame((d for idx, d in df['contents_dates'].iteritems()))], axis=1)\n",
    "\n",
    "    # Delete the old date field\n",
    "    del df['contents_dates']\n",
    "\n",
    "    # Rename column\n",
    "    df.rename({'date_str': 'contents_dates'}, axis=1, inplace=True)\n",
    "    \n",
    "    # Put columns in preferred order\n",
    "    df = df[['identifier', 'series', 'control_symbol', 'title', 'contents_dates', 'start_date', 'end_date', 'access_status', 'location', 'digitised_status', 'digitised_pages']]\n",
    "    df.sort_values(['identifier'])\n",
    "    \n",
    "    return df "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_as_csv(series):\n",
    "    '''\n",
    "    Converts harvested data in TinyDB to a CSV file, via a Pandas dataframe.\n",
    "    '''\n",
    "    df = convert_to_df(series)\n",
    "    df.to_csv('data/{}.csv'.format(series.replace('/', '-')), index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the harvested metadata as a CSV file\n",
    "save_as_csv(series)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Once you've saved a harvest as a CSV file, you can download it from the workbench [data directory](data/)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Harvesting large series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def harvest_large_series(series, control_range=None):\n",
    "    '''\n",
    "    RecordSearch will not return more than 20,000 results.\n",
    "    If a series has more than 20,000 items you'll need to break it up.\n",
    "    The easiest way to do this is to add a param for control_symbol.\n",
    "    This function will break break a series harvest down into a series of harvests --\n",
    "    using each letter and number with a wildcard as the control_symbol parameter.\n",
    "    This should be enough to harvest most large series, but in some cases you might need to supply a custom list of control_symbol prefixes.\n",
    "    '''\n",
    "    if not control_range:\n",
    "        control_range = [letter + '*' for letter in string.ascii_uppercase] + [str(number) + '*' for number in range(0, 10)]\n",
    "    for control in control_range:\n",
    "        print(control)\n",
    "        h = SeriesHarvester(series=series, control=control)\n",
    "        h.start_harvest()\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harvest a large series using the default control range\n",
    "harvest_large_series('B13')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For series like A1 that use the year as the control symbol prefix, this range should work.\n",
    "control_range = [str(num) + '*' for num in range(2,10)] + ['1{}*'.format(num2) for num2 in [str(num) for num in range(0,9)]] + ['19{}*'.format(num2) for num2 in [str(num) for num in range(1,10)]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use custom range to harvest a large series\n",
    "harvest_large_series('A1', control_range)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Harvest multiple series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "series_list = ['A6119', 'A6122', 'A6126', 'A9626', 'A6335', 'B2836', 'A8703', 'A13828', 'A6281', 'A6285', 'A6283', 'A6282', 'A6126', 'A9106', 'A9108', 'A9105', 'A12694', 'D1902', 'D1915']\n",
    "for series in series_list:\n",
    "    harvest_series(series)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}