{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import the HT Data API wrapper\n",
    "from hathitrust_api import DataAPI\n",
    "\n",
    "# Replace placeholder strings with your HT credentials (leaving the quote marks)\n",
    "ht_access_key = \"YOUR_ACCESS_KEY_HERE\"\n",
    "ht_secret_key = \"YOUR_SECRET_KEY_HERE\"\n",
    "\n",
    "# instantiate the Data API connection object\n",
    "data_api = DataAPI(ht_access_key, ht_secret_key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# assorted imports from Python standard library\n",
    "import json\n",
    "import os\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# JSON metadata file downloaded from HT\n",
    "metadata_path = \"554050894-1535834127.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the preferred syntax for opening/closing files in Python\n",
    "with open(metadata_path, \"r\") as fp:\n",
    "    data = json.load(fp)\n",
    "\n",
    "# the last line in a cell is always excuted and its return value displayed\n",
    "data.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the gathers field is what actually contains the list of volumes in the collection\n",
    "data['gathers']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# list comprehension to get only the volume ids\n",
    "vol_ids = [item['htitem_id'] for item in data['gathers']]\n",
    "vol_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ht_picture_download(item_id, out_dir=None):\n",
    "    \"\"\"\n",
    "    :param item_id: unique HathiTrust volume identifier\n",
    "    :param out_dir: destination for images; if None, no download\n",
    "    \n",
    "    Note: if supplied, out_dir must be an existing directory and\n",
    "    the caller must have write permissions in that directory\n",
    "    \n",
    "    :rtype list of pages with IMAGE_ON_PAGE feature\n",
    "    \"\"\"\n",
    "    \n",
    "    print(\"[{}] Starting processing\".format(item_id))\n",
    "    \n",
    "    # metadata from API in json format (different than HT collection metadata)\n",
    "    meta = json.loads(data_api.getmeta(item_id, json=True))\n",
    "\n",
    "    # sequence gets us each page of the PDF in order, with any\n",
    "    # additional information that might be available for it\n",
    "    sequence = meta['htd:seqmap'][0]['htd:seq']\n",
    "\n",
    "    # list of pages with pictures (empty to start)\n",
    "    img_pages = []\n",
    "\n",
    "    # try/except block handles situation where no \"pfeats\" exist OR\n",
    "    # the sequence numbers are not numeric\n",
    "    for page in sequence:\n",
    "        try:\n",
    "            if 'IMAGE_ON_PAGE' in page['htd:pfeat']:\n",
    "                img_pages.append(int(page['pseq']))\n",
    "        except (KeyError, TypeError) as e:\n",
    "            continue\n",
    "        \n",
    "    # track for download progress report\n",
    "    total_pages = len(img_pages)\n",
    "\n",
    "    # if out_dir is not None, then also download page images\n",
    "    if out_dir:\n",
    "        \n",
    "         # return if folder already exists (reasonable inference that volume already processed)\n",
    "        if os.path.isdir(out_dir):\n",
    "            print(\"[{}] Directory already exists.\".format(item_id))\n",
    "            return img_pages\n",
    "\n",
    "        # otherwise, create folder to put the images\n",
    "        print(\"[{}] Making directory {}\".format(item_id, out_dir))\n",
    "        os.makedirs(out_dir)\n",
    "        \n",
    "        for i, page in enumerate(img_pages):\n",
    "            try:\n",
    "                # simple status message\n",
    "                print(\"[{}] Downloading page {} ({}/{})\".format(item_id, page, i+1, total_pages))\n",
    "                \n",
    "                img = data_api.getpageimage(item_id, page)\n",
    "            \n",
    "                img_out = os.path.join(out_dir, str(page) + \".jpg\")\n",
    "    \n",
    "                # write out the image\n",
    "                with open(img_out, 'wb') as fp:\n",
    "                    fp.write(img)\n",
    "\n",
    "                # to avoid exceeding the allowed API usage, we take a quick\n",
    "                # two-second break before requesting the next image\n",
    "                time.sleep(2)\n",
    "\n",
    "            except Exception as e:\n",
    "                print(\"[{}] Error downloading page {}: {}\".format(item_id, page,e))\n",
    "                \n",
    "    # return the list of image pages\n",
    "    return img_pages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# loop over volumes in our collection\n",
    "for item_id in vol_ids:\n",
    "    destination = os.path.join(\"items\", \"hathitrust\", item_id)\n",
    "    ht_picture_download(item_id, out_dir=destination)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}