{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import the HT Data API wrapper\n",
    "from hathitrust_api import DataAPI\n",
    "\n",
    "# Replace placeholder strings with your HT credentials (leaving the quote marks)\n",
    "ht_access_key = \"YOUR_ACCESS_KEY_HERE\"\n",
    "ht_secret_key = \"YOUR_SECRET_KEY_HERE\"\n",
    "\n",
    "# instantiate the Data API connection object\n",
    "data_api = DataAPI(ht_access_key, ht_secret_key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# assorted imports from Python standard library\n",
    "import json\n",
    "import os\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# JSON metadata file downloaded from HT\n",
    "metadata_path = \"554050894-1535834127.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the preferred syntax for opening/closing files in Python\n",
    "with open(metadata_path, \"r\") as fp:\n",
    "    data = json.load(fp)\n",
    "\n",
    "# the last line in a cell is always executed and its return value displayed\n",
    "data.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the gathers field is what actually contains the list of volumes in the collection\n",
    "data['gathers']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# list comprehension to get only the volume ids\n",
    "vol_ids = [item['htitem_id'] for item in data['gathers']]\n",
    "vol_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ht_picture_download(item_id, out_dir=None):\n",
    "    \"\"\"\n",
    "    List the pages of a HathiTrust volume that carry the IMAGE_ON_PAGE\n",
    "    feature and, optionally, download the image of each such page.\n",
    "\n",
    "    :param item_id: unique HathiTrust volume identifier\n",
    "    :param out_dir: destination for images; if None (or empty), no download\n",
    "\n",
    "    Note: out_dir is created by this function (os.makedirs), so the\n",
    "    caller must have write permission where it is created. If out_dir\n",
    "    already exists, the volume is assumed to have been processed before\n",
    "    and nothing is downloaded. Each image is written to\n",
    "    out_dir/<page>.jpg. A failure on one page is printed and the\n",
    "    remaining pages are still attempted.\n",
    "\n",
    "    :return: sequence numbers (as ints) of pages with IMAGE_ON_PAGE\n",
    "    :rtype: list\n",
    "    \"\"\"\n",
    "\n",
    "    print(\"[{}] Starting processing\".format(item_id))\n",
    "\n",
    "    # metadata from API in json format (different than HT collection metadata)\n",
    "    meta = json.loads(data_api.getmeta(item_id, json=True))\n",
    "\n",
    "    # sequence gets us each page of the PDF in order, with any\n",
    "    # additional information that might be available for it\n",
    "    sequence = meta['htd:seqmap'][0]['htd:seq']\n",
    "\n",
    "    # list of pages with pictures (empty to start)\n",
    "    img_pages = []\n",
    "\n",
    "    # try/except block handles situation where no \"pfeats\" exist OR\n",
    "    # the sequence numbers are not numeric. int() raises ValueError for a\n",
    "    # non-numeric string and TypeError for None, so both are caught.\n",
    "    for seq_entry in sequence:\n",
    "        try:\n",
    "            if 'IMAGE_ON_PAGE' in seq_entry['htd:pfeat']:\n",
    "                img_pages.append(int(seq_entry['pseq']))\n",
    "        except (KeyError, TypeError, ValueError):\n",
    "            continue\n",
    "\n",
    "    # track for download progress report\n",
    "    total_pages = len(img_pages)\n",
    "\n",
    "    # if out_dir is not None, then also download page images\n",
    "    if out_dir:\n",
    "\n",
    "        # return if folder already exists (reasonable inference that volume already processed)\n",
    "        if os.path.isdir(out_dir):\n",
    "            print(\"[{}] Directory already exists.\".format(item_id))\n",
    "            return img_pages\n",
    "\n",
    "        # otherwise, create folder to put the images\n",
    "        print(\"[{}] Making directory {}\".format(item_id, out_dir))\n",
    "        os.makedirs(out_dir)\n",
    "\n",
    "        for i, page_num in enumerate(img_pages):\n",
    "            try:\n",
    "                # simple status message\n",
    "                print(\"[{}] Downloading page {} ({}/{})\".format(item_id, page_num, i+1, total_pages))\n",
    "\n",
    "                img = data_api.getpageimage(item_id, page_num)\n",
    "\n",
    "                img_out = os.path.join(out_dir, str(page_num) + \".jpg\")\n",
    "\n",
    "                # write out the image\n",
    "                with open(img_out, 'wb') as fp:\n",
    "                    fp.write(img)\n",
    "\n",
    "            except Exception as e:\n",
    "                # best effort: report the failure and move on to the next page\n",
    "                print(\"[{}] Error downloading page {}: {}\".format(item_id, page_num, e))\n",
    "\n",
    "            # to avoid exceeding the allowed API usage, we take a quick\n",
    "            # two-second break before requesting the next image; this runs\n",
    "            # after failed attempts too, so errors do not cause rapid retries\n",
    "            time.sleep(2)\n",
    "\n",
    "    # return the list of image pages\n",
    "    return img_pages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# loop over volumes in our collection\n",
    "for item_id in vol_ids:\n",
    "    destination = os.path.join(\"items\", \"hathitrust\", item_id)\n",
    "    ht_picture_download(item_id, out_dir=destination)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}