{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import the IA Python library\n",
    "import internetarchive as ia\n",
    "\n",
    "# Replace placeholder strings with your IA credentials (leaving the quote marks)\n",
    "ia_email = \"YOUR_EMAIL_HERE\"\n",
    "ia_password = \"YOUR_PASSWORD_HERE\"\n",
    "\n",
    "# add these credentials to the API's configuration object\n",
    "ia.configure(ia_email, ia_password)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the requests library installed through conda\n",
    "import requests\n",
    "\n",
    "# a few other imports from the Python standard library\n",
    "import gzip\n",
    "import os\n",
    "import sys\n",
    "import xml.etree.ElementTree as ET"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sample search (should yield two results)\n",
    "query = \"peter parley date:[1825 TO 1830] mediatype:texts\"\n",
    "vol_ids = [result['identifier'] for result in ia.search_items(query)]\n",
    "vol_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define a function for downloading pictures from a given IA volume\n",
    "def ia_picture_download(item_id, out_dir=None):\n",
    "    \"\"\"\n",
    "    :param item_id: unique Internet Archive volume identifier\n",
    "    :param out_dir: destination for images; if None, no download\n",
    "    \n",
    "    Note: if supplied, out_dir must be an existing directory and\n",
    "    the caller must have write permissions in that directory\n",
    "    \n",
    "    :rtype list of pages with one or more blockType=Picture in Abbyy OCR data\n",
    "    \"\"\"\n",
    "\n",
    "    print(\"[{}] Starting processing\".format(item_id))\n",
    "    \n",
    "    # Use command-line client to see available metadata formats:\n",
    "    # `ia metadata formats VOLUME_ID`\n",
    "    \n",
    "    # for this lesson, only the Abbyy file is needed\n",
    "    returned_files = list(ia.get_files(item_id, formats=[\"Abbyy GZ\"]))\n",
    "    \n",
    "    # make sure something got returned\n",
    "    if len(returned_files) > 0:\n",
    "        abbyy_file = returned_files[0].name\n",
    "    else:\n",
    "        print(\"[{}] Could not get Abbyy file\".format(item_id))\n",
    "        return None\n",
    "    \n",
    "    # download the abbyy file to CWD\n",
    "    ia.download(item_id, formats=[\"Abbyy GZ\"], ignore_existing=True, destdir=os.getcwd(), no_directory=True)\n",
    "    \n",
    "    # collect the pages with at least one picture block\n",
    "    img_pages = []\n",
    "    \n",
    "    with gzip.open(abbyy_file) as fp:\n",
    "        tree = ET.parse(fp)\n",
    "        document = tree.getroot()\n",
    "        for i, page in enumerate(document):\n",
    "            for block in page:\n",
    "                try:\n",
    "                    if block.attrib['blockType'] == 'Picture':\n",
    "                        img_pages.append(i)\n",
    "                        break\n",
    "                except KeyError:\n",
    "                    continue\n",
    "    \n",
    "    # 0 is not a valid page for making GET requests to IA,\n",
    "    #yet sometimes it's in the zipped Abbyy file\n",
    "    img_pages = [page for page in img_pages if page > 0]\n",
    "    \n",
    "    # track for download progress report\n",
    "    total_pages = len(img_pages)\n",
    "\n",
    "    # OCR files are huge, so just delete once we have pagelist\n",
    "    os.remove(abbyy_file)\n",
    "    \n",
    "    # if out_dir is not None, then also download page images\n",
    "    if out_dir:\n",
    "        \n",
    "        # return if folder already exists (reasonable inference that volume already processed)\n",
    "        if os.path.isdir(out_dir):\n",
    "            print(\"[{}] Directory already exists.\".format(item_id))\n",
    "            return img_pages\n",
    "\n",
    "        # otherwise, create folder to put the images\n",
    "        print(\"[{}] Making directory {}\".format(item_id, out_dir))\n",
    "        os.makedirs(out_dir)\n",
    "        \n",
    "        # https://iiif.archivelab.org/iiif/documentation\n",
    "        urls = [\"https://iiif.archivelab.org/iiif/{}${}/full/full/0/default.jpg\".format(item_id, page) for page in img_pages]\n",
    "        \n",
    "        # no direct page download through API, DIY\n",
    "        for i, page, url in zip(range(1,total_pages), img_pages, urls):\n",
    "            rsp = requests.get(url, allow_redirects=True)\n",
    "            if rsp.status_code == 200:\n",
    "                print(\"[{}] Downloading page {} ({}/{})\".format(item_id, page, i+1, total_pages))\n",
    "                with open(os.path.join(out_dir, str(page) + \".jpg\"), \"wb\") as fp:\n",
    "                    fp.write(rsp.content)\n",
    "    \n",
    "    # return list of pages with 1+ picture blocks\n",
    "    return img_pages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# loop over our search results and call the function\n",
    "for item_id in vol_ids:\n",
    "    destination = os.path.join(\"items\", \"internetarchive\", item_id)\n",
    "    img_pages = ia_picture_download(item_id, out_dir=destination)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}