{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Import the IA Python library\n", "import internetarchive as ia\n", "\n", "# Replace placeholder strings with your IA credentials (leaving the quote marks)\n", "ia_email = \"YOUR_EMAIL_HERE\"\n", "ia_password = \"YOUR_PASSWORD_HERE\"\n", "\n", "# add these credentials to the API's configuration object\n", "ia.configure(ia_email, ia_password)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# the requests library installed through conda\n", "import requests\n", "\n", "# a few other imports from the Python standard library\n", "import gzip\n", "import os\n", "import sys\n", "import xml.etree.ElementTree as ET" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sample search (should yield two results)\n", "query = \"peter parley date:[1825 TO 1830] mediatype:texts\"\n", "vol_ids = [result['identifier'] for result in ia.search_items(query)]\n", "vol_ids" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# define a function for downloading pictures from a given IA volume\n", "def ia_picture_download(item_id, out_dir=None):\n", " \"\"\"\n", " :param item_id: unique Internet Archive volume identifier\n", " :param out_dir: destination for images; if None, no download\n", " \n", " Note: if supplied, out_dir must be an existing directory and\n", " the caller must have write permissions in that directory\n", " \n", " :rtype list of pages with one or more blockType=Picture in Abbyy OCR data\n", " \"\"\"\n", "\n", " print(\"[{}] Starting processing\".format(item_id))\n", " \n", " # Use command-line client to see available metadata formats:\n", " # `ia metadata formats VOLUME_ID`\n", " \n", " # for this lesson, only the Abbyy file is needed\n", " returned_files = list(ia.get_files(item_id, formats=[\"Abbyy GZ\"]))\n", " \n", " # make sure something got returned\n", " if len(returned_files) > 0:\n", " abbyy_file = returned_files[0].name\n", " else:\n", " print(\"[{}] Could not get Abbyy file\".format(item_id))\n", " return None\n", " \n", " # download the abbyy file to CWD\n", " ia.download(item_id, formats=[\"Abbyy GZ\"], ignore_existing=True, destdir=os.getcwd(), no_directory=True)\n", " \n", " # collect the pages with at least one picture block\n", " img_pages = []\n", " \n", " with gzip.open(abbyy_file) as fp:\n", " tree = ET.parse(fp)\n", " document = tree.getroot()\n", " for i, page in enumerate(document):\n", " for block in page:\n", " try:\n", " if block.attrib['blockType'] == 'Picture':\n", " img_pages.append(i)\n", " break\n", " except KeyError:\n", " continue\n", " \n", " # 0 is not a valid page for making GET requests to IA,\n", " #yet sometimes it's in the zipped Abbyy file\n", " img_pages = [page for page in img_pages if page > 0]\n", " \n", " # track for download progress report\n", " total_pages = len(img_pages)\n", "\n", " # OCR files are huge, so just delete once we have pagelist\n", " os.remove(abbyy_file)\n", " \n", " # if out_dir is not None, then also download page images\n", " if out_dir:\n", " \n", " # return if folder already exists (reasonable inference that volume already processed)\n", " if os.path.isdir(out_dir):\n", " print(\"[{}] Directory already exists.\".format(item_id))\n", " return img_pages\n", "\n", " # otherwise, create folder to put the images\n", " print(\"[{}] Making directory {}\".format(item_id, out_dir))\n", " 
    "        os.makedirs(out_dir)\n",
    "\n",
    "        # https://iiif.archivelab.org/iiif/documentation\n",
    "        urls = [\"https://iiif.archivelab.org/iiif/{}${}/full/full/0/default.jpg\".format(item_id, page) for page in img_pages]\n",
    "\n",
    "        # there is no direct page-image download in the API, so request each IIIF URL ourselves\n",
    "        for i, (page, url) in enumerate(zip(img_pages, urls)):\n",
    "            rsp = requests.get(url, allow_redirects=True)\n",
    "            if rsp.status_code == 200:\n",
    "                print(\"[{}] Downloading page {} ({}/{})\".format(item_id, page, i+1, total_pages))\n",
    "                with open(os.path.join(out_dir, str(page) + \".jpg\"), \"wb\") as fp:\n",
    "                    fp.write(rsp.content)\n",
    "\n",
    "    # return the list of pages with 1+ picture blocks\n",
    "    return img_pages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# loop over our search results and call the function\n",
    "for item_id in vol_ids:\n",
    "    destination = os.path.join(\"items\", \"internetarchive\", item_id)\n",
    "    img_pages = ia_picture_download(item_id, out_dir=destination)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}