{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "813a86b3",
   "metadata": {},
   "source": [
    "# Querying and Downloading from the Internet Archive"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "027159cd",
   "metadata": {},
   "source": [
    "This worksheet shows how to query the Internet Archive with JSON and how to download from it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "id": "16257e0b",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import urllib2\n",
    "import json\n",
    "import re\n",
    "import pprint\n",
    "pp = pprint.PrettyPrinter(indent=4).pprint\n",
    "Q = urllib2.quote\n",
    "U = urllib2.unquote"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ed7fca79",
   "metadata": {},
   "source": [
    "We construct a url-encoded query (can we also post JSON?)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "b09830f8",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "http://archive.org/advancedsearch.php?q=title%3A%28alice%20in%20wonderland%29%20AND%20format%3A%28djvu%29&fl%5B%5D%3Didentifier&fl%5B%5D%3Dsource&fl%5B%5D%3Dtitle&rows=100&page=1&output=json\n"
     ]
    }
   ],
   "source": [
    "query = Q(\"title:(alice in wonderland) AND format:(djvu)\")\n",
    "columns = \"&\".join([Q(s) for s in \"fl[]=identifier fl[]=source fl[]=title\".split()])\n",
    "params = \"rows=100&page=1&output=json\"\n",
    "url = \"http://archive.org/advancedsearch.php?q=\"+query+\"&\"+columns+\"&\"+params\n",
    "print url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "id": "1e296144",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# could we also post the query?\n",
    "#jdata = json.dumps({\"username\":\"...\", \"password\":\"...\"})\n",
    "#urllib2.urlopen(\"http://www.example.com/\", jdata)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e53b48f4",
   "metadata": {},
   "source": [
    "Now we read and parse the response."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "id": "1bac2f60",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'start', u'numFound', u'docs']"
      ]
     },
     "execution_count": 129,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "response = urllib2.urlopen(url).read()\n",
    "response = json.loads(response)[\"response\"]\n",
    "response.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "id": "c19f9166",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "43"
      ]
     },
     "execution_count": 131,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "response[\"numFound\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "id": "f58eb244",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "43"
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(response[\"docs\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2e9afd0a",
   "metadata": {},
   "source": [
    "Each doc contains a title and an identifier (we asked for those):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "id": "bd307dc0",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 Alice In Wonderland caralic\n",
      "1 Alice's Adventures In Wonderland AlicesAdventuresInWonderland\n",
      "2 Alice's Adventures in Wonderland AlicesAdventuresInWonderland_917\n",
      "3 Alice in Wonderland aliceinwonderla00carrgoog\n",
      "4 Alice's Adventures in Wonderland alicesadventure00jackgoog\n",
      "5 Alice's Adventures in Wonderland AlicesAdventuresInWonderland_841\n",
      "6 Alice's adventures in Wonderland alicesadventure00tenngoog\n",
      "7 Alice's Adventures in Wonderland alicesadventures00011gut\n",
      "8 Alice's adventures in Wonderland adventuresalices00carrrich\n",
      "9 Alice in Wonderland aliceinwonderlan00carriala\n"
     ]
    }
   ],
   "source": [
    "for i,e in enumerate(response[\"docs\"][:10]):\n",
    "    print i,e[\"title\"],e[\"identifier\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "id": "2a9dd657",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "u'AlicesAdventuresInWonderland_841'"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "identifier = response[\"docs\"][5][\"identifier\"]\n",
    "identifier"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4c6aa2b4",
   "metadata": {},
   "source": [
    "# Retrieving Details using the Identifier"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2d5f9448",
   "metadata": {},
   "source": [
    "Once we have the identifier for a document, we can retrieve more info about it,\n",
    "again in JSON."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "id": "d7504a13",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[u'files', u'misc', u'server', u'item', u'creativecommons', u'dir', u'metadata']\n",
      "ia701208.us.archive.org\n"
     ]
    }
   ],
   "source": [
    "hit = urllib2.urlopen(\"http://archive.org/details/\"+Q(identifier)+\"?output=json\").read()\n",
    "hit = json.loads(hit)\n",
    "print hit.keys()\n",
    "print hit[\"server\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1c3d32e1",
   "metadata": {},
   "source": [
    "We're particularly interested in the files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "id": "c78731ca",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[u'sha1', u'format', u'source', u'mtime', u'crc32', u'md5', u'original', u'size']\n"
     ]
    }
   ],
   "source": [
    "print hit[\"files\"].items()[0][1].keys()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dbcca733",
   "metadata": {},
   "source": [
    "The file list contains information about formats, sources, sizes, etc.\n",
    "We're looking for text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "id": "30ae9136",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "u'DjVu' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.djvu\n",
      "u'Abbyy GZ' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_abbyy.gz\n",
      "u'Image Container PDF' u'original' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.pdf\n",
      "u'Metadata' u'original' /AlicesAdventuresInWonderland_841_meta.xml\n",
      "u'Single Page Processed JP2 ZIP' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_jp2.zip\n",
      "u'DjVuTXT' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_djvu.txt\n",
      "u'Scandata' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_scandata.xml\n",
      "u'EPUB' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.epub\n",
      "u'Metadata' u'original' /AlicesAdventuresInWonderland_841_files.xml\n",
      "u'Animated GIF' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.gif\n",
      "u'Djvu XML' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_djvu.xml\n",
      "u'Additional Text PDF' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_text.pdf\n",
      "/86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_djvu.txt\n"
     ]
    }
   ],
   "source": [
    "fname = None\n",
    "for k,v in hit[\"files\"].items():\n",
    "    print repr(v[\"format\"]),repr(v[\"source\"]),k\n",
    "    if v[\"format\"]==\"DjVuTXT\": fname = k\n",
    "print fname"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f9c6f7ad",
   "metadata": {},
   "source": [
    "# Retrieving the File"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e7c579fc",
   "metadata": {},
   "source": [
    "We can retrieve files from the `archive.org/download` URL, combining the identifier for the document and the specific file name."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "id": "cfedd3cf",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "u'\\n\\n\\n1 \\n\\n\\n\\n\\nwtx \\n\\n\\n\\n\\n\\n% \\xa7eb vtfy \"tired of $LM&$ \\nby nzr sisfer* ojl. tdthlmnh \\n\\ndo : once or \"twice, sit ka.A \\nfittfottL tufa i&& Irotk ktv \\nS^^Mt Si ^ r w<t ^ riding, frat it \\nk<U Ko pictures or conversation*- in lt } ajruL wh&*. is tfa, \\nU&& of a- (rook t -ikoiL^kir Alice, , wii&out- pictures &<r can.* \\n-VtrScrtio-ns t So ska, MCLS cons.ttle.rino> <-7i. fi\\xa3r own niind, \\n^aS w&^ as S/te- Could, fa'"
      ]
     },
     "execution_count": 138,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = urllib2.urlopen(\"http://archive.org/download/\"+Q(identifier)+fname).read()\n",
    "text = text.decode(\"utf-8\")\n",
    "text[:400]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "68e95f62",
   "metadata": {},
   "source": [
    "We can now continue to process this text, for example with NLTK."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "id": "ad6c305b",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import nltk\n",
    "tokens = nltk.tokenize.word_tokenize(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "id": "d0495f59",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'.', u'\\u25a0', u'/I', u'nor', u'way', u'\"', u'to', u'Uar', u'^', u'U', u'T', u'^', u'Ub', u'say', u'~', u'6', u'>', u'rfs', u'\\xab', u'#', u\"'\", u'cU', u'\\xb1', u'r', u',', u'dear', u';', u'*', u'UtL', u'U', u'too', u'\\xa3', u'*', u'\\xa3', u'e', u'r', u'(', u'vji', u'*', u'n.', u'$', u'U', u'idLca.', u'3', u'i&', u'A', u'ovtr', u'*', u'.', u'\\xa3', u'&-', u'-WO-rcLS', u')', u'tir', u'occurred', u'&', u'A-', u'*', u'*', u'*', u'that', u's', u'^', u'e', u'oll', u'^', u'H', u'tfi', u'kavt', u'woTuLkfttL', u'at', u'-tiiis', u',', u'(', u'rat', u'ai', u'Ofb', u'tirae', u',', u'l&', u'alt', u'\\u2022', u'seemed', u'auitl', u'natural', u'}', u'>', u'bu.t', u'wAe', u'*', u'.', u'\\xb1', u'kt', u'raUit', u'actadly', u'-took', u'QL', u'w', u'atch.', u'out']"
      ]
     },
     "execution_count": 140,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokens[200:300]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3a8be5e",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}