{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "813a86b3",
   "metadata": {},
   "source": [
    "# Querying and Downloading from the Internet Archive"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "027159cd",
   "metadata": {},
   "source": [
    "This worksheet shows how to query the Internet Archive with JSON and how to download from it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "id": "16257e0b",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Standard library\n",
    "import json\n",
    "import pprint\n",
    "import re\n",
    "import urllib2\n",
    "\n",
    "# Third-party\n",
    "import nltk\n",
    "\n",
    "# Short aliases used throughout the worksheet\n",
    "pp = pprint.PrettyPrinter(indent=4).pprint  # pretty-printer for nested structures\n",
    "Q = urllib2.quote  # percent-encode a URL component\n",
    "U = urllib2.unquote  # inverse of Q"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ed7fca79",
   "metadata": {},
   "source": [
    "We construct a URL-encoded query (can we also post JSON?)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "b09830f8",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "http://archive.org/advancedsearch.php?q=title%3A%28alice%20in%20wonderland%29%20AND%20format%3A%28djvu%29&fl%5B%5D=identifier&fl%5B%5D=source&fl%5B%5D=title&rows=100&page=1&output=json\n"
     ]
    }
   ],
   "source": [
    "query = Q(\"title:(alice in wonderland) AND format:(djvu)\")\n",
    "# Quote the key and the value separately. Quoting \"fl[]=identifier\" as a whole\n",
    "# also escapes the \"=\" (as %3D), so it is no longer a key=value parameter.\n",
    "fields = [\"identifier\", \"source\", \"title\"]\n",
    "columns = \"&\".join([Q(\"fl[]\")+\"=\"+Q(f) for f in fields])\n",
    "params = \"rows=100&page=1&output=json\"\n",
    "url = \"http://archive.org/advancedsearch.php?q=\"+query+\"&\"+columns+\"&\"+params\n",
    "print url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "id": "1e296144",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# could we also post the query?\n",
    "#jdata = json.dumps({\"username\":\"...\", \"password\":\"...\"})\n",
    "#urllib2.urlopen(\"http://www.example.com/\", jdata)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e53b48f4",
   "metadata": {},
   "source": [
    "Now we read and parse the response."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "id": "1bac2f60",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'start', u'numFound', u'docs']"
      ]
     },
     "execution_count": 129,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep the raw payload and the parsed result under separate names.\n",
    "raw_response = urllib2.urlopen(url).read()\n",
    "response = json.loads(raw_response)[\"response\"]\n",
    "response.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "id": "c19f9166",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "43"
      ]
     },
     "execution_count": 131,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "response[\"numFound\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "id": "f58eb244",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "43"
      ]
     },
     "execution_count": 132,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(response[\"docs\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2e9afd0a",
   "metadata": {},
   "source": [
    "Each doc contains a title and an identifier (we asked for those):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "id": "bd307dc0",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 Alice In Wonderland caralic\n",
      "1 Alice's Adventures In Wonderland AlicesAdventuresInWonderland\n",
      "2 Alice's Adventures in Wonderland AlicesAdventuresInWonderland_917\n",
      "3 Alice in Wonderland aliceinwonderla00carrgoog\n",
      "4 Alice's Adventures in Wonderland alicesadventure00jackgoog\n",
      "5 Alice's Adventures in Wonderland AlicesAdventuresInWonderland_841\n",
      "6 Alice's adventures in Wonderland alicesadventure00tenngoog\n",
      "7 Alice's Adventures in Wonderland alicesadventures00011gut\n",
      "8 Alice's adventures in Wonderland adventuresalices00carrrich\n",
      "9 Alice in Wonderland aliceinwonderlan00carriala\n"
     ]
    }
   ],
   "source": [
    "for i,e in enumerate(response[\"docs\"][:10]):\n",
    "    print i,e[\"title\"],e[\"identifier\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "id": "2a9dd657",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "u'AlicesAdventuresInWonderland_841'"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "identifier = response[\"docs\"][5][\"identifier\"]\n",
    "identifier"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4c6aa2b4",
   "metadata": {},
   "source": [
    "# Retrieving Details using the Identifier"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2d5f9448",
   "metadata": {},
   "source": [
    "Once we have the identifier for a document, we can retrieve more info about it,\n",
    "again in JSON."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "id": "d7504a13",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[u'files', u'misc', u'server', u'item', u'creativecommons', u'dir', u'metadata']\n",
      "ia701208.us.archive.org\n"
     ]
    }
   ],
   "source": [
    "# Keep the raw payload and the parsed result under separate names.\n",
    "raw_hit = urllib2.urlopen(\"http://archive.org/details/\"+Q(identifier)+\"?output=json\").read()\n",
    "hit = json.loads(raw_hit)\n",
    "print hit.keys()\n",
    "print hit[\"server\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1c3d32e1",
   "metadata": {},
   "source": [
    "We're particularly interested in the files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "id": "c78731ca",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[u'sha1', u'format', u'source', u'mtime', u'crc32', u'md5', u'original', u'size']\n"
     ]
    }
   ],
   "source": [
    "print hit[\"files\"].items()[0][1].keys()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dbcca733",
   "metadata": {},
   "source": [
    "The file list contains information about formats, sources, sizes, etc.\n",
    "We're looking for text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "id": "30ae9136",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "u'DjVu' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.djvu\n",
      "u'Abbyy GZ' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_abbyy.gz\n",
      "u'Image Container PDF' u'original' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.pdf\n",
      "u'Metadata' u'original' /AlicesAdventuresInWonderland_841_meta.xml\n",
      "u'Single Page Processed JP2 ZIP' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_jp2.zip\n",
      "u'DjVuTXT' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_djvu.txt\n",
      "u'Scandata' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_scandata.xml\n",
      "u'EPUB' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.epub\n",
      "u'Metadata' u'original' /AlicesAdventuresInWonderland_841_files.xml\n",
      "u'Animated GIF' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.gif\n",
      "u'Djvu XML' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_djvu.xml\n",
      "u'Additional Text PDF' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_text.pdf\n",
      "/86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_djvu.txt\n"
     ]
    }
   ],
   "source": [
    "fname = None\n",
    "for k,v in hit[\"files\"].items():\n",
    "    print repr(v[\"format\"]),repr(v[\"source\"]),k\n",
    "    if v[\"format\"]==\"DjVuTXT\": fname = k\n",
    "# Fail here with a clear message rather than later while building the download URL.\n",
    "assert fname is not None, \"no DjVuTXT file listed for \"+identifier\n",
    "print fname"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f9c6f7ad",
   "metadata": {},
   "source": [
    "# Retrieving the File"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e7c579fc",
   "metadata": {},
   "source": [
    "We can retrieve files from the `archive.org/download` URL, combining the identifier for the document and the specific file name."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "id": "cfedd3cf",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "u'\\n\\n\\n1 \\n\\n\\n\\n\\nwtx \\n\\n\\n\\n\\n\\n% \\xa7eb vtfy \"tired of $LM&$ \\nby nzr sisfer* ojl. tdthlmnh \\n\\ndo : once or \"twice, sit ka.A \\nfittfottL tufa i&& Irotk ktv \\nS^^Mt Si ^ r w <-7i. fi\\xa3r own niind, \\n^aS w&^ as S/te- Could, fa'"
      ]
     },
     "execution_count": 138,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# fname already starts with \"/\" (see the listing above). Percent-encode it too;\n",
    "# Q leaves \"/\" alone by default, so names with spaces or other reserved\n",
    "# characters still give a valid URL.\n",
    "download_url = \"http://archive.org/download/\"+Q(identifier)+Q(fname.encode(\"utf-8\"))\n",
    "text = urllib2.urlopen(download_url).read().decode(\"utf-8\")\n",
    "text[:400]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "68e95f62",
   "metadata": {},
   "source": [
    "We can now continue to process this text, for example with NLTK."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "id": "ad6c305b",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "tokens = nltk.tokenize.word_tokenize(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "id": "d0495f59",
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'.', u'\\u25a0', u'/I', u'nor', u'way', u'\"', u'to', u'Uar', u'^', u'U', u'T', u'^', u'Ub', u'say', u'~', u'6', u'>', u'rfs', u'\\xab', u'#', u\"'\", u'cU', u'\\xb1', u'r', u',', u'dear', u';', u'*', u'UtL', u'U', u'too', u'\\xa3', u'*', u'\\xa3', u'e', u'r', u'(', u'vji', u'*', u'n.', u'$', u'U', u'idLca.', u'3', u'i&', u'A', u'ovtr', u'*', u'.', u'\\xa3', u'&-', u'-WO-rcLS', u')', u'tir', u'occurred', u'&', u'A-', u'*', u'*', u'*', u'that', u's', u'^', u'e', u'oll', u'^', u'H', u'tfi', u'kavt', u'woTuLkfttL', u'at', u'-tiiis', u',', u'(', u'rat', u'ai', u'Ofb', u'tirae', u',', u'l&', u'alt', u'\\u2022', u'seemed', u'auitl', u'natural', u'}', u'>', u'bu.t', u'wAe', u'*', u'.', u'\\xb1', u'kt', u'raUit', u'actadly', u'-took', u'QL', u'w', u'atch.', u'out']"
      ]
     },
     "execution_count": 140,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokens[200:300]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3a8be5e",
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {},
 "nbformat": 4,
 "nbformat_minor": 5
}