{ "metadata": { "name": "", "signature": "sha256:3a88faa0f425ee2c94f0c91a8057e7b2426c628436e9fb72d5ed17a570253415" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "HackFSM\n", "\n", "Relationship to other public APIs based on Solr?\n", "\n", "* http://www.hathitrust.org/htrc/solr-api\n", "* http://api.plos.org/solr/search-fields/\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Documentation:\n", " \n", "http://digitalhumanities.berkeley.edu/hackfsm/api/detail" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from settings import (HACKFSM_ID, HACKFSM_KEY, HACKFSM_BASEURL)\n", "from itertools import islice\n", "\n", "import logging\n", "import requests\n", "import json\n", "import urllib\n", "import urlparse\n", "\n", "from pandas import DataFrame, Series\n", "import pandas as pd\n", "import numpy as np\n", "\n", "logging.basicConfig(filename='Experiment_20140325_HackFSM.log',level=logging.WARNING)\n", "logger=logging.getLogger()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "def query(q, fl=\"id\"):\n", " url = \"{base_url}?\".format(base_url=HACKFSM_BASEURL) + \\\n", " urllib.urlencode({'q':q,\n", " 'fl':fl,\n", " 'wt':'json',\n", " 'app_id':HACKFSM_ID,\n", " 'app_key':HACKFSM_KEY})\n", " r = requests.get(url)\n", " return r.json()\n", " " ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "result = query(q=\"fsmTitle:Savio\")['response']\n", "result" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 3, "text": [ "{u'docs': [{u'id': u'ark:/13030/ft2f59n853'},\n", " {u'id': u'access143'},\n", " {u'id': u'ark:/13030/tf2q2n99d3'},\n", " {u'id': u'ark:/13030/tf3p3003k7'},\n", " {u'id': u'ark:/13030/tf5m3nb15b'},\n", " {u'id': u'ark:/13030/tf267n996q'},\n", " {u'id': u'access326'},\n", " {u'id': u'access327'},\n", " {u'id': u'access328'},\n", " {u'id': u'access329'},\n", " {u'id': u'access330'},\n", " {u'id': u'access331'},\n", " {u'id': u'access332'},\n", " {u'id': u'access333'},\n", " {u'id': u'access334'},\n", " {u'id': u'access335'},\n", " {u'id': u'access339'},\n", " {u'id': u'access340'},\n", " {u'id': u'access341'},\n", " {u'id': u'access343'},\n", " {u'id': u'access344'},\n", " {u'id': u'access345'},\n", " {u'id': u'access346'},\n", " {u'id': u'access347'},\n", " {u'id': u'access348'},\n", " {u'id': u'access365'},\n", " {u'id': u'access366'},\n", " {u'id': u'access367'},\n", " {u'id': u'access369'},\n", " {u'id': u'access370'}],\n", " u'numFound': 124,\n", " u'start': 0}" ] } ], "prompt_number": 3 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Paging through results" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# try again\n", "# http://stackoverflow.com/a/5724453/7782\n", "# http://excess.org/article/2013/02/itergen1/\n", "\n", "\n", "class my_g(object):\n", " def __init__(self,max_count):\n", " self._remaining = range(max_count)\n", " self._len = max_count\n", " def __iter__(self):\n", " return self\n", " def __len__(self):\n", " return self._len\n", " def next(self):\n", " if not self._remaining:\n", " raise StopIteration\n", " return self._remaining.pop(0)\n", "\n", "g=my_g(10)\n", "print len(g)\n", "list(g)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "10\n" ] }, { "metadata": {}, "output_type": "pyout", "prompt_number": 4, "text": [ "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "class FSM(object):\n", " def __init__(self, q, fl=\"id\", start=0, rows=30,\n", " base_url=HACKFSM_BASEURL, app_id=HACKFSM_ID, app_key=HACKFSM_KEY):\n", " self.q = q\n", " self.fl = fl\n", " self.start = start\n", " self.rows = rows\n", " \n", " self.base_url = base_url\n", " self.app_id = app_id\n", " self.app_key = app_key\n", "\n", " # get first page and numfound\n", " self.cursor = start \n", " \n", " # get the first page\n", " result = self._get_page(q, fl, self.cursor, self.rows)\n", " self.numfound = result['response']['numFound']\n", " \n", " def _check_status(self,result):\n", " \"\"\"throw exception if non-zero status\"\"\"\n", " if result['responseHeader']['status'] != 0:\n", " raise FSMException(\"status: \" + str(result['responseHeader']['status']))\n", "\n", " def _get_page(self, q, fl, start, rows):\n", " result = self._call_api(q, fl, start, rows)\n", " \n", " # update current page\n", " self.page = result['response']['docs']\n", " self.page_len = len(self.page)\n", " \n", " return result\n", " \n", " def _call_api(self, q, fl, start, rows):\n", " url = \"{base_url}?\".format(base_url=self.base_url) + \\\n", " urllib.urlencode({'q':q,\n", " 'fl':fl,\n", " 'wt':'json',\n", " 'start':start,\n", " 'row':rows,\n", " 'app_id':self.app_id,\n", " 'app_key':self.app_key})\n", "\n", " result = requests.get(url).json()\n", " self._check_status(result)\n", " \n", " # check whether we're getting fewer records than expected\n", " if len(result['response']['docs']) < rows:\n", " # are we at the end of the results\n", " if start + len(result['response']['docs']) != self.numfound:\n", " logger.warning(\"url:{url}, numfound:{numfound}, start+len{start_plus_len}\".format(url=url,\n", " numfound=self.numfound,\n", " start_plus_len=start + len(result['response']['docs'])))\n", " \n", " \n", " return result\n", "\n", " def __iter__(self):\n", " return self\n", " def __len__(self):\n", " return self.numfound\n", " def next(self):\n", " if not self.page:\n", " # retrieve next page and check whether there's anything left\n", " self.cursor += self.page_len\n", " result = self._get_page(self.q, self.fl, self.cursor, self.rows)\n", " \n", " if self.page_len == 0:\n", " raise StopIteration\n", " \n", " return self.page.pop(0)\n", " " ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "fsm = FSM(\"-fsmTeiUrl:[* TO *]\", fl=\"id,fsmTitle,fsmImageUrl,fsmDateCreated\")" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "len(fsm)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ "685" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "results = list(islice(fsm,None))\n", "results[:10]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ "[{u'fsmDateCreated': [u'Nov. 9, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9r90',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005j9s0j'],\n", " u'fsmTitle': [u'Professor John Searle speaking to crowd.'],\n", " u'id': u'ark:/13030/ft6k40080h'},\n", " {u'fsmDateCreated': [u'Dec. 2, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k2842',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005k285m'],\n", " u'fsmTitle': [u'Mario Savio speaking with reporters.'],\n", " u'id': u'ark:/13030/tf009n97vn'},\n", " {u'fsmDateCreated': [u'Dec. 2, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k2c2h',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005k2c32'],\n", " u'fsmTitle': [u'Joan Baez singing in front of Sproul Hall.'],\n", " u'id': u'ark:/13030/tf5j49n838'},\n", " {u'fsmDateCreated': [u'Dec. 3, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9z5w',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005j9z6f'],\n", " u'fsmTitle': [u'Girl student being booked on campus before being taken to jail.'],\n", " u'id': u'ark:/13030/ft700007tc'},\n", " {u'fsmDateCreated': [u'Oct. 5, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9n7b',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005j9n8w'],\n", " u'fsmTitle': [u'Bryan Turner speaking.'],\n", " u'id': u'ark:/13030/ft7n39p1mr'},\n", " {u'fsmDateCreated': [u'Nov. 9, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k1b6q',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005k1b78'],\n", " u'fsmTitle': [u'Steve Weissman speaking to crowd.'],\n", " u'id': u'ark:/13030/tf8w1006vp'},\n", " {u'fsmDateCreated': [u'Nov. 24, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9v37',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005j9v4s'],\n", " u'fsmTitle': [u'Professor Morris Hirsch speaking from Sproul steps.'],\n", " u'id': u'ark:/13030/ft9f59p3bw'},\n", " {u'fsmDateCreated': [u'Oct. 1, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k0v2s',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005k0v3b'],\n", " u'fsmTitle': [u'Crowd in Sproul Plaza.'],\n", " u'id': u'ark:/13030/tf0870010x'},\n", " {u'fsmDateCreated': [u'Dec. 3, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9z1p',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005j9z27'],\n", " u'fsmTitle': [u'Crowds in Sproul Plaza'],\n", " u'id': u'ark:/13030/ft8199p26d'},\n", " {u'fsmDateCreated': [u'Dec. 2, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9x7g',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005j9x81'],\n", " u'fsmTitle': [u'Professor David Hackett talking to his class.'],\n", " u'id': u'ark:/13030/ft9000102p'}]" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "df = DataFrame(results)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "len(df)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 10, "text": [ "685" ] } ], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "df.fsmImageUrl" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 11, "text": [ "0 [http://nma.berkeley.edu/ark:/28722/bk0005j9r9...\n", "1 [http://nma.berkeley.edu/ark:/28722/bk0005k284...\n", "2 [http://nma.berkeley.edu/ark:/28722/bk0005k2c2...\n", "3 [http://nma.berkeley.edu/ark:/28722/bk0005j9z5...\n", "4 [http://nma.berkeley.edu/ark:/28722/bk0005j9n7...\n", "5 [http://nma.berkeley.edu/ark:/28722/bk0005k1b6...\n", "6 [http://nma.berkeley.edu/ark:/28722/bk0005j9v3...\n", "7 [http://nma.berkeley.edu/ark:/28722/bk0005k0v2...\n", "8 [http://nma.berkeley.edu/ark:/28722/bk0005j9z1...\n", "9 [http://nma.berkeley.edu/ark:/28722/bk0005j9x7...\n", "10 [http://nma.berkeley.edu/ark:/28722/bk0005k232...\n", "11 [http://nma.berkeley.edu/ark:/28722/bk0005k047...\n", "12 [http://nma.berkeley.edu/ark:/28722/bk0005k1c8...\n", "13 [http://nma.berkeley.edu/ark:/28722/bk0005k110...\n", "14 [http://nma.berkeley.edu/ark:/28722/bk0005k276...\n", "...\n", "670 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "671 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "672 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "673 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "674 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "675 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "676 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "677 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "678 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "679 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "680 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "681 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "682 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "683 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "684 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "Name: fsmImageUrl, Length: 685, dtype: object" ] } ], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "from IPython.display import HTML\n", "from jinja2 import Template\n", "\n", "CSS = \"\"\"\n", "\n", "\"\"\"\n", "\n", "IMAGES_TEMPLATE = CSS + \"\"\"\n", "
\n", " | fsmCreator | \n", "fsmDateCreated | \n", "fsmIdentifier | \n", "fsmImageUrl | \n", "fsmNote | \n", "fsmPhysicalLocation | \n", "fsmRelatedIdentifier | \n", "fsmRelatedTitle | \n", "fsmTeiUrl | \n", "fsmTitle | \n", "fsmTypeOfResource | \n", "id | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "[Warren (Photographer)] | \n", "[Nov. 9, 1964] | \n", "[BANC PIC 1959.010 -- NEG pt.3 11-09-64.4] | \n", "[http://nma.berkeley.edu/ark:/28722/bk0005j9r9... | \n", "[Photographer] | \n", "[The Bancroft Library;;, University of Califor... | \n", "[http://bancroft.berkeley.edu/FSM/, BANC PIC 1... | \n", "[The Free Speech Movement Digital Archive, San... | \n", "NaN | \n", "[Professor John Searle speaking to crowd.] | \n", "[still image] | \n", "ark:/13030/ft6k40080h | \n", "
1 | \n", "[Steven Marcus] | \n", "[Dec. 2, 1964] | \n", "[BANC PIC 2000.002--NEG Strip 117:36] | \n", "[http://nma.berkeley.edu/ark:/28722/bk0005k284... | \n", "[Photographer] | \n", "[The Bancroft Library;;, University of Califor... | \n", "[http://bancroft.berkeley.edu/FSM/, BANC PIC 2... | \n", "[The Free Speech Movement Digital Archive, Ste... | \n", "NaN | \n", "[Mario Savio speaking with reporters.] | \n", "[still image] | \n", "ark:/13030/tf009n97vn | \n", "
2 | \n", "[Steven Marcus] | \n", "[Dec. 2, 1964] | \n", "[BANC PIC 2000.002--NEG Strip 122:42] | \n", "[http://nma.berkeley.edu/ark:/28722/bk0005k2c2... | \n", "[Photographer] | \n", "[The Bancroft Library;;, University of Califor... | \n", "[http://bancroft.berkeley.edu/FSM/, BANC PIC 2... | \n", "[The Free Speech Movement Digital Archive, Ste... | \n", "NaN | \n", "[Joan Baez singing in front of Sproul Hall.] | \n", "[still image] | \n", "ark:/13030/tf5j49n838 | \n", "
3 | \n", "[Jones (Photographer)] | \n", "[Dec. 3, 1964] | \n", "[BANC PIC 1959.010 -- NEG pt.3 12-03-64.2] | \n", "[http://nma.berkeley.edu/ark:/28722/bk0005j9z5... | \n", "[Photographer] | \n", "[The Bancroft Library;;, University of Califor... | \n", "[http://bancroft.berkeley.edu/FSM/, BANC PIC 1... | \n", "[The Free Speech Movement Digital Archive, San... | \n", "NaN | \n", "[Girl student being booked on campus before be... | \n", "[still image] | \n", "ark:/13030/ft700007tc | \n", "
4 | \n", "[Ingman (Photographer)] | \n", "[Oct. 5, 1964] | \n", "[BANC PIC 1959.010 -- NEG pt.3 10-05-64.4] | \n", "[http://nma.berkeley.edu/ark:/28722/bk0005j9n7... | \n", "[Photographer] | \n", "[The Bancroft Library;;, University of Califor... | \n", "[http://bancroft.berkeley.edu/FSM/, BANC PIC 1... | \n", "[The Free Speech Movement Digital Archive, San... | \n", "NaN | \n", "[Bryan Turner speaking.] | \n", "[still image] | \n", "ark:/13030/ft7n39p1mr | \n", "
5 rows \u00d7 12 columns
\n", "Int64Index([], dtype='int64') | \n", "Empty DataFrame | \n", "
0 rows \u00d7 11 columns
\n", "\n", " | fsmCreator | \n", "fsmDateCreated | \n", "fsmIdentifier | \n", "fsmImageUrl | \n", "fsmNote | \n", "fsmPhysicalLocation | \n", "fsmRelatedIdentifier | \n", "fsmRelatedTitle | \n", "fsmTitle | \n", "fsmTypeOfResource | \n", "id | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|
246 | \n", "[Hecker, Ron] | \n", "[Dec. 7, 1964] | \n", "NaN | \n", "[http://sunsite.berkeley.edu/FindingAids/dynaw... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "[Free Speech Movement Photographs Collection, ] | \n", "[Crowd in Sproul Plaza from Student Union balc... | \n", "NaN | \n", "UARC PIC 24B:2:22 | \n", "
247 | \n", "[Hecker, Ron] | \n", "[Dec. 7, 1964] | \n", "NaN | \n", "[http://sunsite.berkeley.edu/FindingAids/dynaw... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "[Free Speech Movement Photographs Collection, ] | \n", "[Crowd at Greek Theater] | \n", "NaN | \n", "UARC PIC 24B:2:17 | \n", "
248 | \n", "NaN | \n", "[Dec. 7, 1964] | \n", "NaN | \n", "[http://sunsite.berkeley.edu/FindingAids/dynaw... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "[Free Speech Movement Photographs Collection, ] | \n", "[View from inside Sproul Hall lobby looking th... | \n", "NaN | \n", "UARC PIC 24B:1:26 | \n", "
249 | \n", "[Hecker, Ron] | \n", "[Dec. 1964] | \n", "NaN | \n", "[http://sunsite.berkeley.edu/FindingAids/dynaw... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "[Free Speech Movement Photographs Collection, ] | \n", "[Student Strike] | \n", "NaN | \n", "UARC PIC 24B:2:6 | \n", "
250 | \n", "[Hecker, Ron] | \n", "[Dec. 7, 1964] | \n", "NaN | \n", "[http://sunsite.berkeley.edu/FindingAids/dynaw... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "[Free Speech Movement Photographs Collection, ] | \n", "[Crowd at Greek Theater] | \n", "NaN | \n", "UARC PIC 24B:2:21 | \n", "
5 rows \u00d7 11 columns
\n", "