{ "metadata": { "name": "", "signature": "sha256:3a88faa0f425ee2c94f0c91a8057e7b2426c628436e9fb72d5ed17a570253415" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "HackFSM\n", "\n", "Relationship to other public APIs based on Solr?\n", "\n", "* http://www.hathitrust.org/htrc/solr-api\n", "* http://api.plos.org/solr/search-fields/\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Documentation:\n", " \n", "http://digitalhumanities.berkeley.edu/hackfsm/api/detail" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from settings import (HACKFSM_ID, HACKFSM_KEY, HACKFSM_BASEURL)\n", "from itertools import islice\n", "\n", "import logging\n", "import requests\n", "import json\n", "import urllib\n", "import urlparse\n", "\n", "from pandas import DataFrame, Series\n", "import pandas as pd\n", "import numpy as np\n", "\n", "logging.basicConfig(filename='Experiment_20140325_HackFSM.log',level=logging.WARNING)\n", "logger=logging.getLogger()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "def query(q, fl=\"id\"):\n", " url = \"{base_url}?\".format(base_url=HACKFSM_BASEURL) + \\\n", " urllib.urlencode({'q':q,\n", " 'fl':fl,\n", " 'wt':'json',\n", " 'app_id':HACKFSM_ID,\n", " 'app_key':HACKFSM_KEY})\n", " r = requests.get(url)\n", " return r.json()\n", " " ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "result = query(q=\"fsmTitle:Savio\")['response']\n", "result" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 3, "text": [ "{u'docs': [{u'id': u'ark:/13030/ft2f59n853'},\n", " {u'id': u'access143'},\n", " {u'id': u'ark:/13030/tf2q2n99d3'},\n", " {u'id': u'ark:/13030/tf3p3003k7'},\n", " {u'id': u'ark:/13030/tf5m3nb15b'},\n", " {u'id': u'ark:/13030/tf267n996q'},\n", " {u'id': u'access326'},\n", " {u'id': u'access327'},\n", " {u'id': u'access328'},\n", " {u'id': u'access329'},\n", " {u'id': u'access330'},\n", " {u'id': u'access331'},\n", " {u'id': u'access332'},\n", " {u'id': u'access333'},\n", " {u'id': u'access334'},\n", " {u'id': u'access335'},\n", " {u'id': u'access339'},\n", " {u'id': u'access340'},\n", " {u'id': u'access341'},\n", " {u'id': u'access343'},\n", " {u'id': u'access344'},\n", " {u'id': u'access345'},\n", " {u'id': u'access346'},\n", " {u'id': u'access347'},\n", " {u'id': u'access348'},\n", " {u'id': u'access365'},\n", " {u'id': u'access366'},\n", " {u'id': u'access367'},\n", " {u'id': u'access369'},\n", " {u'id': u'access370'}],\n", " u'numFound': 124,\n", " u'start': 0}" ] } ], "prompt_number": 3 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Paging through results" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# try again\n", "# http://stackoverflow.com/a/5724453/7782\n", "# http://excess.org/article/2013/02/itergen1/\n", "\n", "\n", "class my_g(object):\n", " def __init__(self,max_count):\n", " self._remaining = range(max_count)\n", " self._len = max_count\n", " def __iter__(self):\n", " return self\n", " def __len__(self):\n", " return self._len\n", " def next(self):\n", " if not self._remaining:\n", " raise StopIteration\n", " return self._remaining.pop(0)\n", "\n", "g=my_g(10)\n", "print len(g)\n", "list(g)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "10\n" ] }, { "metadata": {}, "output_type": "pyout", "prompt_number": 4, "text": [ "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "class FSM(object):\n", " def __init__(self, q, fl=\"id\", start=0, rows=30,\n", " base_url=HACKFSM_BASEURL, app_id=HACKFSM_ID, app_key=HACKFSM_KEY):\n", " self.q = q\n", " self.fl = fl\n", " self.start = start\n", " self.rows = rows\n", " \n", " self.base_url = base_url\n", " self.app_id = app_id\n", " self.app_key = app_key\n", "\n", " # get first page and numfound\n", " self.cursor = start \n", " \n", " # get the first page\n", " result = self._get_page(q, fl, self.cursor, self.rows)\n", " self.numfound = result['response']['numFound']\n", " \n", " def _check_status(self,result):\n", " \"\"\"throw exception if non-zero status\"\"\"\n", " if result['responseHeader']['status'] != 0:\n", " raise FSMException(\"status: \" + str(result['responseHeader']['status']))\n", "\n", " def _get_page(self, q, fl, start, rows):\n", " result = self._call_api(q, fl, start, rows)\n", " \n", " # update current page\n", " self.page = result['response']['docs']\n", " self.page_len = len(self.page)\n", " \n", " return result\n", " \n", " def _call_api(self, q, fl, start, rows):\n", " url = \"{base_url}?\".format(base_url=self.base_url) + \\\n", " urllib.urlencode({'q':q,\n", " 'fl':fl,\n", " 'wt':'json',\n", " 'start':start,\n", " 'row':rows,\n", " 'app_id':self.app_id,\n", " 'app_key':self.app_key})\n", "\n", " result = requests.get(url).json()\n", " self._check_status(result)\n", " \n", " # check whether we're getting fewer records than expected\n", " if len(result['response']['docs']) < rows:\n", " # are we at the end of the results\n", " if start + len(result['response']['docs']) != self.numfound:\n", " logger.warning(\"url:{url}, numfound:{numfound}, start+len{start_plus_len}\".format(url=url,\n", " numfound=self.numfound,\n", " start_plus_len=start + len(result['response']['docs'])))\n", " \n", " \n", " return result\n", "\n", " def __iter__(self):\n", " return self\n", " def __len__(self):\n", " return self.numfound\n", " def next(self):\n", " if not self.page:\n", " # retrieve next page and check whether there's anything left\n", " self.cursor += self.page_len\n", " result = self._get_page(self.q, self.fl, self.cursor, self.rows)\n", " \n", " if self.page_len == 0:\n", " raise StopIteration\n", " \n", " return self.page.pop(0)\n", " " ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "fsm = FSM(\"-fsmTeiUrl:[* TO *]\", fl=\"id,fsmTitle,fsmImageUrl,fsmDateCreated\")" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "len(fsm)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ "685" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "results = list(islice(fsm,None))\n", "results[:10]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ "[{u'fsmDateCreated': [u'Nov. 9, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9r90',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005j9s0j'],\n", " u'fsmTitle': [u'Professor John Searle speaking to crowd.'],\n", " u'id': u'ark:/13030/ft6k40080h'},\n", " {u'fsmDateCreated': [u'Dec. 2, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k2842',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005k285m'],\n", " u'fsmTitle': [u'Mario Savio speaking with reporters.'],\n", " u'id': u'ark:/13030/tf009n97vn'},\n", " {u'fsmDateCreated': [u'Dec. 2, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k2c2h',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005k2c32'],\n", " u'fsmTitle': [u'Joan Baez singing in front of Sproul Hall.'],\n", " u'id': u'ark:/13030/tf5j49n838'},\n", " {u'fsmDateCreated': [u'Dec. 3, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9z5w',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005j9z6f'],\n", " u'fsmTitle': [u'Girl student being booked on campus before being taken to jail.'],\n", " u'id': u'ark:/13030/ft700007tc'},\n", " {u'fsmDateCreated': [u'Oct. 5, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9n7b',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005j9n8w'],\n", " u'fsmTitle': [u'Bryan Turner speaking.'],\n", " u'id': u'ark:/13030/ft7n39p1mr'},\n", " {u'fsmDateCreated': [u'Nov. 9, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k1b6q',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005k1b78'],\n", " u'fsmTitle': [u'Steve Weissman speaking to crowd.'],\n", " u'id': u'ark:/13030/tf8w1006vp'},\n", " {u'fsmDateCreated': [u'Nov. 24, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9v37',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005j9v4s'],\n", " u'fsmTitle': [u'Professor Morris Hirsch speaking from Sproul steps.'],\n", " u'id': u'ark:/13030/ft9f59p3bw'},\n", " {u'fsmDateCreated': [u'Oct. 1, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005k0v2s',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005k0v3b'],\n", " u'fsmTitle': [u'Crowd in Sproul Plaza.'],\n", " u'id': u'ark:/13030/tf0870010x'},\n", " {u'fsmDateCreated': [u'Dec. 3, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9z1p',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005j9z27'],\n", " u'fsmTitle': [u'Crowds in Sproul Plaza'],\n", " u'id': u'ark:/13030/ft8199p26d'},\n", " {u'fsmDateCreated': [u'Dec. 2, 1964'],\n", " u'fsmImageUrl': [u'http://nma.berkeley.edu/ark:/28722/bk0005j9x7g',\n", " u'http://nma.berkeley.edu/ark:/28722/bk0005j9x81'],\n", " u'fsmTitle': [u'Professor David Hackett talking to his class.'],\n", " u'id': u'ark:/13030/ft9000102p'}]" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "df = DataFrame(results)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "len(df)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 10, "text": [ "685" ] } ], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "df.fsmImageUrl" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 11, "text": [ "0 [http://nma.berkeley.edu/ark:/28722/bk0005j9r9...\n", "1 [http://nma.berkeley.edu/ark:/28722/bk0005k284...\n", "2 [http://nma.berkeley.edu/ark:/28722/bk0005k2c2...\n", "3 [http://nma.berkeley.edu/ark:/28722/bk0005j9z5...\n", "4 [http://nma.berkeley.edu/ark:/28722/bk0005j9n7...\n", "5 [http://nma.berkeley.edu/ark:/28722/bk0005k1b6...\n", "6 [http://nma.berkeley.edu/ark:/28722/bk0005j9v3...\n", "7 [http://nma.berkeley.edu/ark:/28722/bk0005k0v2...\n", "8 [http://nma.berkeley.edu/ark:/28722/bk0005j9z1...\n", "9 [http://nma.berkeley.edu/ark:/28722/bk0005j9x7...\n", "10 [http://nma.berkeley.edu/ark:/28722/bk0005k232...\n", "11 [http://nma.berkeley.edu/ark:/28722/bk0005k047...\n", "12 [http://nma.berkeley.edu/ark:/28722/bk0005k1c8...\n", "13 [http://nma.berkeley.edu/ark:/28722/bk0005k110...\n", "14 [http://nma.berkeley.edu/ark:/28722/bk0005k276...\n", "...\n", "670 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "671 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "672 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "673 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "674 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "675 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "676 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "677 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "678 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "679 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "680 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "681 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "682 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "683 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "684 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "Name: fsmImageUrl, Length: 685, dtype: object" ] } ], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "from IPython.display import HTML\n", "from jinja2 import Template\n", "\n", "CSS = \"\"\"\n", "\n", "\"\"\"\n", "\n", "IMAGES_TEMPLATE = CSS + \"\"\"\n", "
\n", " {% for item in items %}{% endfor %}\n", "
\n", "\"\"\"\n", " \n", "template = Template(IMAGES_TEMPLATE)\n", "HTML(template.render(items=results[:10])) \n" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "\n", "\n", "\n", "
\n", " \n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 12, "text": [ "" ] } ], "prompt_number": 12 }, { "cell_type": "markdown", "metadata": {}, "source": [ "# DISTINGUISHING IMAGES FROM DOCUMENTS\n", "\n", "To programmatically differentiate records that describe images from records that describe TEI-encoded XML documents, the API permits queries that exclude records with NULL values in the \"unwanted\" Url field.\n", " \n", "That is, to retrieve TEI documents only, one would query for null values in the `fsmImageUrl` field. To retrieve images only, one would query for null values in the `fsmTeiUrl` field.\n", " \n", "NOTE: Please observe the hyphen prepended to the field names in the examples below. The hyphen (minus sign) functions here as a NOT operator.\n", " \n", "Example that selects for TEI encoded XML documents by excluding null values of `fsmImageUrl`:\n", " \n", " https:///solr/fsm/select?q=-fsmImageUrl:[* TO *]&wt=json&indent=true&app_id=abcdefgh&app_key=12345678901234567890123456789012\n", " \n", "Example that selects for images by excluding null values of fsmTeiUrl:\n", " \n", " https:///solr/fsm/select?q=-fsmTeiUrl:[* TO *]&wt=json&indent=true&app_id=abcdefgh&app_key=12345678901234567890123456789012" ] }, { "cell_type": "code", "collapsed": false, "input": [ "# TEI-encoded docs\n", "\n", "len(FSM(\"-fsmImageUrl:[* TO *]\"))\n" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 13, "text": [ "194" ] } ], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "# images\n", "\n", "len(FSM(\"-fsmTeiUrl:[* TO *]\", fl=\"id,fsmImageUrl\"))" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 14, "text": [ "685" ] } ], "prompt_number": 14 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Studying the API parameters" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from lxml.html import parse, fromstring\n", "from collections import OrderedDict\n", "\n", "api_docs_url = \"http://digitalhumanities.berkeley.edu/hackfsm/api/detail\"\n", "r = requests.get(api_docs_url).content\n", "doc = fromstring(r)\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "rows = doc.xpath('//div[@id=\"content\"]/article/div/div/div/table[1]//tr')\n", "headers = [col.text_content().strip() for col in rows[0].findall('td')]\n", "headers" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 16, "text": [ "['Field Name', 'Definitions']" ] } ], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [ "fields = []\n", "\n", "for row in rows[1:]:\n", " field = [col.text_content().strip() for col in row.findall('td')]\n", " fields.append(field)\n", " \n", "fsmfields = OrderedDict(fields)\n", "fsmfields.keys()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 17, "text": [ "['id',\n", " 'fsmTitle',\n", " 'fsmCreator',\n", " 'fsmTypeOfResource',\n", " 'fsmDateCreated',\n", " 'fsmNote',\n", " 'fsmRelatedTitle',\n", " 'fsmIdentifier',\n", " 'fsmRelatedIdentifier',\n", " 'fsmPhysicalLocation',\n", " 'fsmImageUrl',\n", " 'fsmTeiUrl']" ] } ], "prompt_number": 17 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Study all the records" ] }, { "cell_type": "code", "collapsed": false, "input": [ "fsm = FSM(q=\"*\",fl=\",\".join(fsmfields.keys()))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "len(fsm)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 19, "text": [ "879" ] } ], "prompt_number": 19 }, { "cell_type": "code", "collapsed": false, "input": [ "df = DataFrame(list(fsm))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 20 }, { "cell_type": "code", "collapsed": false, "input": [ "len(df)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 21, "text": [ "879" ] } ], "prompt_number": 21 }, { "cell_type": "code", "collapsed": false, "input": [ "df.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fsmCreatorfsmDateCreatedfsmIdentifierfsmImageUrlfsmNotefsmPhysicalLocationfsmRelatedIdentifierfsmRelatedTitlefsmTeiUrlfsmTitlefsmTypeOfResourceid
0 [Warren (Photographer)] [Nov. 9, 1964] [BANC PIC 1959.010 -- NEG pt.3 11-09-64.4] [http://nma.berkeley.edu/ark:/28722/bk0005j9r9... [Photographer] [The Bancroft Library;;, University of Califor... [http://bancroft.berkeley.edu/FSM/, BANC PIC 1... [The Free Speech Movement Digital Archive, San... NaN [Professor John Searle speaking to crowd.] [still image] ark:/13030/ft6k40080h
1 [Steven Marcus] [Dec. 2, 1964] [BANC PIC 2000.002--NEG Strip 117:36] [http://nma.berkeley.edu/ark:/28722/bk0005k284... [Photographer] [The Bancroft Library;;, University of Califor... [http://bancroft.berkeley.edu/FSM/, BANC PIC 2... [The Free Speech Movement Digital Archive, Ste... NaN [Mario Savio speaking with reporters.] [still image] ark:/13030/tf009n97vn
2 [Steven Marcus] [Dec. 2, 1964] [BANC PIC 2000.002--NEG Strip 122:42] [http://nma.berkeley.edu/ark:/28722/bk0005k2c2... [Photographer] [The Bancroft Library;;, University of Califor... [http://bancroft.berkeley.edu/FSM/, BANC PIC 2... [The Free Speech Movement Digital Archive, Ste... NaN [Joan Baez singing in front of Sproul Hall.] [still image] ark:/13030/tf5j49n838
3 [Jones (Photographer)] [Dec. 3, 1964] [BANC PIC 1959.010 -- NEG pt.3 12-03-64.2] [http://nma.berkeley.edu/ark:/28722/bk0005j9z5... [Photographer] [The Bancroft Library;;, University of Califor... [http://bancroft.berkeley.edu/FSM/, BANC PIC 1... [The Free Speech Movement Digital Archive, San... NaN [Girl student being booked on campus before be... [still image] ark:/13030/ft700007tc
4 [Ingman (Photographer)] [Oct. 5, 1964] [BANC PIC 1959.010 -- NEG pt.3 10-05-64.4] [http://nma.berkeley.edu/ark:/28722/bk0005j9n7... [Photographer] [The Bancroft Library;;, University of Califor... [http://bancroft.berkeley.edu/FSM/, BANC PIC 1... [The Free Speech Movement Digital Archive, San... NaN [Bryan Turner speaking.] [still image] ark:/13030/ft7n39p1mr
\n", "

5 rows \u00d7 12 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 22, "text": [ " fsmCreator fsmDateCreated \\\n", "0 [Warren (Photographer)] [Nov. 9, 1964] \n", "1 [Steven Marcus] [Dec. 2, 1964] \n", "2 [Steven Marcus] [Dec. 2, 1964] \n", "3 [Jones (Photographer)] [Dec. 3, 1964] \n", "4 [Ingman (Photographer)] [Oct. 5, 1964] \n", "\n", " fsmIdentifier \\\n", "0 [BANC PIC 1959.010 -- NEG pt.3 11-09-64.4] \n", "1 [BANC PIC 2000.002--NEG Strip 117:36] \n", "2 [BANC PIC 2000.002--NEG Strip 122:42] \n", "3 [BANC PIC 1959.010 -- NEG pt.3 12-03-64.2] \n", "4 [BANC PIC 1959.010 -- NEG pt.3 10-05-64.4] \n", "\n", " fsmImageUrl fsmNote \\\n", "0 [http://nma.berkeley.edu/ark:/28722/bk0005j9r9... [Photographer] \n", "1 [http://nma.berkeley.edu/ark:/28722/bk0005k284... [Photographer] \n", "2 [http://nma.berkeley.edu/ark:/28722/bk0005k2c2... [Photographer] \n", "3 [http://nma.berkeley.edu/ark:/28722/bk0005j9z5... [Photographer] \n", "4 [http://nma.berkeley.edu/ark:/28722/bk0005j9n7... [Photographer] \n", "\n", " fsmPhysicalLocation \\\n", "0 [The Bancroft Library;;, University of Califor... \n", "1 [The Bancroft Library;;, University of Califor... \n", "2 [The Bancroft Library;;, University of Califor... \n", "3 [The Bancroft Library;;, University of Califor... \n", "4 [The Bancroft Library;;, University of Califor... \n", "\n", " fsmRelatedIdentifier \\\n", "0 [http://bancroft.berkeley.edu/FSM/, BANC PIC 1... \n", "1 [http://bancroft.berkeley.edu/FSM/, BANC PIC 2... \n", "2 [http://bancroft.berkeley.edu/FSM/, BANC PIC 2... \n", "3 [http://bancroft.berkeley.edu/FSM/, BANC PIC 1... \n", "4 [http://bancroft.berkeley.edu/FSM/, BANC PIC 1... \n", "\n", " fsmRelatedTitle fsmTeiUrl \\\n", "0 [The Free Speech Movement Digital Archive, San... NaN \n", "1 [The Free Speech Movement Digital Archive, Ste... NaN \n", "2 [The Free Speech Movement Digital Archive, Ste... NaN \n", "3 [The Free Speech Movement Digital Archive, San... NaN \n", "4 [The Free Speech Movement Digital Archive, San... NaN \n", "\n", " fsmTitle fsmTypeOfResource \\\n", "0 [Professor John Searle speaking to crowd.] [still image] \n", "1 [Mario Savio speaking with reporters.] [still image] \n", "2 [Joan Baez singing in front of Sproul Hall.] [still image] \n", "3 [Girl student being booked on campus before be... [still image] \n", "4 [Bryan Turner speaking.] [still image] \n", "\n", " id \n", "0 ark:/13030/ft6k40080h \n", "1 ark:/13030/tf009n97vn \n", "2 ark:/13030/tf5j49n838 \n", "3 ark:/13030/ft700007tc \n", "4 ark:/13030/ft7n39p1mr \n", "\n", "[5 rows x 12 columns]" ] } ], "prompt_number": 22 }, { "cell_type": "code", "collapsed": false, "input": [ "# TEI URIs\n", "\n", "len(list(df[~df.fsmTeiUrl.isnull()].fsmTeiUrl.apply(lambda a: a[0])))" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 23, "text": [ "194" ] } ], "prompt_number": 23 }, { "cell_type": "code", "collapsed": false, "input": [ "# null dates\n", "\n", "len(df[df.fsmDateCreated.isnull()])" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 24, "text": [ "393" ] } ], "prompt_number": 24 }, { "cell_type": "code", "collapsed": false, "input": [ "# non-null image URLs\n", "len(df[~df.fsmImageUrl.isnull()])" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 25, "text": [ "685" ] } ], "prompt_number": 25 }, { "cell_type": "code", "collapsed": false, "input": [ "df[~df.fsmImageUrl.isnull()].id" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 26, "text": [ "0 ark:/13030/ft6k40080h\n", "1 ark:/13030/tf009n97vn\n", "2 ark:/13030/tf5j49n838\n", "3 ark:/13030/ft700007tc\n", "4 ark:/13030/ft7n39p1mr\n", "5 ark:/13030/tf8w1006vp\n", "6 ark:/13030/ft9f59p3bw\n", "7 ark:/13030/tf0870010x\n", "8 ark:/13030/ft8199p26d\n", "9 ark:/13030/ft9000102p\n", "10 ark:/13030/tf7n39n9qb\n", "11 ark:/13030/ft3c6004k4\n", "12 ark:/13030/tf8n39p05g\n", "13 ark:/13030/tf20000235\n", "14 ark:/13030/tf0d5n97ws\n", "...\n", "670 access369\n", "671 access370\n", "672 access371\n", "673 access372\n", "674 access373\n", "675 access374\n", "676 access375\n", "677 access376\n", "678 access377\n", "679 access378\n", "680 access379\n", "681 access380\n", "682 access381\n", "683 access382\n", "684 access383\n", "Name: id, Length: 685, dtype: object" ] } ], "prompt_number": 26 }, { "cell_type": "code", "collapsed": false, "input": [ "# distribution of number of image URLs\n", "\n", "df[~df.fsmImageUrl.isnull()].fsmImageUrl.apply(len).value_counts()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 27, "text": [ "2 628\n", "3 56\n", "4 1\n", "dtype: int64" ] } ], "prompt_number": 27 }, { "cell_type": "code", "collapsed": false, "input": [ "# let's crawl for images\n", "\n", "results_images = list(FSM(\"-fsmTeiUrl:[* TO *]\", fl=\",\".join(fsmfields.keys())))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 28 }, { "cell_type": "code", "collapsed": false, "input": [ "len(results_images)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 29, "text": [ "685" ] } ], "prompt_number": 29 }, { "cell_type": "code", "collapsed": false, "input": [ "df_images=DataFrame(results_images)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 30 }, { "cell_type": "code", "collapsed": false, "input": [ "df_images[df_images.fsmImageUrl.isnull()]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "
Int64Index([], dtype='int64')Empty DataFrame
\n", "

0 rows \u00d7 11 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 31, "text": [ "Empty DataFrame\n", "Columns: [fsmCreator, fsmDateCreated, fsmIdentifier, fsmImageUrl, fsmNote, fsmPhysicalLocation, fsmRelatedIdentifier, fsmRelatedTitle, fsmTitle, fsmTypeOfResource, id]\n", "Index: []\n", "\n", "[0 rows x 11 columns]" ] } ], "prompt_number": 31 }, { "cell_type": "code", "collapsed": false, "input": [ "# would be interesting to see sizes of images and whether we can get at thumbnails\n", "df_images.fsmImageUrl" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 32, "text": [ "0 [http://nma.berkeley.edu/ark:/28722/bk0005j9r9...\n", "1 [http://nma.berkeley.edu/ark:/28722/bk0005k284...\n", "2 [http://nma.berkeley.edu/ark:/28722/bk0005k2c2...\n", "3 [http://nma.berkeley.edu/ark:/28722/bk0005j9z5...\n", "4 [http://nma.berkeley.edu/ark:/28722/bk0005j9n7...\n", "5 [http://nma.berkeley.edu/ark:/28722/bk0005k1b6...\n", "6 [http://nma.berkeley.edu/ark:/28722/bk0005j9v3...\n", "7 [http://nma.berkeley.edu/ark:/28722/bk0005k0v2...\n", "8 [http://nma.berkeley.edu/ark:/28722/bk0005j9z1...\n", "9 [http://nma.berkeley.edu/ark:/28722/bk0005j9x7...\n", "10 [http://nma.berkeley.edu/ark:/28722/bk0005k232...\n", "11 [http://nma.berkeley.edu/ark:/28722/bk0005k047...\n", "12 [http://nma.berkeley.edu/ark:/28722/bk0005k1c8...\n", "13 [http://nma.berkeley.edu/ark:/28722/bk0005k110...\n", "14 [http://nma.berkeley.edu/ark:/28722/bk0005k276...\n", "...\n", "670 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "671 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "672 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "673 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "674 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "675 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "676 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "677 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "678 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "679 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "680 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "681 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "682 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "683 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "684 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "Name: fsmImageUrl, Length: 685, dtype: object" ] } ], "prompt_number": 32 }, { "cell_type": "markdown", "metadata": {}, "source": [ "http://content.cdlib.org/ark:/13030/tf1z09n5r1/thumbnail ->\n", "http://digitalassets.lib.berkeley.edu/fsm/ucb/images/brk00040569b_a.gif\n", "\n", "![Mario Savio addressing the crowd (thumbnail)](http://content.cdlib.org/ark:/13030/tf1z09n5r1/thumbnail \"Mario Savio addressing the crowd.\")\n", "\n", "http://content.cdlib.org/ark:/13030/tf1z09n5r1/hi-res.jpg ->\n", "http://digitalassets.lib.berkeley.edu/fsm/ucb/images/brk00040569b_c.jpg" ] }, { "cell_type": "code", "collapsed": false, "input": [ "urlparse.urlparse(\"http://digitalassets.lib.berkeley.edu/fsm/ucb/images/brk00040569b_c.jpg\").netloc" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 33, "text": [ "'digitalassets.lib.berkeley.edu'" ] } ], "prompt_number": 33 }, { "cell_type": "code", "collapsed": false, "input": [ "df_images.fsmImageUrl" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 34, "text": [ "0 [http://nma.berkeley.edu/ark:/28722/bk0005j9r9...\n", "1 [http://nma.berkeley.edu/ark:/28722/bk0005k284...\n", "2 [http://nma.berkeley.edu/ark:/28722/bk0005k2c2...\n", "3 [http://nma.berkeley.edu/ark:/28722/bk0005j9z5...\n", "4 [http://nma.berkeley.edu/ark:/28722/bk0005j9n7...\n", "5 [http://nma.berkeley.edu/ark:/28722/bk0005k1b6...\n", "6 [http://nma.berkeley.edu/ark:/28722/bk0005j9v3...\n", "7 [http://nma.berkeley.edu/ark:/28722/bk0005k0v2...\n", "8 [http://nma.berkeley.edu/ark:/28722/bk0005j9z1...\n", "9 [http://nma.berkeley.edu/ark:/28722/bk0005j9x7...\n", "10 [http://nma.berkeley.edu/ark:/28722/bk0005k232...\n", "11 [http://nma.berkeley.edu/ark:/28722/bk0005k047...\n", "12 [http://nma.berkeley.edu/ark:/28722/bk0005k1c8...\n", "13 [http://nma.berkeley.edu/ark:/28722/bk0005k110...\n", "14 [http://nma.berkeley.edu/ark:/28722/bk0005k276...\n", "...\n", "670 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "671 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "672 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "673 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "674 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "675 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "676 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "677 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "678 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "679 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "680 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "681 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "682 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "683 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "684 [http://digitalassets.lib.berkeley.edu/fsm/ucb...\n", "Name: fsmImageUrl, Length: 685, dtype: object" ] } ], "prompt_number": 34 }, { "cell_type": "code", "collapsed": false, "input": [ "# calculate hostnames for all image urls\n", "\n", "# might be possible to do this all with pandas\n", "netlocs = list(df_images.fsmImageUrl.dropna().apply(lambda urls: set([urlparse.urlparse(url).netloc for url in urls])))\n", "reduce(lambda x,y: x | y, netlocs, set())" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 35, "text": [ "{u'digitalassets.lib.berkeley.edu',\n", " u'nma.berkeley.edu',\n", " u'sunsite.berkeley.edu'}" ] } ], "prompt_number": 35 }, { "cell_type": "code", "collapsed": false, "input": [ "def len2(x):\n", " try:\n", " return len(x)\n", " except:\n", " return np.nan\n", " \n", "df_images.fsmImageUrl.apply(len2) == 3" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 36, "text": [ "0 False\n", "1 False\n", "2 False\n", "3 False\n", "4 False\n", "5 False\n", "6 False\n", "7 False\n", "8 False\n", "9 False\n", "10 False\n", "11 False\n", "12 False\n", "13 False\n", "14 False\n", "...\n", "670 False\n", "671 False\n", "672 False\n", "673 False\n", "674 False\n", "675 False\n", "676 False\n", "677 False\n", "678 False\n", "679 False\n", "680 False\n", "681 False\n", "682 False\n", "683 False\n", "684 False\n", "Name: fsmImageUrl, Length: 685, dtype: bool" ] } ], "prompt_number": 36 }, { "cell_type": "code", "collapsed": false, "input": [ "df_images[df_images.fsmImageUrl.apply(len2) == 3].head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fsmCreatorfsmDateCreatedfsmIdentifierfsmImageUrlfsmNotefsmPhysicalLocationfsmRelatedIdentifierfsmRelatedTitlefsmTitlefsmTypeOfResourceid
246 [Hecker, Ron] [Dec. 7, 1964] NaN [http://sunsite.berkeley.edu/FindingAids/dynaw... NaN NaN NaN [Free Speech Movement Photographs Collection, ] [Crowd in Sproul Plaza from Student Union balc... NaN UARC PIC 24B:2:22
247 [Hecker, Ron] [Dec. 7, 1964] NaN [http://sunsite.berkeley.edu/FindingAids/dynaw... NaN NaN NaN [Free Speech Movement Photographs Collection, ] [Crowd at Greek Theater] NaN UARC PIC 24B:2:17
248 NaN [Dec. 7, 1964] NaN [http://sunsite.berkeley.edu/FindingAids/dynaw... NaN NaN NaN [Free Speech Movement Photographs Collection, ] [View from inside Sproul Hall lobby looking th... NaN UARC PIC 24B:1:26
249 [Hecker, Ron] [Dec. 1964] NaN [http://sunsite.berkeley.edu/FindingAids/dynaw... NaN NaN NaN [Free Speech Movement Photographs Collection, ] [Student Strike] NaN UARC PIC 24B:2:6
250 [Hecker, Ron] [Dec. 7, 1964] NaN [http://sunsite.berkeley.edu/FindingAids/dynaw... NaN NaN NaN [Free Speech Movement Photographs Collection, ] [Crowd at Greek Theater] NaN UARC PIC 24B:2:21
\n", "

5 rows \u00d7 11 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 37, "text": [ " fsmCreator fsmDateCreated fsmIdentifier \\\n", "246 [Hecker, Ron] [Dec. 7, 1964] NaN \n", "247 [Hecker, Ron] [Dec. 7, 1964] NaN \n", "248 NaN [Dec. 7, 1964] NaN \n", "249 [Hecker, Ron] [Dec. 1964] NaN \n", "250 [Hecker, Ron] [Dec. 7, 1964] NaN \n", "\n", " fsmImageUrl fsmNote \\\n", "246 [http://sunsite.berkeley.edu/FindingAids/dynaw... NaN \n", "247 [http://sunsite.berkeley.edu/FindingAids/dynaw... NaN \n", "248 [http://sunsite.berkeley.edu/FindingAids/dynaw... NaN \n", "249 [http://sunsite.berkeley.edu/FindingAids/dynaw... NaN \n", "250 [http://sunsite.berkeley.edu/FindingAids/dynaw... NaN \n", "\n", " fsmPhysicalLocation fsmRelatedIdentifier \\\n", "246 NaN NaN \n", "247 NaN NaN \n", "248 NaN NaN \n", "249 NaN NaN \n", "250 NaN NaN \n", "\n", " fsmRelatedTitle \\\n", "246 [Free Speech Movement Photographs Collection, ] \n", "247 [Free Speech Movement Photographs Collection, ] \n", "248 [Free Speech Movement Photographs Collection, ] \n", "249 [Free Speech Movement Photographs Collection, ] \n", "250 [Free Speech Movement Photographs Collection, ] \n", "\n", " fsmTitle fsmTypeOfResource \\\n", "246 [Crowd in Sproul Plaza from Student Union balc... NaN \n", "247 [Crowd at Greek Theater] NaN \n", "248 [View from inside Sproul Hall lobby looking th... NaN \n", "249 [Student Strike] NaN \n", "250 [Crowd at Greek Theater] NaN \n", "\n", " id \n", "246 UARC PIC 24B:2:22 \n", "247 UARC PIC 24B:2:17 \n", "248 UARC PIC 24B:1:26 \n", "249 UARC PIC 24B:2:6 \n", "250 UARC PIC 24B:2:21 \n", "\n", "[5 rows x 11 columns]" ] } ], "prompt_number": 37 }, { "cell_type": "markdown", "metadata": {}, "source": [ "![a](http://sunsite.berkeley.edu/FindingAids/dynaweb/calher/fsm/figures/brk00038887a_a.gif \"a\")\n", "![b](http://sunsite.berkeley.edu/FindingAids/dynaweb/calher/fsm/figures/brk00038887a_b.jpg \"b\")\n", "![a](http://sunsite.berkeley.edu/FindingAids/dynaweb/calher/fsm/figures/brk00038887a_c.jpg \"c\")\n" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df_images[df_images.fsmImageUrl.apply(len2) == 4].ix[100].fsmImageUrl" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 38, "text": [ "[u'http://nma.berkeley.edu/ark:/28722/bk001532c4q',\n", " u'http://nma.berkeley.edu/ark:/28722/bk001532c7c',\n", " u'http://nma.berkeley.edu/ark:/28722/bk001532c58',\n", " u'http://nma.berkeley.edu/ark:/28722/bk001532c8x']" ] } ], "prompt_number": 38 }, { "cell_type": "code", "collapsed": false, "input": [ "IMAGES_TEMPLATE = \"\"\"\n", "
\n", " {% for item in items %}{% endfor %}\n", "
\n", "\"\"\"\n", " \n", "template = Template(IMAGES_TEMPLATE)\n", "HTML(template.render(items=df_images[df_images.fsmImageUrl.apply(len2) == 4].ix[100].fsmImageUrl )) " ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "\n", "
\n", " \n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 39, "text": [ "" ] } ], "prompt_number": 39 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Dates" ] }, { "cell_type": "code", "collapsed": false, "input": [ "len(df[~df.fsmDateCreated.isnull()])" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 40, "text": [ "486" ] } ], "prompt_number": 40 }, { "cell_type": "code", "collapsed": false, "input": [ "s = df[~df.fsmDateCreated.isnull()].fsmDateCreated.apply(len)==2 #.astype('datetime64[ns]')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 41 }, { "cell_type": "code", "collapsed": false, "input": [ "def first(x):\n", " try:\n", " return x[0]\n", " except:\n", " return np.nan\n", "\n", "\n", "df['calc_date'] = pd.to_datetime(df.fsmDateCreated.apply(first), coerce=True)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 42 }, { "cell_type": "code", "collapsed": false, "input": [ "df[~df.calc_date.isnull()].sort_index(by='calc_date').calc_date" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 43, "text": [ "156 1964-01-01\n", "90 1964-01-01\n", "74 1964-01-01\n", "14 1964-01-01\n", "146 1964-01-01\n", "731 1964-01-01\n", "92 1964-01-01\n", "167 1964-01-01\n", "300 1964-01-01\n", "220 1964-01-01\n", "871 1964-01-01\n", "203 1964-01-05\n", "261 1964-10-01\n", "245 1964-10-01\n", "243 1964-10-01\n", "...\n", "197 1970-05-03\n", "210 1970-05-03\n", "23 1970-05-05\n", "50 1970-05-05\n", "179 1970-05-05\n", "869 1973-01-01\n", "129 1984-10-02\n", "180 1984-10-02\n", "159 1984-10-02\n", "287 1984-10-02\n", "289 1984-10-02\n", "299 1984-10-02\n", "868 1986-01-01\n", "801 1990-01-01\n", "867 1993-06-03\n", "Name: calc_date, Length: 434, dtype: datetime64[ns]" ] } ], "prompt_number": 43 }, { "cell_type": "code", "collapsed": false, "input": [ "pd.to_datetime(df.fsmDateCreated.dropna().apply(lambda s:s[0]).astype('str'), coerce=True).dropna()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 44, "text": [ "0 1964-11-09\n", "1 1964-12-02\n", "2 1964-12-02\n", "3 1964-12-03\n", "4 1964-10-05\n", "5 1964-11-09\n", "6 1964-11-24\n", "7 1964-10-01\n", "8 1964-12-03\n", "9 1964-12-02\n", "10 1964-12-03\n", "11 1964-12-07\n", "12 1964-11-09\n", "13 1964-10-01\n", "14 1964-01-01\n", "...\n", "863 1965-07-26\n", "864 1965-10-13\n", "865 1965-03-05\n", "867 1993-06-03\n", "868 1986-01-01\n", "869 1973-01-01\n", "870 1965-01-03\n", "871 1964-01-01\n", "872 1964-11-30\n", "873 1964-12-04\n", "874 1964-12-22\n", "875 1965-01-07\n", "876 1964-12-21\n", "877 1965-01-09\n", "878 1965-01-02\n", "Name: fsmDateCreated, Length: 434, dtype: datetime64[ns]" ] } ], "prompt_number": 44 }, { "cell_type": "code", "collapsed": false, "input": [ "# http://stackoverflow.com/questions/17690738/in-pandas-how-do-i-convert-a-string-of-date-strings-to-datetime-objects-and-put\n", "date_stngs = ('2008-12-20','2008-12-21','2008-12-22','2008-12-23','Nov. 9, 1964', 'junk')\n", "pd.to_datetime(pd.Series(date_stngs),coerce=True)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 45, "text": [ "0 2008-12-20\n", "1 2008-12-21\n", "2 2008-12-22\n", "3 2008-12-23\n", "4 1964-11-09\n", "5 NaT\n", "dtype: datetime64[ns]" ] } ], "prompt_number": 45 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Types of Resources" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def f(x):\n", " try:\n", " return set(x)\n", " except:\n", " return set()\n", " \n", "reduce(lambda x,y: x | y, df.fsmTypeOfResource.apply(f), set())\n", "\n" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 46, "text": [ "{u'Box 1:1',\n", " u'Box 1:11',\n", " u'Box 1:11:4',\n", " u'Box 1:13',\n", " u'Box 1:13:1',\n", " u'Box 1:13:4',\n", " u'Box 1:14',\n", " u'Box 1:15',\n", " u'Box 1:16',\n", " u'Box 1:17',\n", " u'Box 1:2',\n", " u'Box 1:25',\n", " u'Box 1:25:1',\n", " u'Box 1:25:4',\n", " u'Box 1:28',\n", " u'Box 1:29',\n", " u'Box 1:2:3',\n", " u'Box 1:30',\n", " u'Box 1:30:2',\n", " u'Box 1:32',\n", " u'Box 1:34',\n", " u'Box 1:34:1',\n", " u'Box 1:38',\n", " u'Box 1:39',\n", " u'Box 1:4',\n", " u'Box 1:41',\n", " u'Box 1:43',\n", " u'Box 1:44',\n", " u'Box 1:45',\n", " u'Box 1:46',\n", " u'Box 1:5',\n", " u'Box 1:6',\n", " u'Box 1:7',\n", " u'Box 1:8',\n", " u'Box 2:11',\n", " u'Box 2:11:1',\n", " u'Box 2:11:2',\n", " u'Box 2:11:3',\n", " u'Box 2:11:4',\n", " u'Box 2:11:6',\n", " u'Box 2:18',\n", " u'Box 2:18:1',\n", " u'Box 2:22',\n", " u'Box 2:22:1',\n", " u'Box 2:47',\n", " u'Box 2:47:1',\n", " u'Box 2:47:3',\n", " u'Box 2:49',\n", " u'Box 2:49:1',\n", " u'Box 2:49:2',\n", " u'Box 2:55',\n", " u'Box 2:59',\n", " u'Box 2:8',\n", " u'Box 2:8:3',\n", " u'Box 2:8:4',\n", " u'Box 3:1',\n", " u'Box 3:11',\n", " u'Box 3:14',\n", " u'Box 3:14:1',\n", " u'Box 3:15',\n", " u'Box 3:17',\n", " u'Box 3:17:2',\n", " u'Box 3:2',\n", " u'Box 3:21',\n", " u'Box 3:22',\n", " u'Box 3:23',\n", " u'Box 3:26',\n", " u'Box 3:29',\n", " u'Box 3:29:1',\n", " u'Box 3:29:2',\n", " u'Box 3:3',\n", " u'Box 3:31',\n", " u'Box 3:33',\n", " u'Box 3:34',\n", " u'Box 3:34:1',\n", " u'Box 3:36',\n", " u'Box 3:38',\n", " u'Box 3:39',\n", " u'Box 3:39:4',\n", " u'Box 3:39:5',\n", " u'Box 3:39:8',\n", " u'Box 3:40',\n", " u'Box 3:41',\n", " u'Box 3:5',\n", " u'Box 4:10',\n", " u'Box 4:5',\n", " u'Box 4:5:13',\n", " u'Box 4:5:2',\n", " u'Box 4:5:6',\n", " u'Box 4:8',\n", " u'Box 4:8:5',\n", " u'Box 4:9',\n", " u'Box 4:9:3',\n", " u'Box 70:33',\n", " u'Box 70:33:2',\n", " u'Box 70:33:4',\n", " u'Box 70:34',\n", " u'Box 70:34:1',\n", " u'Box 70:34:3',\n", " u'Box 70:34:7c',\n", " u'Box 70:34:8',\n", " u'Box 72:14',\n", " u'Box 72:14:1',\n", " u'Box 72:14:11',\n", " u'Box 72:14:19',\n", " u'Box 72:23',\n", " u'Box 72:23:1',\n", " u'Carton 1:12',\n", " u'Carton 1:12:2',\n", " u'Carton 1:12:3',\n", " u'Carton 1:12:4',\n", " u'Carton 1:12:5',\n", " u'Carton 1:12:6',\n", " u'Carton 1:12:7',\n", " u'Carton 1:12:8',\n", " u'Carton 1:14',\n", " u'Carton 1:15',\n", " u'Carton 1:9',\n", " u'Carton 21:14:1',\n", " u'Carton 21:14:7',\n", " u'Carton 21:16',\n", " u'Carton 21:2:1',\n", " u'Carton 2:20',\n", " u'Carton 2:32',\n", " u'Carton 3:16',\n", " u'Carton 3:37',\n", " u'Carton 3:58',\n", " u'Carton 3:58:4',\n", " u'Carton 3:58:7',\n", " u'Carton 4:32',\n", " u'Carton 4:78',\n", " u'Carton 4:80',\n", " u'agendas',\n", " u'articles',\n", " u'briefs (legal documents)',\n", " u'detail',\n", " u'fdr',\n", " u'fliers (printed matter)',\n", " u'folder',\n", " u'form letters',\n", " u'group statements',\n", " u'item',\n", " u'leaflets',\n", " u'letters (correspondence)',\n", " u'magazines (periodicals)',\n", " u'memorandums',\n", " u'minutes',\n", " u'miscellaneous',\n", " u'news bulletins',\n", " u'newsletters',\n", " u'newspapers',\n", " u'oral histories',\n", " u'pamphlets',\n", " u'papers (document genres)',\n", " u'personal statement',\n", " u'personal statements',\n", " u'progress reports',\n", " u'reports',\n", " u'still image',\n", " u'tables of content',\n", " u'text',\n", " u'title pages',\n", " u'transcripts'}" ] } ], "prompt_number": 46 }, { "cell_type": "code", "collapsed": false, "input": [ "#related id\n", "\n", "len(df.fsmRelatedIdentifier.dropna())" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 47, "text": [ "236" ] } ], "prompt_number": 47 }, { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "TEI documents" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df.fsmTeiUrl.dropna()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 48, "text": [ "685 [http://content.cdlib.org/xml/ark:/13030/kt5m3...\n", "686 [http://content.cdlib.org/xml/ark:/13030/kt5s2...\n", "687 [http://content.cdlib.org/xml/ark:/13030/kt6k4...\n", "688 [http://content.cdlib.org/xml/ark:/13030/kt4s2...\n", "689 [http://content.cdlib.org/xml/ark:/13030/kt1h4...\n", "690 [http://content.cdlib.org/xml/ark:/13030/kt2w1...\n", "691 [http://content.cdlib.org/xml/ark:/13030/kt609...\n", "692 [http://content.cdlib.org/xml/ark:/13030/kt638...\n", "693 [http://content.cdlib.org/xml/ark:/13030/kt777...\n", "694 [http://content.cdlib.org/xml/ark:/13030/kt0k4...\n", "695 [http://content.cdlib.org/xml/ark:/13030/kt6m3...\n", "696 [http://content.cdlib.org/xml/ark:/13030/kt287...\n", "697 [http://content.cdlib.org/xml/ark:/13030/kt3p3...\n", "698 [http://content.cdlib.org/xml/ark:/13030/kt177...\n", "699 [http://content.cdlib.org/xml/ark:/13030/kt1g5...\n", "...\n", "864 [http://content.cdlib.org/xml/ark:/13030/kt3z0...\n", "865 [http://content.cdlib.org/xml/ark:/13030/kt5h4...\n", "866 [http://content.cdlib.org/xml/ark:/13030/kt1v1...\n", "867 [http://content.cdlib.org/xml/ark:/13030/kt7d5...\n", "868 [http://content.cdlib.org/xml/ark:/13030/kt7h4...\n", "869 [http://content.cdlib.org/xml/ark:/13030/kt919...\n", "870 [http://content.cdlib.org/xml/ark:/13030/kt409...\n", "871 [http://content.cdlib.org/xml/ark:/13030/kt4c6...\n", "872 [http://content.cdlib.org/xml/ark:/13030/kt387...\n", "873 [http://content.cdlib.org/xml/ark:/13030/kt3q2...\n", "874 [http://content.cdlib.org/xml/ark:/13030/kt7v1...\n", "875 [http://content.cdlib.org/xml/ark:/13030/kt038...\n", "876 [http://content.cdlib.org/xml/ark:/13030/kt7z0...\n", "877 [http://content.cdlib.org/xml/ark:/13030/kt500...\n", "878 [http://content.cdlib.org/xml/ark:/13030/kt9b6...\n", "Name: fsmTeiUrl, Length: 194, dtype: object" ] } ], "prompt_number": 48 } ], "metadata": {} } ] }