{
 "metadata": {
  "name": "",
  "signature": "sha256:83e7d6e5dcd90971f44cc4f0faca1b9da2b2c02d27295665e39f274dda001285"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# coding: utf-8\n",
      "\n",
      "import unicodedata\n",
      "import string\n",
      "import hashlib\n",
      "\n",
      "def uber_hasher(listoffunctions, listofmetadata):\n",
      "    metahashlist = []\n",
      "    for metadata in listofmetadata:\n",
      "        listofhashes = []    \n",
      "        for function in listoffunctions:\n",
      "            listofhashes.append(normalize_string(function(metadata)))\n",
      "        metahashlist.append(listofhashes)\n",
      "    return(metahashlist) # a list of lists; each internal list goes w/ a chunk of metadata\n",
      "\n",
      "def normalize_string(astring): # takes a unicode string or a list of strings\n",
      "    # docs.python.org/2/library/unicodedata.html\n",
      "    # TODO: this may not work for some unicode characters; I dealt w/\n",
      "    # special cases I know about; are there others?\n",
      "    # (when it fails to transliterate, it replaces with '')\n",
      "    astring = astring.replace(u'\u00e6',u'ae')\n",
      "    astring = astring.replace(u'\u00c6',u'Ae')\n",
      "    astring = astring.replace(u'\u00df', u'ss') # assumes good transliteration\n",
      "    bstring = unicodedata.normalize('NFKD', astring).encode('ascii','ignore')\n",
      "    bstring = bstring.lower()\n",
      "    exclude = set(string.punctuation)\n",
      "    exclude.add(' ')\n",
      "    exclude.add('\\n')\n",
      "    bstring = ''.join(ch for ch in bstring if ch not in exclude)\n",
      "    bstring = hashlib.md5(bstring).hexdigest()\n",
      "    return bstring # returns a hash of the string or list\n",
      "\n",
      "def grab_title(metadata): # takes a dictionary\n",
      "    title = metadata['title']\n",
      "    return title  # returns a unicode string\n",
      "\n",
      "def grab_description(metadata): #takes a dictionary\n",
      "    description = metadata['description']\n",
      "    return description # returns a unicode string, possibly VERY long\n",
      "\n",
      "def grab_contributors(metadata): # takes a dictionary\n",
      "    contributors = metadata['contributors'] # this is a list\n",
      "    namelist = ''\n",
      "    for contributor in contributors:\n",
      "        # strip middle names/initials - not going to work for honorifics, degrees\n",
      "        # can we please just have surname and givenname split out?\n",
      "        name = contributor['name'].split()\n",
      "        fullname = name[0] + name[len(name)-1]\n",
      "        namelist += fullname\n",
      "    return namelist # returns a list of strings\n",
      "    \n",
      "if __name__ == '__main__':\n",
      "    longtext = '''Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots\n",
      "        in a piece of classical Latin literature from 45 BC, making it over 2000 years old. \n",
      "        Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one\n",
      "        of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going \n",
      "        through the cites of the word in classical literature, discovered the undoubtable source. \n",
      "        Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of \"de Finibus Bonorum et Malorum\" \n",
      "        (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on \n",
      "        the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, \n",
      "        \"Lorem ipsum dolor sit amet..\", comes from a line in section 1.10.32.'''\n",
      "\n",
      "    somedata = [{\n",
      "        'title': u'OMG Penguins! Who\u00f6dly whoodly w\u00fb!',\n",
      "        'contributors': [{ \n",
      "            'name': u'Mr. Popper',\n",
      "            'email': u'pop@popper.pop',\n",
      "        }, {\n",
      "            'name': u'Ab\u00e6c\u016bs E F\u012fnch',\n",
      "            'email': u'',\n",
      "        }],\n",
      "        'description': u'What if a much of a which of a wind gave truth to the summer\\'s lie? Kvothe the Raven nevermore.',\n",
      "    }, \n",
      "        {\n",
      "        'title': u'All ab\u00f6\u00fct Ducks, yeah buddy; du\u00e7ks are amazing.',\n",
      "        'contributors': [{ \n",
      "            'name': u'\u00c1lbert D\u016bmbl\u00e9dore',\n",
      "            'email': u'wizard@wizardingschool.wiz',\n",
      "        }, {\n",
      "            'name': u'Albemarle K. J\u00f4\u00f1es',\n",
      "            'email': u'yeah@maps.yeah',\n",
      "        },\n",
      "           {\n",
      "            'name': u'\u00c6dg\u00e4r E. Cumm\u00edngs',\n",
      "            'email': u'poets@poetry.poem',\n",
      "        },\n",
      "        ],\n",
      "        'description': unicode(longtext),\n",
      "    }\n",
      "    ]\n",
      "\n",
      "    print uber_hasher([grab_title, grab_description, grab_contributors], somedata)\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[['98fb719df90db9c9478a47ebd5fa0570', '49a7c80bca61edaed3db66abaa418e97', '1047004c538a737cdca90e7e7c6ad459'], ['30c724da90a54b1dba598da502fa62ea', '3bda0e7a04a0623b73c2f4ea0c9557a9', 'b4ae654d1e4ddd1f31ad00e11209addb']]\n"
       ]
      }
     ],
     "prompt_number": 10
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}