{ "metadata": { "name": "", "signature": "sha256:83e7d6e5dcd90971f44cc4f0faca1b9da2b2c02d27295665e39f274dda001285" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "# coding: utf-8\n", "\n", "import unicodedata\n", "import string\n", "import hashlib\n", "\n", "def uber_hasher(listoffunctions, listofmetadata):\n", " metahashlist = []\n", " for metadata in listofmetadata:\n", " listofhashes = [] \n", " for function in listoffunctions:\n", " listofhashes.append(normalize_string(function(metadata)))\n", " metahashlist.append(listofhashes)\n", " return(metahashlist) # a list of lists; each internal list goes w/ a chunk of metadata\n", "\n", "def normalize_string(astring): # takes a unicode string or a list of strings\n", " # docs.python.org/2/library/unicodedata.html\n", " # TODO: this may not work for some unicode characters; I dealt w/\n", " # special cases I know about; are there others?\n", " # (when it fails to transliterate, it replaces with '')\n", " astring = astring.replace(u'\u00e6',u'ae')\n", " astring = astring.replace(u'\u00c6',u'Ae')\n", " astring = astring.replace(u'\u00df', u'ss') # assumes good transliteration\n", " bstring = unicodedata.normalize('NFKD', astring).encode('ascii','ignore')\n", " bstring = bstring.lower()\n", " exclude = set(string.punctuation)\n", " exclude.add(' ')\n", " exclude.add('\\n')\n", " bstring = ''.join(ch for ch in bstring if ch not in exclude)\n", " bstring = hashlib.md5(bstring).hexdigest()\n", " return bstring # returns a hash of the string or list\n", "\n", "def grab_title(metadata): # takes a dictionary\n", " title = metadata['title']\n", " return title # returns a unicode string\n", "\n", "def grab_description(metadata): #takes a dictionary\n", " description = metadata['description']\n", " return description # returns a unicode string, possibly VERY long\n", "\n", "def grab_contributors(metadata): # takes a dictionary\n", " contributors = metadata['contributors'] # this is a list\n", " namelist = ''\n", " for contributor in contributors:\n", " # strip middle names/initials - not going to work for honorifics, degrees\n", " # can we please just have surname and givenname split out?\n", " name = contributor['name'].split()\n", " fullname = name[0] + name[len(name)-1]\n", " namelist += fullname\n", " return namelist # returns a list of strings\n", " \n", "if __name__ == '__main__':\n", " longtext = '''Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots\n", " in a piece of classical Latin literature from 45 BC, making it over 2000 years old. \n", " Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one\n", " of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going \n", " through the cites of the word in classical literature, discovered the undoubtable source. \n", " Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of \"de Finibus Bonorum et Malorum\" \n", " (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on \n", " the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, \n", " \"Lorem ipsum dolor sit amet..\", comes from a line in section 1.10.32.'''\n", "\n", " somedata = [{\n", " 'title': u'OMG Penguins! Who\u00f6dly whoodly w\u00fb!',\n", " 'contributors': [{ \n", " 'name': u'Mr. Popper',\n", " 'email': u'pop@popper.pop',\n", " }, {\n", " 'name': u'Ab\u00e6c\u016bs E F\u012fnch',\n", " 'email': u'',\n", " }],\n", " 'description': u'What if a much of a which of a wind gave truth to the summer\\'s lie? Kvothe the Raven nevermore.',\n", " }, \n", " {\n", " 'title': u'All ab\u00f6\u00fct Ducks, yeah buddy; du\u00e7ks are amazing.',\n", " 'contributors': [{ \n", " 'name': u'\u00c1lbert D\u016bmbl\u00e9dore',\n", " 'email': u'wizard@wizardingschool.wiz',\n", " }, {\n", " 'name': u'Albemarle K. J\u00f4\u00f1es',\n", " 'email': u'yeah@maps.yeah',\n", " },\n", " {\n", " 'name': u'\u00c6dg\u00e4r E. Cumm\u00edngs',\n", " 'email': u'poets@poetry.poem',\n", " },\n", " ],\n", " 'description': unicode(longtext),\n", " }\n", " ]\n", "\n", " print uber_hasher([grab_title, grab_description, grab_contributors], somedata)\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[['98fb719df90db9c9478a47ebd5fa0570', '49a7c80bca61edaed3db66abaa418e97', '1047004c538a737cdca90e7e7c6ad459'], ['30c724da90a54b1dba598da502fa62ea', '3bda0e7a04a0623b73c2f4ea0c9557a9', 'b4ae654d1e4ddd1f31ad00e11209addb']]\n" ] } ], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }