{ "metadata": { "celltoolbar": "Slideshow", "name": "", "signature": "sha256:3ce4ed97f4360f8808db2ac33c323e0de378e256a80971e49f5aaa70bc541749" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import os" ], "language": "python", "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "creds = {\n", " 'twitter': (os.environ['TW_API_KEY'], os.environ['TW_API_SEC'], (os.environ['TW_ACC_KEY'], os.environ['TW_ACC_SEC'])),\n", " 'facebook': os.environ['FB_API_KEY'],\n", " 'google': os.environ['GG_API_KEY']\n", "}" ], "language": "python", "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "prompt_number": 2 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Pattern" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import pattern" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "## What's in it?\n", "\n", "* pattern.web: data mining fanciness\n", "* pattern.db: \"ORM\"-ish\n", " * simpler interface, tabular data\n", " * csv handling\n", "* pattern.{en, es, de, fr, it, nl}: languages!\n", " * PoS-tagger\n", " * \"sentiment analysis\"\n", " * verb conjugation, pluralisation, etc.\n", "* pattern.search: search by syntax / semantics\n", "* pattern.vector: clustering, classification, etc.\n", "* pattern.graph: graph analysis!\n", "* pattern.metrics: grab-bag of useful tools for dealing with language" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "## Is it any good?\n", "\n", "Kinda. 
But it's not perfect.\n", "\n", "Individual modules are obviously less flexible than dedicated single-purpose libraries.\n", "\n", "It's a nice collection of useful tools if you're dealing with anything related to linguistic processing." ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# pattern.web" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.web import Twitter, Facebook, DuckDuckGo, Google, Bing, Wikipedia, Wikia, Newsfeed\n", "from pattern.web import SEARCH, NEWS, IMAGE, SPARQL, COMMENTS, LIKES" ], "language": "python", "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "tw = Twitter(license=creds['twitter'])\n", "fb = Facebook(license=creds['facebook'])\n", "ggl = Google(license=creds['google'])\n", "# None uses credentials shared across ALL pattern users\n", "ddg = DuckDuckGo(license=None)\n", "bing = Bing(license=None)\n", "wp = Wikipedia(license=None)\n", "wa = Wikia(license=None)\n", "nf = Newsfeed(license=None) # RSS / atom" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [], "prompt_number": 5 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Twitter" ] }, { "cell_type": "code", "collapsed": false, "input": [ "tw.trends()" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 6, "text": [ "[u'#HaftasonuRandevusuzOyVermekistiyorumYSK',\n", " u'#muratgogebakan',\n", " u'#ThankYouBestfriend',\n", " u'#amnesiaEP',\n", " u'#BamOfficialChartThisSunday',\n", " u'One Direction #BestFandom2014 Directioners',\n", " u'Justin Bieber #BestFandom2014 Beliebers',\n", " u'RIP Sir Bobby Robson',\n", " u'Anabel',\n", " u'Feliz Jueves']" ] } ], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ 
"twitter_result = tw.search('ananas', count=100)\n", "twitter_result[0]" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ "Result({u'profile': u'http://pbs.twimg.com/profile_images/486028208689905664/r-vzT-qS_normal.jpeg', u'language': u'ru', u'author': u'shikari_ananas', u'url': u'https://twitter.com/shikari_ananas/status/494802120152207360', u'text': u'@nnnoda \\u0432\\u0441\\u0435 \\u0444\\u0430\\u043d\\u044e\\u0447\\u043a\\u0438 \\u043d\\u043e\\u0434\\u044b \\u0442\\u0430\\u043a\\u0438\\u0435', u'date': u'Thu Jul 31 11:09:53 +0000 2014', u'id': u'494802120152207360'})" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "from collections import Counter\n", "languages = [tweet.language for tweet in twitter_result]\n", "Counter(languages)" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ "Counter({u'ru': 60, u'fr': 12, u'en': 7, u'pt': 6, u'tr': 4, u'de': 3, u'it': 3, u'nl': 1, u'vi': 1, u'da': 1, u'tl': 1, u'in': 1})" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.web.locale import geocode\n", "\n", "in_result = tw.search('#EngvInd', count=50, geo=geocode('New Delhi')[:2])\n", "en_result = tw.search('#EngvInd', count=50, geo=geocode('London')[:2])" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.en import sentiment" ], "language": "python", "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "reduce(lambda x, y: x+y, [sentiment(t.text)[0] for t in in_result]) / len(in_result)" ], "language": "python", "metadata": { 
"slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 11, "text": [ "0.14291278166278154" ] } ], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "reduce(lambda x, y: x+y, [sentiment(t.text)[0] for t in en_result]) / len(en_result)" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 12, "text": [ "0.12380667249417247" ] } ], "prompt_number": 12 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Facebook" ] }, { "cell_type": "code", "collapsed": false, "input": [ "fb_result = fb.search('dragon age inquisition', type=SEARCH, count=100)\n", "fb_result[0]" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 13, "text": [ "Result(id=u'449022588487492_753024791420602')" ] } ], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "for post in fb_result:\n", " \n", " if post.likes > 0 and post.comments > 0:\n", " print('\\n\\n'.join( [l.author[1] for l in fb.search(post.id, type=LIKES)] ))\n", " print('-' * 10)\n", " print('\\n\\n'.join( [c.text for c in fb.search(post.id, type=COMMENTS)] ))\n", " break" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\u0178\u00f8\u00fc\u010d\u0113f F\u014d\u00fc\u012bn\u00ff\n", "\n", "Sawako Heiwajima\n", "\n", "Rex Larbi Poochie\n", "\n", "Iss\u00e2m Phoneix\n", "\n", "\u0645\u0648\u0644 \u0627\u0644\u0627\u064a\u0631 \u0645\u0627\u0643\u0633\n", "----------\n", "ET GTA V sur PC ?? c'est pour quand ? 
:(" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 14 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Google, Bing, DuckDuckGo" ] }, { "cell_type": "code", "collapsed": false, "input": [ "google_result = ggl.search('dragon age inquisition', count=10)\n", "google_result[0]" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 15, "text": [ "Result({u'url': u'http://www.dragonage.com/', u'text': u'Beautiful vistas and incredible new possibilities await you in the latest game in
\\nthe epic role-playing series from BioWare \\u2013 Dragon Age: Inquisition.', u'date': u'', u'language': u'', u'title': u'Dragon Age: Inquisition'})" ] } ], "prompt_number": 15 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "# Google Translate" ] }, { "cell_type": "code", "collapsed": false, "input": [ "lang_id = [ggl.identify(res.text) for res in fb_result[:10]]\n", "lang_id" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 16, "text": [ "[(u'en', 0.9998242),\n", " (u'en', 0.120250024),\n", " (u'en', 0.6363636),\n", " (u'en', 0.033820875),\n", " (u'en', 0.2889652),\n", " (u'en', 0.6282051)]" ] } ], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [ "ggl.translate(fb_result[0].text, input=lang_id[0][0], output='de')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 17, "text": [ "u'Dragon Age: Inquisition Kampf Anh\\xe4nger burninates dem Land einen neuen Trailer zu Dragon Age: Inquisition nicht bieten neue Informationen \\xfcber das Spiel der Kampf, aber es ist ermutigend zu BioWare Fokus auf der Ebene der Strategie, die von der Echtzeit-mit-Pause zu sehen System. 
- http://www.rheena.com/reviews-games-chat-hardware-sony/212236-dragon-age-inquisition-combat-trailer-burninates-countryside.html'" ] } ], "prompt_number": 17 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Finding emails & links" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.web import find_urls, find_email\n", "\n", "s = '''\n", "Find out more at the PUGS website (http://pugs.org.sg), or email us at idontknow@whatouremail.is!\n", "This is a decoy URL http://pugs.org.sg.\n", "'''" ], "language": "python", "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "find_urls(s, unique=True), \\\n", "find_email(s, unique=True)" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 19, "text": [ "([u'http://pugs.org.sg'], [u'idontknow@whatouremail.is'])" ] } ], "prompt_number": 19 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Crowdsourced \"useful\" sorting" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.web import sort, GOOGLE\n", "\n", "terms = [\n", " 'french',\n", " 'german',\n", " 'japanese',\n", " 'chinese',\n", " 'persian',\n", " 'hun',\n", " 'american',\n", " 'russian',\n", " 'swede',\n", " 'polish',\n", " 'singaporean',\n", " 'politician',\n", "]\n", "sort_result = sort(terms=terms, context='dangerous', prefix=True, service=GOOGLE, license=creds['google'])" ], "language": "python", "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [], "prompt_number": 20 }, { "cell_type": "code", "collapsed": false, "input": [ "for weight, term in sort_result:\n", " print \"%.2f\" % (weight * 100) + '%', term" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { 
"output_type": "stream", "stream": "stdout", "text": [ "28.50% \"dangerous american\"\n", "21.18% \"dangerous russian\"\n", "11.85% \"dangerous japanese\"\n", "11.22% \"dangerous chinese\"\n", "10.10% \"dangerous politician\"\n", "8.99% \"dangerous german\"\n", "6.00% \"dangerous french\"\n", "1.12% \"dangerous polish\"\n", "0.53% \"dangerous hun\"\n", "0.33% \"dangerous persian\"\n", "0.17% \"dangerous swede\"\n", "0.01% \"dangerous singaporean\"\n" ] } ], "prompt_number": 21 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Other miscellaneous functionality\n", "\n", "* DOM parser\n", "* HTML -> plaintext\n", "* PDF -> plaintext\n", "* Crawler\n", "* IMAP (!!!)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# pattern.en" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Manipulation" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.en import article, referenced\n", "\n", "article('harbour'), \\\n", "referenced('umbrella')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 22, "text": [ "('a', 'an umbrella')" ] } ], "prompt_number": 22 }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.en import (pluralize as pluralise, singularize as singularise)" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "prompt_number": 23 }, { "cell_type": "code", "collapsed": false, "input": [ "pluralise('octopus')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 24, "text": [ "'octopodes'" ] } ], "prompt_number": 24 }, { "cell_type": "code", "collapsed": false, "input": [ "pluralise('octopus', classical=False)" ], "language": 
"python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 25, "text": [ "'octopuses'" ] } ], "prompt_number": 25 }, { "cell_type": "code", "collapsed": false, "input": [ "pluralise('I'), \\\n", "pluralise('my'), \\\n", "pluralise('her')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 26, "text": [ "('we', 'our', 'their')" ] } ], "prompt_number": 26 }, { "cell_type": "code", "collapsed": false, "input": [ "singularise('bacteria')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 27, "text": [ "'bacterium'" ] } ], "prompt_number": 27 }, { "cell_type": "code", "collapsed": false, "input": [ "pluralise('virus')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 28, "text": [ "'viruss'" ] } ], "prompt_number": 28 }, { "cell_type": "code", "collapsed": false, "input": [ "singularise('viruses'), \\\n", "singularise('virii'), \\\n", "singularise('virus')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 29, "text": [ "('viruse', 'virius', 'viru')" ] } ], "prompt_number": 29 }, { "cell_type": "code", "collapsed": false, "input": [ "singularise('viri')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 30, "text": [ "'virus'" ] } ], "prompt_number": 30 }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.en import comparative, superlative" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], 
"prompt_number": 31 }, { "cell_type": "code", "collapsed": false, "input": [ "'python is %s than ruby!' % comparative('good')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 32, "text": [ "'python is better than ruby!'" ] } ], "prompt_number": 32 }, { "cell_type": "code", "collapsed": false, "input": [ "'iPython is %s python shell' % (\n", "referenced(superlative('ideal'), article=pattern.en.DEFINITE))" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 33, "text": [ "'iPython is the most ideal python shell'" ] } ], "prompt_number": 33 }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.en import conjugate, lemma, lexeme, tenses" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "prompt_number": 34 }, { "cell_type": "code", "collapsed": false, "input": [ "lemma('are')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 35, "text": [ "u'be'" ] } ], "prompt_number": 35 }, { "cell_type": "code", "collapsed": false, "input": [ "tenses('be'), \\\n", "tenses('were')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 36, "text": [ "([('infinitive', None, None, None, None),\n", " ('present', None, 'plural', 'indicative', 'imperfective'),\n", " ('present', 1, 'plural', 'indicative', 'imperfective'),\n", " ('present', 1, 'singular', 'indicative', 'imperfective'),\n", " ('present', 2, 'plural', 'indicative', 'imperfective'),\n", " ('present', 2, 'singular', 'indicative', 'imperfective'),\n", " ('present', 3, 'plural', 'indicative', 'imperfective'),\n", " ('present', 3, 'singular', 
'indicative', 'imperfective')],\n", " [('past', None, None, 'indicative', 'imperfective'),\n", " ('past', None, 'plural', 'indicative', 'imperfective'),\n", " ('past', 1, 'plural', 'indicative', 'imperfective'),\n", " ('past', 1, 'singular', 'indicative', 'imperfective'),\n", " ('past', 2, 'plural', 'indicative', 'imperfective'),\n", " ('past', 2, 'singular', 'indicative', 'imperfective'),\n", " ('past', 3, 'plural', 'indicative', 'imperfective'),\n", " ('past', 3, 'singular', 'indicative', 'imperfective')])" ] } ], "prompt_number": 36 }, { "cell_type": "code", "collapsed": false, "input": [ "lexeme('be')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 37, "text": [ "[u'be',\n", " u'am',\n", " u'are',\n", " u'is',\n", " u'being',\n", " u'was',\n", " u'were',\n", " u'been',\n", " u'am not',\n", " u\"aren't\",\n", " u\"isn't\",\n", " u\"wasn't\",\n", " u\"weren't\"]" ] } ], "prompt_number": 37 }, { "cell_type": "code", "collapsed": false, "input": [ "conjugate('nibble', '1sgp'), \\\n", "conjugate('nibble', '3sg')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 38, "text": [ "(u'nibbled', u'nibbles')" ] } ], "prompt_number": 38 }, { "cell_type": "code", "collapsed": false, "input": [ "conjugate('google', tense=pattern.en.PARTICIPLE, parse=False), \\\n", "conjugate('google', tense=pattern.en.PARTICIPLE, parse=True)" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 39, "text": [ "(None, 'googling')" ] } ], "prompt_number": 39 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Numbers!" 
] }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.en import number\n", "\n", "number('five thousand six hundred and eighty nine')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 40, "text": [ "5689" ] } ], "prompt_number": 40 }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.en import numerals\n", "\n", "numerals('42.128', round=2)" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 41, "text": [ "'forty-two point thirteen'" ] } ], "prompt_number": 41 }, { "cell_type": "code", "collapsed": false, "input": [ "animals = ['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken']\n", "orangebirds = {'carrot': 100, 'parrot': 5, 'orange': 20}" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "prompt_number": 42 }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.en import quantify\n", "\n", "quantify(animals), \\\n", "quantify(orangebirds)" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 43, "text": [ "('several chickens, a pair of geese and a duck',\n", " 'dozens of carrots, a score of oranges and several parrots')" ] } ], "prompt_number": 43 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Autocorrect\u2026 ish" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.en import suggest\n", "\n", "suggest('psuh')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 44, "text": [ "[('push', 1.0)]" ] } ], "prompt_number": 44 }, { "cell_type": "code", "collapsed": false, 
"input": [ "suggest('carot')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 45, "text": [ "[('cart', 0.9032258064516129),\n", " ('cabot', 0.04838709677419355),\n", " ('carrot', 0.03225806451612903),\n", " ('caret', 0.016129032258064516)]" ] } ], "prompt_number": 45 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# n-grams" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.en import ngrams" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "prompt_number": 46 }, { "cell_type": "code", "collapsed": false, "input": [ "ngrams('This is a sentence', n=2)" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 47, "text": [ "[('This', 'is'), ('is', 'a'), ('a', 'sentence')]" ] } ], "prompt_number": 47 }, { "cell_type": "code", "collapsed": false, "input": [ "ngrams('This is a sentence', n=3)" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 48, "text": [ "[('This', 'is', 'a'), ('is', 'a', 'sentence')]" ] } ], "prompt_number": 48 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Parsing" ] }, { "cell_type": "code", "collapsed": false, "input": [ "short_s = 'When I saw the prices for some of those apartments, I was startled.'\n", "long_s = 'Personally, I think the only unassailable definition is the one often attributed to the great editor John W Campbell: \"Science fiction is what I say it is.\"'" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "prompt_number": 49 }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.en 
import tag\n", "\n", "for word, pos in tag(short_s):\n", " if pos[:2] == 'NN':\n", " print(word)" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "prices\n", "apartments\n" ] } ], "prompt_number": 50 }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.en import parse\n", "\n", "parsed = parse(short_s, tokenize=True, tags=True, chunks=True, relations=True, lemmata=True)\n", "parsed" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 51, "text": [ "u'When/WRB/O/O/O/when I/PRP/B-NP/O/NP-SBJ-1/i saw/VBD/B-VP/O/VP-1/see the/DT/B-NP/O/NP-OBJ-1/the prices/NNS/I-NP/O/NP-OBJ-1/price for/IN/B-PP/O/O/for some/DT/O/O/O/some of/IN/B-PP/B-PNP/O/of those/DT/B-NP/I-PNP/O/those apartments/NNS/I-NP/I-PNP/O/apartment ,/,/O/O/O/, I/PRP/B-NP/O/NP-SBJ-2/i was/VBD/B-VP/O/VP-2/be startled/VBN/I-VP/O/VP-2/startle ././O/O/O/.'" ] } ], "prompt_number": 51 }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.en import pprint\n", "\n", "pprint(parsed)" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " WORD TAG CHUNK ROLE ID PNP LEMMA \n", " \n", " When WRB - - - - when \n", " I PRP NP SBJ 1 - i \n", " saw VBD VP - 1 - see \n", " the DT NP OBJ 1 - the \n", " prices NNS NP ^ OBJ 1 - price \n", " for IN PP - - - for \n", " some DT - - - - some \n", " of IN PP - - PNP of \n", " those DT NP - - PNP those \n", " apartments NNS NP ^ - - PNP apartment \n", " , , - - - - , \n", " I PRP NP SBJ 2 - i \n", " was VBD VP - 2 - be \n", " startled VBN VP ^ - 2 - startle \n", " . . - - - - . 
\n" ] } ], "prompt_number": 52 }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.en import parsetree\n", "\n", "parsetree(short_s, tokenize=True, tags=True, chunks=True, relations=True, lemmata=True)" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 53, "text": [ "[Sentence('When/WRB/O/O/O/when I/PRP/B-NP/O/NP-SBJ-1/i saw/VBD/B-VP/O/VP-1/see the/DT/B-NP/O/NP-OBJ-1/the prices/NNS/I-NP/O/NP-OBJ-1/price for/IN/B-PP/O/O/for some/DT/O/O/O/some of/IN/B-PP/B-PNP/O/of those/DT/B-NP/I-PNP/O/those apartments/NNS/I-NP/I-PNP/O/apartment ,/,/O/O/O/, I/PRP/B-NP/O/NP-SBJ-2/i was/VBD/B-VP/O/VP-2/be startled/VBN/I-VP/O/VP-2/startle ././O/O/O/.')]" ] } ], "prompt_number": 53 }, { "cell_type": "code", "collapsed": false, "input": [ "tr = parsetree(long_s, tokenize=True, tags=True, chunks=True, relations=True, lemmata=True)\n", "\n", "print(type(tr))" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 54 }, { "cell_type": "code", "collapsed": false, "input": [ "[type(item) for item in tr]" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 55, "text": [ "[pattern.text.tree.Sentence, pattern.text.tree.Sentence]" ] } ], "prompt_number": 55 }, { "cell_type": "code", "collapsed": false, "input": [ "type(tr[0][0])" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 56, "text": [ "pattern.text.tree.Word" ] } ], "prompt_number": 56 }, { "cell_type": "code", "collapsed": false, "input": [ "for sentence in tr:\n", " for chunk in sentence.chunks:\n", " print(chunk.type, [(w.string, w.type) for w in chunk.words])" ], "language": 
"python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "(u'ADVP', [(u'Personally', u'RB')])\n", "(u'NP', [(u'I', u'PRP')])\n", "(u'VP', [(u'think', u'VBP')])\n", "(u'NP', [(u'the', u'DT'), (u'only', u'JJ'), (u'unassailable', u'JJ'), (u'definition', u'NN')])\n", "(u'VP', [(u'is', u'VBZ')])\n", "(u'VP', [(u'often', u'RB'), (u'attributed', u'VBN')])\n", "(u'NP', [(u'the', u'DT'), (u'great', u'JJ'), (u'editor', u'NN'), (u'John', u'NNP'), (u'W', u'NNP'), (u'Campbell', u'NNP')])\n", "(u'NP', [(u'Science', u'NN'), (u'fiction', u'NN')])\n", "(u'VP', [(u'is', u'VBZ')])\n", "(u'NP', [(u'I', u'PRP')])\n", "(u'VP', [(u'say', u'VBP')])\n", "(u'NP', [(u'it', u'PRP')])\n", "(u'VP', [(u'is', u'VBZ')])\n" ] } ], "prompt_number": 57 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# \"Sentiment analysis\"" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.en import sentiment" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 58 }, { "cell_type": "code", "collapsed": false, "input": [ "sg_result = tw.search('causeway', count=50, geo=geocode('Singapore')[:2])\n", "my_result = tw.search('causeway', count=50, geo=geocode('Kuala Lumpur')[:2])" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "prompt_number": 59 }, { "cell_type": "code", "collapsed": false, "input": [ "sg_sentiment = sorted(sentiment(tweet.text)[0] for tweet in sg_result)\n", "my_sentiment = sorted(sentiment(tweet.text)[0] for tweet in my_result)\n", "\n", "sg_avg = sum(sg_sentiment) / len(sg_sentiment)\n", "my_avg = sum(my_sentiment) / len(my_sentiment)\n", "\n", "sg_avg, my_avg" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 60, "text": [ "(0.079808080808080814, -0.24100000000000008)" ] 
} ], "prompt_number": 60 }, { "cell_type": "code", "collapsed": false, "input": [ "x = linspace(0, 50)\n", "plot(sg_sentiment, 'r-', label='sg')\n", "plot(x, [sg_avg for i in xrange(0, 50)], 'r--', label='sg_avg')\n", "plot(my_sentiment, 'g-', label='my')\n", "plot(x, [my_avg for i in xrange(0, 50)], 'g--', label='my_avg')\n", "grid(b=True, which='both')\n", "legend(loc='best')" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 61, "text": [ "" ] }, { "metadata": {}, "output_type": "display_data", "png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAEACAYAAABfxaZOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xl4VOX5//F3SICELYsgELaIgIhl0SKKgo2CCkVxaYuC\nVOJCqVZqv9qviL0uCL+2X4FqXYpFq0VAK4itaxUEgYGIFaVmkDUosgiRICRIEBKWnN8fZ5IzhEky\n+5yZ83ld17mYM3nmnCc3cOfJfZ7zHBARERERERERERERERERERERERERkTgwDNgKfAFM8vH11sAS\nwA1sBPKi1jMREQmrZOBLIAdojJnYz6/VJh941PO6NXAQSIlO90RExFujED8/ADPp7wROAAuBG2q1\n+QZo5XndCjPpnwzxvCIiEoRQR9wdgK+99vcAl9Rq8zywAigGWgKjQjyniIgEKdSRvuFHm0cwyz7Z\nQD/gGczkLyIiURbqSH8v0MlrvxPmaN/bZcAfPa+3AzuA84B13o2ys7ON4uLiELsjIuI424Fu/jYO\ndaS/DuiOeSG3CXAL8HatNluBoZ7XbTET/le1D1RcXIxhGNoMg6lTp8a8D3bZFAvFQrGofwPODSRp\nhzrSPwncB7yPOZPn78AWYILn688B/we8CKzH/CHzEFAa4nkT2s6dO2PdBdtQLCyKhUWxCF44pk4u\n9mzenvN6fQC4PgznERGREIVa3pEIyMvLi3UXbEOxsCgWFsUieEmx7oAXw1OfEhERPyUlJUEAudz2\nI/2srCySkpK0RWjLysqK9V9xvVwuV6y7YBuKhUWxCJ7tl0MoKytDvwFEjmeUICIOYaf/8T7LO0lJ\nSUr6EaT4isS3hCvviIhI+Cjpi62pdmtRLCyKRfCU9EVEHEQ1fYdTfEVsrqoKGtU9PldNX0QkgTz0\nuwH89c+jw3Y8Jf0QzJgxg44dO9KqVSt69uzJihUrOHbsGOPGjSMrK4tevXoxc+ZMOnXq1PDBxCfV\nbi2KhcVJsdi/fwdpnc4J2/FsP0/froqKinjmmWdYt24d7dq1Y/fu3Zw8eZJp06axe/duduzYwZEj\nRxg+fLjmwotIcA4fpqzyO7J6XhS2Q8b/SD8pKTxbgJKTk6msrGTTpk2cOHGCzp0707VrV1577TUe\neeQR0tPT6dChA/fff79q5iHIzc2NdRdsQ7GwOCYWH31EaZvmZLY6O2yHjP+kbxjh2QLUrVs3nnzy\nSfLz82nbti2jR4+muLiY4uLi08o5HTt2DOd3KyJOsno1ZelNyUoL33Ip8Z/0Y2j06NEUFBSwa9cu\nkpKSmDRpEu3bt+frr63HBnu/lsA5qXbbEMXC4phYFBRQ2uQUmamZYTukk
n6Qtm3bxooVK6isrKRp\n06akpqaSkpLCqFGjePTRRzl06BB79+5l1qxZqumLSOAqKqCwkNKq7zXSt4PKykomT55MmzZtaN++\nPQcOHODRRx9lypQpdOzYkXPOOYdrrrmGn/3sZzRp0iTW3Y1bjqnd+kGxsDgiFp98wrEf9AQgrXFa\n2A6r2TtB6t27N2vXrvX5tfnz59e8nj17tqZsikjgCgooHXQRWWnFYT2sRvphtm/fPtasWUNVVRVF\nRUX8+c9/5qabbop1t+KWY2q3flAsLI6IxerVlPX/QVhLOxCepD8M2Ap8AUyqo00uUAhsBFxhOKdt\nHT9+nF/+8pe0atWKIUOGcOONN3LvvffGulsiEk9OnoSPP6a01zlkpoXvIi6EvvZOMlAEDAX2Ap8C\no4EtXm0ygDXAtcAeoDXmw9Jr09o7MaD4itjQf/8L48bx1j//yBz3HN669a06m0Z77Z0BwJfATuAE\nsBC4oVabMcC/MBM++E74IiJSbfVqGDyY0mOlYZ2uCaEn/Q6A90T0PZ73vHUHsoCVwDrg5yGeUxzE\nEbVbPykWloSPRUEBXHEFZRVlYa/phzp7x5+6QGPgImAI0Az4D/Ax5jWA0+Tl5ZGTkwNARkYG/fr1\nC7F74q/q/0TVU+Hssm/3/kVz3+1226o/sdx3u9226k9Y9w0D1/LlMHp0zUjf++sul4u5c+cC1OTL\nQIRa078UyMe8mAswGagCZni1mQSkedoBvAAsAf5Z61iq6ceA4itiM1u2wI9/DDt2cO+793JBmwv4\n1YBf1dk82jX9dZjlmxygCXAL8HatNm8BgzAv+jYDLgE2h3heEZHEtHo1XHEFAGUVZWGfvRNq0j8J\n3Ae8j5nIX8WcuTPBs4E5nXMJ8DmwFngeJX3xU8LXbgOgWFgSOhYFBTB4MAClx0ptV9MHWOzZvD1X\na/8xzyYiIvUpKIApUwAoOxb+C7l2WglMNf0YUHxFbGTXLhgwAPbtg6Qkuj3djcW3Lab7Wd3r/Iie\nkRtF4Xxc4vTp0+nWrRutWrXiggsu4M033wTMhd0yMjLYtGlTTdtvv/2WZs2aceCAecvDzJkzyc7O\npmPHjrzwwgs0atSIr776KjLftIhEjmeqZvWDnSIxZVNJP0jej0s8fPgwS5cuJScn57THJS5btoyX\nX37Zr6WVu3Xrxocffsjhw4eZOnUqY8eOpaSkhKZNm/KTn/yEBQsW1LRdtGgRubm5tG7dmiVLlvDE\nE0+wfPlyvvjiC1wuV0It5ZzQtdsAKRaWhI2F56YsgCqjiu8qviMjNSOsp4j/pJ+f7/vxh/n5/rev\nq209wv24xJ/+9Ke0a9cOgFGjRtG9e/eaVTzHjBnDwoULa9q+8sorjBkzBjB/ANx5552cf/75pKWl\nMW3aNJVrROJV9UgfOFx5mBZNWpDcKDnGnYocw5e63reDV155xRg0aJCRmZlp3HrrrcbevXuN1NRU\nY8uWLTVtlixZYnTs2LHBY82bN8/o16+fkZGRYWRkZBgpKSnGnDlzDMMwjJMnTxrt27c31q5da+zY\nscNo3ry5ceTIEcMwDGPYsGHG7Nmza45TUVFhJCUlGdu3b/fre7BzfEUcpaTEMNLTDePkScMwDGN7\n6XYj58mcBj+GfzfJ1oj/kX4Mhetxibt27eIXv/gFzzzzDKWlpZSVlfGDH/ygZsSenJzMqFGjWLBg\nAQsWLOD666+nefPmAHo8o0ii+PBDuOwySDZH9pGYrglK+kEL5+MSv//+e5KSkmjdujVVVVW8+OKL\nbNy48bQ21SUe79IOmKWgF198ka1bt3L06FF+//vfR+T7jZWErd0GQbGwJGQsvEo7YE7XDPdia6Ck\nH7RwPi6xV69ePPjggwwcOJB27dqxceNGBg0adFqbAQMG0KJFC7755huGDx9e8/6wYcP49a9/zZVX\nXkmPHj0YOHAgAE2bNg3/Ny0ikeN1E
RciN9K30zQPw0jAefqzZ89m0aJFrFy5Mirn27JlC7179+b4\n8eM0atTwz/R4j69IXKmshOPHz3z/yBHo3h0OHgTPgO3Zdc/i3ufm2euerfeQmqcfY7F4XOIbb7xB\nZWUlZWVlTJo0iZEjR/qV8EUkik6dguxs31uPHnDttTUJH4jIWvqgpB92dT0ucffu3bRs2fKMrVWr\nVuzZs6fhA9fjb3/7G23btqVbt240btyY2bNnh+m7ib2ErN0GSbGwxGUstm2DzEwoL/e9/etfpzWP\nxBIMEJ61d8RL586d2bBhg8/3y8vLI3LOxYtrL30kIrZTWAgXXuh389JjpZzX+rywd0MjfbG16odM\niGLhLS5jEWDSj8QSDKCkLyISHZ99Bhdd5Hdz1fTFkeKydhshioUl7mJhGEGVdzTSFxGJR7t3Q2oq\ntG3r90ci8dQsUNIXm4vL2m2EKBaWuItFgKN80EhfRCR+BZj0K09WcuLUCZo3bh72rijpi63FXe02\nghQLS9zFIoiZO5lpmRF5NkY4kv4wzIeffwFMqqfdxZgPUr85DOcUEYkfgSb9CN2YBaEn/WRgFmbi\n7wWMBs6vo90MYAn2Wu8naDk5OTz22GP06dOHli1bctddd1FSUsLw4cNJT0/n6quv5tChQ4wYMYJZ\ns2ad9tk+ffrw1ltvxajn8SXuarcRpFhY4ioW335r3nF7zjl+fyRS0zUh9KQ/APgS2AmcABYCN/ho\nNxH4J/BtiOezjaSkJF5//XWWL19OUVER//73vxk+fDjTp09n//79VFVV8fTTT5OXl8fLL79c87n1\n69dTXFzMiBEjYth7EYma6lF+AKWaSN2YBaEvw9AB8H5qxx7gEh9tbgCuwizxhHVJx6Rp4fnFwZga\neLcmTpxImzZtABg8eDBt27alb9++ANx0000sX76cSZMmMWHCBLZv3865557LSy+9xK233kpKilbA\n8IfL5YqvUV0EKRaWuIpFkDN3IjFdE0JP+v5kyieBhz1tk6invJOXl0dOTg4AGRkZ9OvXr+EOBJGs\nw6Wt15zbtLS00/ZTU1M5cuQITZs2ZdSoUbz00ktMnTqVhQsX8q9aCyvZQfWFser/SHbZt3v/ornv\ndrtt1Z9Y7rvdblv1p979wkJcXbuC1w+qhj7/yZpPOFp+lGreX3e5XMydOxegJl9G06WYdfpqkznz\nYu5XwA7PVg6UACN9HKvO5z/aUU5OjrF8+fKa/bFjxxr5+fk1+88//7wxdOhQwzAM46OPPjK6detm\nLF261OjevXvU+1ofu8ZXJGH06GEYGzYE9JEpK6YYU1dO9astAVZPQh3prwO6AzlAMXAL5sVcb129\nXr8IvAO8HeJ548rAgQNJSkrit7/9LbfffnusuyMi0VJeDnv2QM+eAX2s9Fgp3c/qHpEuhXoh9yRw\nH/A+sBl4FdgCTPBsjuI9pzYpKem0/dtvv50NGzYwduzYWHQtbsXdfOwIUiwscROL9evhggsgwGt4\ndr6QC7DYs3l7ro62d4ThfLawY8eO0/Zfeuml0/bvuusu7rrrrpr9Ll26MGjQoJjU4EQkRgoLA1pZ\ns5qdp2yKH44ePcozzzzDL37xi1h3Je7EzQyNKFAsLHETiyBm7kBkR/pK+hH2/vvvc/bZZ9O+fXvG\njBkT6+6ISDQFmfQjOWVTST/Crr32Wo4cOcIbb7yhh5UHIW5qt1GgWFjiIhbHj0NREfTuHfBHI7XC\nJijpi4hExqZN0LUrpKUF9DHDMDhUcUg1fXGmuKndRoFiYYmLWARZ2ik/Xk5qSiqNkxtHoFNK+iIi\nkfHZZ8FdxD1WFrFRPijpi83FRe02ShQLS1zEIoTpmpGq54OSvohI+J06BZ9/Dn6sH1ZbJKdrgpK+\n2Fxc1G6jRLGw2D4WX34JbdpARkbAH43kdE1Q0hcRCb8gL+KC56lZqRrpi0PFRe02ShQLi+1jEULS\n1
0jfpiL9uMT777+fzp07k56eTv/+/fnwww8BKC4uplmzZpSVldW0LSwspE2bNpw6dYpTp07x4IMP\n0qZNG7p27cqsWbNo1KgRVVVV4Q+CiPgWYtKPZE3fTupcK9qOcnJyjIEDBxr79+839u7da5x99tnG\nhRdeaLjdbqOiosK46qqrjGnTphmLFi0yLrnkkprPud1u46yzzjJOnDhR7/Fffvllo7S01Dh16pTx\n+OOPG+3atTMqKysNwzCMq666ynj++edr2v72t7817rnnHsMwDGP27NlGr169jL179xplZWXGkCFD\njEaNGhmnTp3yeR67xlckblVVGcZZZxlGcXFQHx//9njj2U+f9bs9Aa6nH/cj/XxXPknTks7Y8l35\nfrevq21Dqh+XmJ2dzeDBgxk4cCB9+/aladOm3HTTTRQWFjJy5Ei2bdvG9u3bAfx+XOJtt91GZmYm\njRo14oEHHqCyspKioiIAxowZw4IFCwDz7r1XX321Zl2fRYsW8Zvf/Ibs7GwyMjKYPHky5r8LEYmK\nPXvMpZTbtw/q45Ee6cf9g1rzc/PJz82PWPv6RPJxiY899hhz5syhuLiYpKQkDh8+zIEDBwC4+eab\nmThxIvv27aOoqIhGjRoxaNAgAL755hs6depUc5yOHTuG5XuNFVc8PQs1whQLi61jEUJpB8wpm5Gs\n6cd90reTukbU48aN4/bbb+fyyy+nWbNmXHJJ7WfHn66goIA//elPrFixggsuuACArKysmuNnZmZy\nzTXX8Oqrr7J582ZGj7YeVta+fXu+/tp6Vr33axEJ0FdfwfffB/aZDz4IKelrpJ8AAn1cYnl5OSkp\nKbRu3Zrjx48zffp0Dh8+fFqbMWPGMH36dHbv3s3KlStr3h81ahRPPfUUI0aMoFmzZsyYMeO0J3jF\nG9uO5mJAsbBEJRbPPQePPALZ2YF9LikJ/vKXoE9bdiyyN2cp6YdRQ49LnDJlSoOzdgCGDRvGsGHD\n6NGjB82bN+d//ud/6Ny582ltRo4cyd13302XLl3o7bV06/jx49m2bRt9+vQhPT2diRMnsmrVKi3r\nLBKI+fPhD3+ATz81V8qMokg+Nctu6rwynQjmz59vDB48OOrnfe+994wuXbrU+XW7x3flypWx7oJt\nKBaWiMZi0SLDaN/eMLZsidw56nD85HEj5f+lGFVVVX5/hhjM3hkGbAW+ACb5+PptwHrgc2AN0CcM\n54wr0XxcYkVFBe+99x4nT55k7969TJs2jZtvvjni5xVJCO+8AxMnwpIl0LNn1E9/qOIQ6U3TbV2S\nTQa+BHKAxoAbOL9Wm4FAuuf1MODjOo5V50+xeLZkyRKjefPmxo033njaXPnVq1cbLVq0OGNr2bJl\nSOc7evSocfHFFxstW7Y0zj77bOPOO+80ysvL62wf7/EVCZulSw2jTRvD+PTTmHVh67dbje5Pdw/o\nMwQ40g+1pj8AM+nv9OwvBG4Atni1+Y/X67VAfM8hDFD14xJrGzx4MOXl5WE/X1paGp988knYjyuS\n0FavhjFj4M03oX//mHUj0tM1IfSk3wHwnhO4B6hvPuJdwHshnlMcxNbzsaNMsbD4jIVhwMaN5rNp\nA7F3L9x9NyxcCJdfHrY+BiMaSzCEmvQD+bXiSuBOoM6o5uXlkZOTA0BGRgb9gliLWoJTvYBV9X8k\nu+zbvX/R3He73bbqTyz33W736V9//XX405/ILS6G1q1xeX6Lzm3Z0vx6ffuNGuF64AFITsY8Wuy+\nv7Is86lZ9bV3uVzMnTsXoCZfBiLUqwWXAvmYtXqAyUAVMKNWuz7A6552X9ZxLE95qlYHk5K0jEAE\nKb4S9954A+65xxytT5kCTZrEukdB+8vav1B0sIhZP57VcGMPz0Vfv3N5qCP9dUB3zAu5xcAtwOha\nbTpjJvyx1J3w65SZmWnrK9nxLjPTGfOBJQEdPgz33w8FBfD663D
ZZbHuUcgi/dQsCH3K5kngPuB9\nYDPwKuZF3AmeDWAKkAnMBgqBgK4ylpaWYhiGo7aVK1dG7VylpaUh/hOILNuvmx5FioXF9dRT0Lev\nOap3uxMi4UN0bswKxx25iz2bt+e8Xt/t2URELJs2wbx55vNkA/Htt/Duu+Znr7suMn2LkbKKMi5s\nF/y6Pf6wU93EZ01fRBJMRYW5xMFzz5m1+PT0hj/jLSXFnF7Zpk1k+hdD1y+4nvEXjWfkeSP9/ky0\na/oiIv5bsQImTIB+/WD9+sAXM0tw0ZiyqVW4bEi1W4tiYYnrWBw8CHfcAXl58Pjj8NprISX8uI5F\nPcqOlcVFTV9E7Ki4GP7zn4bbRdo335jlnFtvNev4nrnycqZojPRV0xdJNAcPwvTpMGcODBpk1sBj\nqUkTeOABuPji2PbD5gzDIPWPqXz38HekpqT6/TnV9EWcqrwcnngCnn4aRo2CDRtUM48jR08cJTkp\nOaCEHwzV9G0oUeuVwVAsLHXGoqLCTPbdusG2bbB2Lfz1rwmd8BPx30U0bswCjfRF7OvAAfjoI1iz\nBj7+2KyN+6qHFxfDgAHms1m9nqIm8aX0WGnEV9gE1fRF7OO778zlBNasMbfiYrjkEnPlx8sug6w6\nRoGtWkH37tHtq4Tdqp2rmOKawqq8VQF9TjV9kXj1y1+ao/sbboBf/coctcf6IqxETbSejauavg0l\nYr0yWI6JxRdfmOWZ11+H++6DCy88I+E7JhZ+SMRYRGO6Jijpi9jD9Onm6F5z2B2rrCLyN2bZjWH4\n2qZO9f1gyKlT1V7tE6P9rl2GkZpqn/6ofUzaT/5gsvGHVX8I+PgQ2DNydSFXJNYmToS0NJg5M9Y9\nkRi659/30Lttb+69+N6APhfohVyVd2woEeuVwUr4WJSUwD/+Yd6x2oCEj0UAEjEWpRWq6Yskviee\nMJcJbtcu1j2RGCs7Fp2bs1TeEYmVsjLzLtrPPoMuXWLdG4mx/n/rz+wRs7m4Q2BrFKm8IxIv/vIX\nGDlSCV+A6C3DoKRvQ4lYrwxWwsbiyBGYNQseftjvjyRsLIKQiLGI1jIM4Uj6w4CtwBfApDraPO35\n+nogsg+AFIkHzz4LV14J550X656IDZyqOkV5ZTnpTQN8dGQQQq3pJwNFwFBgL/ApMBrY4tXmx8B9\nnj8vAZ4CLvVxLNX0xRkqKqBrV1i8GPr2jXVvxAZKj5Vy7tPnUjapLODPRrumPwD4EtgJnAAWAjfU\najMSmOd5vRbIANqGeF6R+DVnDlx0kRK+1IjWEgwQ+oJrHYCvvfb3YI7mG2rTESjx6wwffWQ+CchB\nXBs2kKslcoEEjcXMmbBgQcAfc7lc5Obmhr8/cShasXDtdFFeWR7x82wv2x61JRhCTfr+1mNq/+rh\n83N5eXnk5OQAkJGRQb++fcm99lq45hpcnsSf27o1AK4DBxJ3/8ABXO+8Y5/+xHAfgP/8xzb9Ccv+\n6NG4KivBK3FVX5isb9/tdgfUPpH33W53xM+378g+xm8YT25OLgc2m39/rXuZf5+R2O9/dn+q1dc/\nl8vF3LlzAWryZSBCrelfCuRjXswFmAxUATO82jwLuDBLP2Be9P0RZ470z6zpHz4MHTqYj4ETEYmi\n+evn8862d3jtZ6/Fuiv1inZNfx3QHcgBmgC3AG/XavM2cLvn9aXAIfwt7ZSW1v3gCBGRCCrYVcAV\nna+IdTfCLtSkfxJzZs77wGbgVcyZOxM8G8B7wFeYF3yfA/xfTaisDDKdtdQoJOYc5GApFhbFwhKN\nWKzevZrBXQZH/DzRFo7H8iz2bN6eq7V/X1BH1khfRGKg5EgJJUdK6H12gk0iwO535JaVOTLpa4aG\nRbGwKBaWSMeiYHcBl3e+nORGyRE9TyzYO+mXljqyvCMisZWo9XyIh6TvwJG+arcWxcKiWFgiHYuC\n3QUJWc8Huyd9h17IFZHY+a7
iO7Yd3Eb/7P4NN45D9k76Dh3pq3ZrUSwsioUlkrH46OuPuLjDxTRJ\nbhKxc8SSvZO+RvoiEmWrd61O2Ho+2D3pO3Skr9qtRbGwKBaWSMYikev5YPekr5G+iETRsRPHKNxX\nyKUdfa3+nhjs/YzcLl1g1SoIYlEhEZFArdq5ioc+eIi1d6+NdVf8lljPyHXozVkiEhsFuwsY3Dlx\nSztg56R/4gQcPQotW8a6J1Gn2q1FsbAoFpZIxWL1rtVc0SVxL+KCnZN+dT0/yU4VKBFJVCerTvLx\nno+5vNPlse5KRNkpo55e0y8qguuvh23bYtcjEXGMdcXruOOtO9hwz4ZYdyUgiVPTd+h0TRGJjdW7\nVid8PR/snPQdPF1TtVuLYmFRLCyRiEXB7oKEr+eDnZO+RvoiEiVVRhUFuxJ/5g7YOek7eKSvNVYs\nioVFsbCEOxZbD2wlPTWdDq06hPW4dmTfpK+RvohEiVPq+WDnpO/gG7NUu7UoFhbFwhLuWDjhpqxq\noSb9LGAZsA1YCmT4aNMJWAlsAjYCv/bryHpqlohEgWEYjrgpq1qo8/RnAgc8f04CMoGHa7Vp59nc\nQAvgv8CNwJZa7U6fp3/ddTBhgjlXX0QkQnYe2smlL1zKNw9+Uz3nPa5Ee57+SGCe5/U8zGRe2z7M\nhA9wBDPZZzd4ZAdfyBWR6CnYZU7VjMeEH4yUED/fFijxvC7x7NcnB7gQaHgJOwdfyHW5XJqp4aFY\nWBQL09LtS3n/g/c5r/95YTneok2LuOG8G8JyrHjgT9Jfhlmeqe13tfYNz1aXFsA/gfsxR/xnyMvL\nI8ezjHLGnj3027KF3F69AOvCTfU/eu07Y7+aXfoTy323222r/sRif2uLrTz64aNkb8ym6GAR2b3N\nokHxhmKAoPa7ZXWj3YF2uLx+qNrl+/W173K5mDt3LkBNvgxEqL/PbAVyMUs47TEv2Pb00a4x8G9g\nMfBkHceyavqGAU2bwuHDkJoaYhdFJBHMXz+fR5Y/wqq8VZybdW6su2Mb0a7pvw2M87weB7zpq0/A\n34HN1J3wT3f0KKSkKOGLCACvbXqNhz94mGU/X6aEH6JQk/504GrMKZtXefbBvFD7ruf15cBY4Eqg\n0LMNq/eoDq7ng+Zje1MsLE6NxTtF73Df4vtYfNtizm9zPuDcWIRDqBdyS4GhPt4vBkZ4Xn9IoD9c\nHHxjlohYlm1fxl1v38W7Y96lb7u+se5OQrDTHCWrpu9ywdSp5vNxRcSRVu9azU8X/ZTXb3mdQZ0H\nxbo7thVoTd9WSZ/8M9+c+qOp5Oee+YV8Vz7TVk1Te7VX+wRs/8neT7juletY8JMFDOk6JOb9sXP7\nuE76NSP9v/8d1qyBOXNi26MY8Z465nSKhcUpsXDvc3Pty9cyZ+QcRvQY4bONU2Lhj8R4cpbDL+SK\nONXmbzcz/B/D+euP/1pnwpfQ2HOk/8gj0Lw5/K72/V8ikqi+LP2S3Lm5TB86nbF9xsa6O3FDI30R\niTu7Du1i6PyhTP3RVCX8CLNn0nf4Ymuag2xRLCyJGovi8mKGzB/CAwMfYPwPx/v1mUSNRTTYM+lr\npC/iCPu/38+Q+UMYf9F4fn2Jf4/akNDYqqb/2JrHzFdPPgk//Ql07BTbHolIRM3/fD43nncj0648\nc3qi+CfQmn6od+SGVXF5sefVYaAcavZFJBHd0/8eJvxwQqy74Si2GunXzN7JyIAdOxxb19ccZIti\nYVEsLIqFJf5n75w6BUeOQHp6rHsiIpJw7DfSP3gQunc3L+aKiEi94n+k7/DpmiIikWS/pK/pmpqD\n7EWxsCgWFsUiePZL+lpLX0QkYuxX01+wAN56CxYujHV/RERsLzFq+hrpi4hEhP2Sfmmp4y/kql5p\nUSwsioU77IS5AAAII0lEQVRFsQheKEk/C1iG+VD0pUBGPW2TMR+I/k6DR9WFXBGRiAmlpj8TO
OD5\ncxKQCTxcR9sHgB8CLYGRdbQxa/p33AGDB8Odd4bQNRERZ4hmTX8kMM/zeh5wYx3tOgI/Bl7wq2Ma\n6YuIREwoSb8tUOJ5XeLZ9+UJ4H+BKr+OqpuzVK/0olhYFAuLYhG8hlbZXAa08/F+7ecYGp6ttuuA\n/Zj1/NyGOpOXl0fO1q2wcCEZhYX069evZlGl6r9k7Ttrv5pd+hPLfbfbbav+xHLf7Xbbqj/R3He5\nXMydOxeAnJwcAhVKTX8rZiLfB7QHVgI9a7X5P+DnwEkgFWgF/Au43cfxzJp+hw7wySfmnyIiUq9o\n1vTfBsZ5Xo8D3vTR5hGgE3AOcCuwAt8J36IpmyIiERNK0p8OXI05ZfMqzz5ANvBuHZ/xVQKyHDsG\nhgFpaSF0K/7VLm04mWJhUSwsikXwQnlyVikw1Mf7xcAIH++v8mx1q76Im2Sn1SFERBKHnbKrYWzY\nALfcAps2xbovIiJxIb7X3tF0TRGRiLJX0teNWYDqld4UC4tiYVEsgmevpK+RvohIRNmrpv/44/D1\n1/DEE7Hui4hIXIj/mr7KOyIiEWOvpK8bswDVK70pFhbFwqJYBM9+SV8jfRGRiLFXTf/aa+H++2H4\n8Fj3RUQkLsR3TV8jfRGRiLJX0teUTUD1Sm+KhUWxsCgWwbNX0tdIX0QkouxV009OhooKSAllHTgR\nEeeI75p+8+ZK+CIiEWSvpK/SDqB6pTfFwqJYWBSL4Nkr6esirohIRNmrpj9kCHzwQaz7ISISN+K7\npq+RvohIRIWS9LOAZZjPyF0KZNTRLgP4J7AF2AxcWvcRVdMH1Su9KRYWxcKiWAQvlKT/MGbS7wEs\n9+z78hTwHnA+0Acz+fumkb6ISESFUtPfCvwIKAHaAS6gZ6026UAh0NWP4xnGjBnw0EMhdElExFmi\nWdNvi5nw8fzZ1kebc4BvgReBz4DngWZ1HlEjfRGRiGoo6S8DNvjYRtZqZ3i22lKAi4C/ev78nrrL\nQKrpe6heaVEsLIqFRbEIXkO3v15dz9eqyzr7gPbAfh9t9ni2Tz37/6SepJ/3t7+Rs2EDABkZGfTr\n14/c3FzA+kvWvrP2q9mlP7Hcd7vdtupPLPfdbret+hPNfZfLxdy5cwHIyckhUKHU9GcCB4EZmIk8\nA98JfTVwN+Ysn3wgDZjko51hFBZCv34hdElExFkCremHkvSzgEVAZ2AnMAo4BGRj1u5HeNr1BV4A\nmgDbgTuA73wczzB27YLOnUPokoiIs0TzQm4pMBRzyuY1mAkfoBgr4QOsBy7GTP434zvhm3QhF1C9\n0ptiYVEsLIpF8Ox1R26LFrHugYhIQrPX2juGrwlAIiJSl/hee0dERCJKSd+GVK+0KBYWxcKiWARP\nSV9ExEFU0xcRiWOq6YuISJ2U9G1I9UqLYmFRLCyKRfCU9EVEHEQ1fRGROKaavoiI1ElJ34ZUr7Qo\nFhbFwqJYBE9JX0TEQVTTFxGJY6rpi4hInZT0bUj1SotiYVEsLIpF8JT0RUQcRDV9EZE4ppq+iIjU\nKZSknwUsA7YBS4GMOtpNBjYBG4BXgKYhnNMRVK+0KBYWxcKiWAQvlKT/MGbS7wEs9+zXlgOMBy4C\negPJwK0hnNMR3G53rLtgG4qFRbGwKBbBCyXpjwTmeV7PA2700eYwcAJoBqR4/twbwjkd4dChQ7Hu\ngm0oFhbFwqJYBC+UpN8WKPG8LvHs11YKPA7sBoqBQ8AHIZxTRERCkNLA15cB7Xy8/7ta+4Znq+1c\n4DeYZZ7vgNeA24B/BNRLh9m5c2esu2AbioVFsbAoFsELZcrmViAX2Ae0B1YCPWu1uQW4Grjbs/9z\n4FLgVz6O9yXmDwkREfHfdqBbNE40E5jkef0wMN1Hm77ARiAN8wfMPHwnfBERsbkszPp87Smb2cC7\nXu0ewpqyOQ9oHMU+ioiIiIhIrAzDvD7wBVa5yCnmYM582
uD1nr83vSWaTpjXhTZhlgR/7XnfifFI\nBdYCbmAz8KjnfSfGoloyUAi849l3aix2Ap9jxuITz3txFYtkzAu4OZhlHzdwfiw7FGWDgQs5PenP\nxCyJgflD0Ne1kkTUDujned0CKML8t+DUeDTz/JkCfAwMwrmxAHgAc9bf2559p8ZiB2aS9xZXsRgI\nLPHafxjfd/YmshxOT/pbse55aOfZd6I3gaEoHs2AT4ELcG4sOmJeP7wSa6Tv1FjsAM6q9V5AsYj1\ngmsdgK+99vd43nMyf256S3Q5mL8BrcW58WiE+ZtvCVbZy6mxeAL4X6DK6z2nxsLA/AG4DnOJGwgw\nFg3dnBVpWku5fnXd9JbIWgD/Au4Hymt9zUnxqMIsd6UD72OOcr05JRbXAfsxa9i5dbRxSiwALge+\nAdpg1vFrj+objEWsR/p7MS/gVeuEOdp3shKsu6DbY/6Dd4rGmAn/JczyDjg7HmDeyf4u8EOcGYvL\nMNf52gEsAK7C/PfhxFiAmfABvgXeAAYQYCxinfTXAd0xf51vgnkH79v1fcAB3gbGeV6Pw0p+iS4J\n+DvmbJUnvd53YjxaY83ASMO8q70QZ8biEczB4DmYK/SuwLyz34mxaAa09LxuDlyDeT0w7mIxHHOm\nxpeYa+87yQLMheiOY17buIO6b3pLdIMwSxpuzARXiDmd14nx6A18hhmLzzHr2eDMWHj7Edag0Imx\nOAfz34Qbc1pzdb50YixERERERERERERERERERERERERERERERERERKLr/wOo9hYrAxUTCgAAAABJ\nRU5ErkJggg==\n", "text": [ "" ] } ], "prompt_number": 61 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# WordNet" ] }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.en import wordnet" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "prompt_number": 62 }, { "cell_type": "code", "collapsed": false, "input": [ "birds = wordnet.synsets('bird')\n", "birds" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 63, "text": [ "[Synset(u'bird'),\n", " Synset(u'bird'),\n", " Synset(u'dame'),\n", " Synset(u'boo'),\n", " Synset(u'shuttlecock')]" ] } ], "prompt_number": 63 }, { "cell_type": "code", "collapsed": false, "input": [ "bird = birds[0]\n", "\n", "'Definition', bird.gloss" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 64, "text": [ "('Definition',\n", " u'warm-blooded egg-laying vertebrates characterized by feathers and forelimbs modified 
as wings')" ] } ], "prompt_number": 64 }, { "cell_type": "code", "collapsed": false, "input": [ "('- Synonyms', bird.synonyms)" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 65, "text": [ "('- Synonyms', [u'bird'])" ] } ], "prompt_number": 65 }, { "cell_type": "code", "collapsed": false, "input": [ "('^ Hypernyms', bird.hypernyms())" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 66, "text": [ "('^ Hypernyms', [Synset(u'vertebrate')])" ] } ], "prompt_number": 66 }, { "cell_type": "code", "collapsed": false, "input": [ "('v Hyponyms', bird.hyponyms())" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 67, "text": [ "('v Hyponyms',\n", " [Synset(u'dickeybird'),\n", " Synset(u'cock'),\n", " Synset(u'hen'),\n", " Synset(u'nester'),\n", " Synset(u'night bird'),\n", " Synset(u'bird of passage'),\n", " Synset(u'protoavis'),\n", " Synset(u'archaeopteryx'),\n", " Synset(u'Sinornis'),\n", " Synset(u'Ibero-mesornis'),\n", " Synset(u'archaeornis'),\n", " Synset(u'ratite'),\n", " Synset(u'carinate'),\n", " Synset(u'passerine'),\n", " Synset(u'nonpasserine bird'),\n", " Synset(u'bird of prey'),\n", " Synset(u'gallinaceous bird'),\n", " Synset(u'parrot'),\n", " Synset(u'cuculiform bird'),\n", " Synset(u'coraciiform bird'),\n", " Synset(u'apodiform bird'),\n", " Synset(u'caprimulgiform bird'),\n", " Synset(u'piciform bird'),\n", " Synset(u'trogon'),\n", " Synset(u'aquatic bird'),\n", " Synset(u'twitterer')])" ] } ], "prompt_number": 67 }, { "cell_type": "code", "collapsed": false, "input": [ "('^ Holonyms', bird.holonyms())" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", 
"prompt_number": 68, "text": [ "('^ Holonyms', [Synset(u'Aves'), Synset(u'flock')])" ] } ], "prompt_number": 68 }, { "cell_type": "code", "collapsed": false, "input": [ "('v Meronyms', bird.meronyms())" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 69, "text": [ "('v Meronyms',\n", " [Synset(u'beak'),\n", " Synset(u'furcula'),\n", " Synset(u'feather'),\n", " Synset(u'wing'),\n", " Synset(u'pennon'),\n", " Synset(u\"bird's foot\"),\n", " Synset(u'uropygium'),\n", " Synset(u'hindquarters'),\n", " Synset(u'air sac'),\n", " Synset(u'uropygial gland'),\n", " Synset(u'syrinx'),\n", " Synset(u'bird')])" ] } ], "prompt_number": 69 }, { "cell_type": "code", "collapsed": false, "input": [ "(wordnet.synsets('owl')[0].holonyms(),\n", " wordnet.synsets('amoeba')[0].holonyms())" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 70, "text": [ "([Synset(u'Strigiformes')], [Synset(u'Amoebida')])" ] } ], "prompt_number": 70 }, { "cell_type": "code", "collapsed": false, "input": [ "kitty = wordnet.synsets('kitten')[0]\n", "puppy = wordnet.synsets('puppy')[0]\n", "\n", "wordnet.ancestor(kitty, puppy)" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 71, "text": [ "Synset(u'young mammal')" ] } ], "prompt_number": 71 }, { "cell_type": "code", "collapsed": false, "input": [ "human = wordnet.synsets('human')[0]\n", "guinea_pig = wordnet.synsets('guinea pig')[0]\n", "\n", "(wordnet.similarity(human, guinea_pig),\n", " wordnet.similarity(human, kitty))" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 72, "text": [ "(0.26171115255795596, 0.5710216026393958)" ] } ], "prompt_number": 72 }, { 
"cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# pattern.search" ] }, { "cell_type": "code", "collapsed": false, "input": [ "sentence = 'the fluffy brown bunnies hopped across the wet grass with much gusto.'" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "prompt_number": 73 }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.search import search\n", "\n", "search('NP', parsetree(sentence))" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 74, "text": [ "[Match(words=[Word(u'the/DT'), Word(u'fluffy/JJ'), Word(u'brown/JJ'), Word(u'bunnies/NNS')]),\n", " Match(words=[Word(u'the/DT'), Word(u'wet/JJ'), Word(u'grass/NN')]),\n", " Match(words=[Word(u'much/JJ'), Word(u'gusto/NN')])]" ] } ], "prompt_number": 74 }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.search import taxonomy\n", "\n", "# classify these terms as 'animal' so that ANIMAL can be used as a search constraint\n", "for term in ('bunny', 'dog', 'cat', 'banana'):\n", "    taxonomy.append(term, type='animal')\n", "\n", "search('ANIMAL', parsetree(sentence, lemmata=True))" ], "language": "python", "metadata": { "slideshow": { "slide_type": "fragment" } }, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 75, "text": [ "[Match(words=[Word(u'the/DT'), Word(u'fluffy/JJ'), Word(u'brown/JJ'), Word(u'bunnies/NNS')])]" ] } ], "prompt_number": 75 }, { "cell_type": "code", "collapsed": false, "input": [ "from pattern.search import Pattern\n", "\n", "pat = Pattern.fromstring('{JJ} {ANIMAL} {VP}')\n", "match = pat.match(parsetree(sentence, lemmata=True))\n", "# group(0) is the whole match; groups 1-3 are the {JJ}, {ANIMAL} and {VP} captures\n", "for group_index in range(4):\n", "    print(match.group(group_index))" ], "language": "python", "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[Word(u'fluffy/JJ'), Word(u'brown/JJ'), Word(u'bunnies/NNS'), Word(u'hopped/VBD')]\n", "[Word(u'fluffy/JJ')]\n", 
"[Word(u'brown/JJ'), Word(u'bunnies/NNS')]\n", "[Word(u'hopped/VBD')]\n" ] } ], "prompt_number": 76 }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# fin\n", "\n", "[more?](https://rahul.ag/pugs-tweet.slides.html)" ] } ], "metadata": {} } ] }