{
 "metadata": {
  "name": "NPL ChiPy Jan 2014"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "Goals of this talk\n==================\n\n* Explore NLTK\n* Have fun thinking about Lexical Graphs\n* Explore the posiblity of comparing differences between langaugas using NLP\n\n"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "import nltk",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "nltk ships with a download utility for downloading grammers and corpora."
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "nltk.download()",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "showing info http://nltk.github.com/nltk_data/\n"
      },
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 2,
       "text": "True"
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "Density\n======="
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "from nltk import word_tokenize as tokenize",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "nltk.pos_tag(tokenize(\"The quick brown fox jumps over the lazy dog.\"))",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 4,
       "text": "[('The', 'DT'),\n ('quick', 'NN'),\n ('brown', 'NN'),\n ('fox', 'NN'),\n ('jumps', 'NNS'),\n ('over', 'IN'),\n ('the', 'DT'),\n ('lazy', 'NN'),\n ('dog', 'NN'),\n ('.', '.')]"
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "nltk.pos_tag(tokenize(\"If I were you I wouldn't do that with these.\"))",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 5,
       "text": "[('If', 'IN'),\n ('I', 'PRP'),\n ('were', 'VBD'),\n ('you', 'PRP'),\n ('I', 'PRP'),\n ('would', 'MD'),\n (\"n't\", 'RB'),\n ('do', 'VB'),\n ('that', 'DT'),\n ('with', 'IN'),\n ('these', 'DT'),\n ('.', '.')]"
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "Create a density checker"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "import re\n\nmatches = lambda x, re_parts: any([re.findall(y, x) for y in re_parts])\n\ndef density(lang_code, data):\n    if lang_code == 'en':\n        content_tags = (\"^NN\", \"^JJ\", \"^V\")    \n    elif lang_code == 'pt':\n        content_tags = (\"\\+n\", \"\\+adj\", \"\\+v\")\n    elif lang_code == \"es\":\n        content_tags = (\"^v\", \"^a\", \"^n\")\n    return len(filter(lambda x: matches(x[1], content_tags), data)) / float(len(data))\n\nassert density('en', [(1, \"NN\"), (2, \"XX\")]) == .5\nassert matches(\"H+n\", (\"n\", \"adj\"))\nassert density('pt', [(1, \"H+n\"), (2, \"H+xxx\")]) == .5\nassert density('es', [(1, \"xxxx\"), (2, \"vmip3s0\")]) == .5\n",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "back to our original example, one phrase is more dense than the other."
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "assert density(\"en\", nltk.pos_tag(tokenize(\"If I were you I wouldn\u2019t do that with these.\"))) \\\n       < density(\"en\", nltk.pos_tag(tokenize(\"The quick brown fox jumps over the lazy dog.\")))\n",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 19
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "Tag Spanish Text\n================"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "from nltk.corpus import cess_esp\nsents = cess_esp.tagged_sents()",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 20
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "Split into training and test set"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "training_dx = int(len(sents)*90/100)\ntraining = sents[:training_dx]\ntest = sents[training_dx+1:]",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 21
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "train tagger and check accuracy (this takes 40 seconds or so) ..."
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "from nltk import HiddenMarkovModelTagger\nspanish_tagger = HiddenMarkovModelTagger.train(training)\n'accuracy %.1f %%' % (spanish_tagger.evaluate(test) * 100)",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 22,
       "text": "'accuracy 84.9 %'"
      }
     ],
     "prompt_number": 22
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "spanish_tagger.tag(tokenize(\"A buen entendedor, pocas palabras bastan.\"))",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 23,
       "text": "[('A', 'sps00'),\n ('buen', 'aq0ms0'),\n ('entendedor', 'np0000p'),\n (',', 'Fc'),\n ('pocas', 'di0fp0'),\n ('palabras', 'ncfp000'),\n ('bastan', 'aq0fp0'),\n ('.', 'Fp')]"
      }
     ],
     "prompt_number": 23
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "spanish_tagger.tag(tokenize(\"El gato blanco se sent\u00f3 en la alfombra.\"))",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 24,
       "text": "[('El', 'da0ms0'),\n ('gato', 'ncms000'),\n ('blanco', 'aq0ms0'),\n ('se', 'p0300000'),\n ('sent\\xc3\\xb3', 'vmip3s0'),\n ('en', 'sps00'),\n ('la', 'da0fs0'),\n ('alfombra', 'ncfs000'),\n ('.', 'Fp')]"
      }
     ],
     "prompt_number": 24
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "Now Portuguese "
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "from nltk.corpus import floresta\nsents = floresta.tagged_sents()\n\n# Split\ntraining_dx = int(len(sents)*90/100)\ntraining = sents[:training_dx]\ntest = sents[training_dx+1:]\n\n#train\nport_tagger = HiddenMarkovModelTagger.train(training)\n'accuracy %.1f %%' % (port_tagger.evaluate(test) * 100)\n",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 25,
       "text": "'accuracy 85.5 %'"
      }
     ],
     "prompt_number": 25
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "Cross Language Testing\n======================\n\nLanguage Test data sent from Transifex"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "\nstrings = [dict(\n           en_source=\"Without a secure random number generator an attacker\"\n                     \" may be able to predict password reset tokens and take\"\n                     \" over your account.\",\n           pt=[\n               # portuguese translation 1\n               \"Sem nenhum gerador seguro de n\u00fameros aleat\u00f3rios, uma pessoa mal\"\n               \" intencionada pode prever a sua password, reiniciar as seguran\u00e7as\"\n               \" adicionais e tomar conta da sua conta.\"\n               ],\n           es=[\n               # spanish translation 1\n               \"Sin un generador de n\u00fameros aleatorios seguro, un atacante podr\u00eda\"\n               \" predecir los tokens de restablecimiento de contrase\u00f1as y tomar\"\n               \" el control de su cuenta.\",\n               # spanish translation 2\n               \"Sin un generador de n\u00fameros aleatorios seguro un atacante podr\u00eda\"\n               \" predecir los tokens de reinicio de su contrase\u00f1a y tomar control\"\n               \" de su cuenta.\"\n               \n               ])]",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 26
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "import numpy as np\n\nfor data_dict in strings:\n    en_density = density(\"en\", nltk.pos_tag(tokenize(data_dict['en_source'])))\n    print \"english\", en_density\n    es_densities = np.mean([density(\"es\", spanish_tagger.tag(tokenize(x))) for x in data_dict['es']])\n    print \"spanish\", es_densities\n    pt_densities = np.mean([density(\"pt\", port_tagger.tag(tokenize(x))) for x in data_dict['pt']])\n    print \"portuguese\", pt_densities",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "english 0.590909090909\nspanish"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": " 0.431538461538\nportuguese"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": " 0.379310344828\n"
      }
     ],
     "prompt_number": 28
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": ".59 - .43",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 29,
       "text": "0.15999999999999998"
      }
     ],
     "prompt_number": 29
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": ".59 - .37",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 30,
       "text": "0.21999999999999997"
      }
     ],
     "prompt_number": 30
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": ".43 - .37",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 31,
       "text": "0.06"
      }
     ],
     "prompt_number": 31
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "Other fun stuff\n===============\nhttp://nltk.org/book/\n\n \tNatural Language Processing with Python\n--- Analyzing Text with the Natural Language Toolkit\n\nSteven Bird, Ewan Klein, and Edward Loper\n\n"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "from nltk.book import *",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "*** Introductory Examples for the NLTK Book ***\nLoading text1, ..., text9 and sent1, ..., sent9\nType the name of the text or sentence to view it.\nType: 'texts()' or 'sents()' to list the materials.\ntext1:"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": " Moby Dick by Herman Melville 1851\ntext2:"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": " Sense and Sensibility by Jane Austen 1811\ntext3:"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": " The Book of Genesis\ntext4:"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": " Inaugural Address Corpus\ntext5:"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": " Chat Corpus\ntext6: Monty Python and the Holy Grail\ntext7:"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": " Wall Street Journal\ntext8: Personals Corpus\ntext9:"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": " The Man Who Was Thursday by G . K . Chesterton 1908\n"
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "nltk.book.text4.dispersion_plot([\"citizens\", \"democracy\", \"freedom\", \"duties\", \"America\"])",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 10
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "def tabulate(cfdist, words, categories):\n    print '%-16s' % 'Category',\n    for word in words:                                  # column headings\n        print '%6s' % word,\n    print\n    for category in categories:\n        print '%-16s' % category,                       # row heading\n        for word in words:                              # for each word\n            print '%6d' % cfdist[category][word],       # print table cell\n        print                                           # end the row",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "from nltk.corpus import brown\n\ncfd = nltk.ConditionalFreqDist(\n          (genre, word)\n          for genre in brown.categories()\n          for word in brown.words(categories=genre))\ngenres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']\nmodals = ['can', 'could', 'may', 'might', 'must', 'will']\ntabulate(cfd, modals, genres)",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "Category            can  could    may  might   must   will\nnews                 93     86     66     38     50    389\nreligion             82     59     78     12     54     71\nhobbies             268     58    131     22     83    264\nscience_fiction      16     49      4     12      8     16\nromance              74    193     11     51     45     43\nhumor                16     30      8      8      9     13\n"
      }
     ],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())\ntext.similar('woman')",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "Building word-context index...\nman day time year car moment world family house boy child country job\nstate girl place war way case question"
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "\n"
      }
     ],
     "prompt_number": 14
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "text.similar('bought')",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "made done put said found had seen given left heard been brought got\nset was called felt in that told\n"
      }
     ],
     "prompt_number": 15
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": "grammer and logic"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "v = \"\"\"\nbertie => b\nolive => o\ncyril => c\nboy => {b}\ngirl => {o}\ndog => {c}\nwalk => {o, c}\nsee => {(b, o), (c, b), (o, c)}\n\"\"\"\nval = nltk.parse_valuation(v)\ng = nltk.Assignment(val.domain)\nm = nltk.Model(val.domain, val)\nsent = 'Cyril sees every boy'\ngrammar_file = 'grammars/book_grammars/simple-sem.fcfg'\nresults = nltk.batch_evaluate([sent], grammar_file, m, g)[0]\nfor (syntree, semrep, value) in results:\n    print semrep\n    print value",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "all z1.(boy(z1) -> see(cyril,z1))\nTrue\n"
      }
     ],
     "prompt_number": 16
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "sent = 'Cyril sees a boy'\nresults = nltk.batch_evaluate([sent], grammar_file, m, g)[0]\nfor (syntree, semrep, value) in results:\n    print semrep\n    print value",
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": "exists z2.(boy(z2) & see(cyril,z2))\nTrue\n"
      }
     ],
     "prompt_number": 17
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "nltk.download()",
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": "*"
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": "",
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}