{ "metadata": { "name": "NPL ChiPy Jan 2014" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": "Goals of this talk\n==================\n\n* Explore NLTK\n* Have fun thinking about Lexical Graphs\n* Explore the posiblity of comparing differences between langaugas using NLP\n\n" }, { "cell_type": "code", "collapsed": false, "input": "import nltk", "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "markdown", "metadata": {}, "source": "nltk ships with a download utility for downloading grammers and corpora." }, { "cell_type": "code", "collapsed": false, "input": "nltk.download()", "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": "showing info http://nltk.github.com/nltk_data/\n" }, { "metadata": {}, "output_type": "pyout", "prompt_number": 2, "text": "True" } ], "prompt_number": 2 }, { "cell_type": "markdown", "metadata": {}, "source": "Density\n=======" }, { "cell_type": "code", "collapsed": false, "input": "from nltk import word_tokenize as tokenize", "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": "nltk.pos_tag(tokenize(\"The quick brown fox jumps over the lazy dog.\"))", "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 4, "text": "[('The', 'DT'),\n ('quick', 'NN'),\n ('brown', 'NN'),\n ('fox', 'NN'),\n ('jumps', 'NNS'),\n ('over', 'IN'),\n ('the', 'DT'),\n ('lazy', 'NN'),\n ('dog', 'NN'),\n ('.', '.')]" } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": "nltk.pos_tag(tokenize(\"If I were you I wouldn't do that with these.\"))", "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 5, "text": "[('If', 'IN'),\n ('I', 'PRP'),\n ('were', 'VBD'),\n ('you', 'PRP'),\n ('I', 'PRP'),\n ('would', 'MD'),\n (\"n't\", 'RB'),\n ('do', 'VB'),\n ('that', 'DT'),\n ('with', 'IN'),\n ('these', 'DT'),\n ('.', '.')]" } ], "prompt_number": 5 }, { "cell_type": "markdown", "metadata": {}, "source": "Create a density checker" }, { "cell_type": "code", "collapsed": false, "input": "import re\n\nmatches = lambda x, re_parts: any([re.findall(y, x) for y in re_parts])\n\ndef density(lang_code, data):\n if lang_code == 'en':\n content_tags = (\"^NN\", \"^JJ\", \"^V\") \n elif lang_code == 'pt':\n content_tags = (\"\\+n\", \"\\+adj\", \"\\+v\")\n elif lang_code == \"es\":\n content_tags = (\"^v\", \"^a\", \"^n\")\n return len(filter(lambda x: matches(x[1], content_tags), data)) / float(len(data))\n\nassert density('en', [(1, \"NN\"), (2, \"XX\")]) == .5\nassert matches(\"H+n\", (\"n\", \"adj\"))\nassert density('pt', [(1, \"H+n\"), (2, \"H+xxx\")]) == .5\nassert density('es', [(1, \"xxxx\"), (2, \"vmip3s0\")]) == .5\n", "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "markdown", "metadata": {}, "source": "back to our original example, one phrase is more dense than the other." }, { "cell_type": "code", "collapsed": false, "input": "assert density(\"en\", nltk.pos_tag(tokenize(\"If I were you I wouldn\u2019t do that with these.\"))) \\\n < density(\"en\", nltk.pos_tag(tokenize(\"The quick brown fox jumps over the lazy dog.\")))\n", "language": "python", "metadata": {}, "outputs": [], "prompt_number": 19 }, { "cell_type": "markdown", "metadata": {}, "source": "Tag Spanish Text\n================" }, { "cell_type": "code", "collapsed": false, "input": "from nltk.corpus import cess_esp\nsents = cess_esp.tagged_sents()", "language": "python", "metadata": {}, "outputs": [], "prompt_number": 20 }, { "cell_type": "markdown", "metadata": {}, "source": "Split into training and test set" }, { "cell_type": "code", "collapsed": false, "input": "training_dx = int(len(sents)*90/100)\ntraining = sents[:training_dx]\ntest = sents[training_dx+1:]", "language": "python", "metadata": {}, "outputs": [], "prompt_number": 21 }, { "cell_type": "markdown", "metadata": {}, "source": "train tagger and check accuracy (this takes 40 seconds or so) ..." }, { "cell_type": "code", "collapsed": false, "input": "from nltk import HiddenMarkovModelTagger\nspanish_tagger = HiddenMarkovModelTagger.train(training)\n'accuracy %.1f %%' % (spanish_tagger.evaluate(test) * 100)", "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 22, "text": "'accuracy 84.9 %'" } ], "prompt_number": 22 }, { "cell_type": "code", "collapsed": false, "input": "spanish_tagger.tag(tokenize(\"A buen entendedor, pocas palabras bastan.\"))", "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 23, "text": "[('A', 'sps00'),\n ('buen', 'aq0ms0'),\n ('entendedor', 'np0000p'),\n (',', 'Fc'),\n ('pocas', 'di0fp0'),\n ('palabras', 'ncfp000'),\n ('bastan', 'aq0fp0'),\n ('.', 'Fp')]" } ], "prompt_number": 23 }, { "cell_type": "code", "collapsed": false, "input": "spanish_tagger.tag(tokenize(\"El gato blanco se sent\u00f3 en la alfombra.\"))", "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 24, "text": "[('El', 'da0ms0'),\n ('gato', 'ncms000'),\n ('blanco', 'aq0ms0'),\n ('se', 'p0300000'),\n ('sent\\xc3\\xb3', 'vmip3s0'),\n ('en', 'sps00'),\n ('la', 'da0fs0'),\n ('alfombra', 'ncfs000'),\n ('.', 'Fp')]" } ], "prompt_number": 24 }, { "cell_type": "markdown", "metadata": {}, "source": "Now Portuguese " }, { "cell_type": "code", "collapsed": false, "input": "from nltk.corpus import floresta\nsents = floresta.tagged_sents()\n\n# Split\ntraining_dx = int(len(sents)*90/100)\ntraining = sents[:training_dx]\ntest = sents[training_dx+1:]\n\n#train\nport_tagger = HiddenMarkovModelTagger.train(training)\n'accuracy %.1f %%' % (port_tagger.evaluate(test) * 100)\n", "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 25, "text": "'accuracy 85.5 %'" } ], "prompt_number": 25 }, { "cell_type": "markdown", "metadata": {}, "source": "Cross Language Testing\n======================\n\nLanguage Test data sent from Transifex" }, { "cell_type": "code", "collapsed": false, "input": "\nstrings = [dict(\n en_source=\"Without a secure random number generator an attacker\"\n \" may be able to predict password reset tokens and take\"\n \" over your account.\",\n pt=[\n # portuguese translation 1\n \"Sem nenhum gerador seguro de n\u00fameros aleat\u00f3rios, uma pessoa mal\"\n \" intencionada pode prever a sua password, reiniciar as seguran\u00e7as\"\n \" adicionais e tomar conta da sua conta.\"\n ],\n es=[\n # spanish translation 1\n \"Sin un generador de n\u00fameros aleatorios seguro, un atacante podr\u00eda\"\n \" predecir los tokens de restablecimiento de contrase\u00f1as y tomar\"\n \" el control de su cuenta.\",\n # spanish translation 2\n \"Sin un generador de n\u00fameros aleatorios seguro un atacante podr\u00eda\"\n \" predecir los tokens de reinicio de su contrase\u00f1a y tomar control\"\n \" de su cuenta.\"\n \n ])]", "language": "python", "metadata": {}, "outputs": [], "prompt_number": 26 }, { "cell_type": "code", "collapsed": false, "input": "import numpy as np\n\nfor data_dict in strings:\n en_density = density(\"en\", nltk.pos_tag(tokenize(data_dict['en_source'])))\n print \"english\", en_density\n es_densities = np.mean([density(\"es\", spanish_tagger.tag(tokenize(x))) for x in data_dict['es']])\n print \"spanish\", es_densities\n pt_densities = np.mean([density(\"pt\", port_tagger.tag(tokenize(x))) for x in data_dict['pt']])\n print \"portuguese\", pt_densities", "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": "english 0.590909090909\nspanish" }, { "output_type": "stream", "stream": "stdout", "text": " 0.431538461538\nportuguese" }, { "output_type": "stream", "stream": "stdout", "text": " 0.379310344828\n" } ], "prompt_number": 28 }, { "cell_type": "code", "collapsed": false, "input": ".59 - .43", "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 29, "text": "0.15999999999999998" } ], "prompt_number": 29 }, { "cell_type": "code", "collapsed": false, "input": ".59 - .37", "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 30, "text": "0.21999999999999997" } ], "prompt_number": 30 }, { "cell_type": "code", "collapsed": false, "input": ".43 - .37", "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 31, "text": "0.06" } ], "prompt_number": 31 }, { "cell_type": "markdown", "metadata": {}, "source": "Other fun stuff\n===============\nhttp://nltk.org/book/\n\n \tNatural Language Processing with Python\n--- Analyzing Text with the Natural Language Toolkit\n\nSteven Bird, Ewan Klein, and Edward Loper\n\n" }, { "cell_type": "code", "collapsed": false, "input": "from nltk.book import *", "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": "*** Introductory Examples for the NLTK Book ***\nLoading text1, ..., text9 and sent1, ..., sent9\nType the name of the text or sentence to view it.\nType: 'texts()' or 'sents()' to list the materials.\ntext1:" }, { "output_type": "stream", "stream": "stdout", "text": " Moby Dick by Herman Melville 1851\ntext2:" }, { "output_type": "stream", "stream": "stdout", "text": " Sense and Sensibility by Jane Austen 1811\ntext3:" }, { "output_type": "stream", "stream": "stdout", "text": " The Book of Genesis\ntext4:" }, { "output_type": "stream", "stream": "stdout", "text": " Inaugural Address Corpus\ntext5:" }, { "output_type": "stream", "stream": "stdout", "text": " Chat Corpus\ntext6: Monty Python and the Holy Grail\ntext7:" }, { "output_type": "stream", "stream": "stdout", "text": " Wall Street Journal\ntext8: Personals Corpus\ntext9:" }, { "output_type": "stream", "stream": "stdout", "text": " The Man Who Was Thursday by G . K . Chesterton 1908\n" } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": "nltk.book.text4.dispersion_plot([\"citizens\", \"democracy\", \"freedom\", \"duties\", \"America\"])", "language": "python", "metadata": {}, "outputs": [], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": "def tabulate(cfdist, words, categories):\n print '%-16s' % 'Category',\n for word in words: # column headings\n print '%6s' % word,\n print\n for category in categories:\n print '%-16s' % category, # row heading\n for word in words: # for each word\n print '%6d' % cfdist[category][word], # print table cell\n print # end the row", "language": "python", "metadata": {}, "outputs": [], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": "from nltk.corpus import brown\n\ncfd = nltk.ConditionalFreqDist(\n (genre, word)\n for genre in brown.categories()\n for word in brown.words(categories=genre))\ngenres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']\nmodals = ['can', 'could', 'may', 'might', 'must', 'will']\ntabulate(cfd, modals, genres)", "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": "Category can could may might must will\nnews 93 86 66 38 50 389\nreligion 82 59 78 12 54 71\nhobbies 268 58 131 22 83 264\nscience_fiction 16 49 4 12 8 16\nromance 74 193 11 51 45 43\nhumor 16 30 8 8 9 13\n" } ], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": "text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())\ntext.similar('woman')", "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": "Building word-context index...\nman day time year car moment world family house boy child country job\nstate girl place war way case question" }, { "output_type": "stream", "stream": "stdout", "text": "\n" } ], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": "text.similar('bought')", "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": "made done put said found had seen given left heard been brought got\nset was called felt in that told\n" } ], "prompt_number": 15 }, { "cell_type": "markdown", "metadata": {}, "source": "grammer and logic" }, { "cell_type": "code", "collapsed": false, "input": "v = \"\"\"\nbertie => b\nolive => o\ncyril => c\nboy => {b}\ngirl => {o}\ndog => {c}\nwalk => {o, c}\nsee => {(b, o), (c, b), (o, c)}\n\"\"\"\nval = nltk.parse_valuation(v)\ng = nltk.Assignment(val.domain)\nm = nltk.Model(val.domain, val)\nsent = 'Cyril sees every boy'\ngrammar_file = 'grammars/book_grammars/simple-sem.fcfg'\nresults = nltk.batch_evaluate([sent], grammar_file, m, g)[0]\nfor (syntree, semrep, value) in results:\n print semrep\n print value", "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": "all z1.(boy(z1) -> see(cyril,z1))\nTrue\n" } ], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": "sent = 'Cyril sees a boy'\nresults = nltk.batch_evaluate([sent], grammar_file, m, g)[0]\nfor (syntree, semrep, value) in results:\n print semrep\n print value", "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": "exists z2.(boy(z2) & see(cyril,z2))\nTrue\n" } ], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": "nltk.download()", "language": "python", "metadata": {}, "outputs": [], "prompt_number": "*" }, { "cell_type": "code", "collapsed": false, "input": "", "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }