{ "metadata": { "name": "", "signature": "sha256:aba4639823d41a59be27fb05e556172d6bf24db95c4ab2299c7798b70a0f11cb" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Visualizing a CKY parser with IPythonBlocks" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import nltk\n", "from ipythonblocks import BlockGrid, colors\n", "# import time\n", "# from IPython.display import clear_output\n", "\n", "def print_chart(chart, num_of_tabs=1):\n", " \"\"\"\n", " Nicht notwendige, selbstgebastelte Funktion zum Drucken von Charts.\n", " Wandelt Sets (aus optischen Gr\u00fcnden) in Listen um und kann diese\n", " einger\u00fcckt (mit Tabulator) darstellen.\n", " \"\"\"\n", " tabstring = '\\t' * num_of_tabs\n", " for row in chart:\n", " print tabstring+\"\\t\".join([str(list(element)) for element in row])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "words = [\"I\", \"shot\", \"an\", \"elephant\", \"in\", \"my\", \"pajamas\"]\n", "n = len(words)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "grammar = nltk.parse_cfg(\"\"\"\n", " S -> NP VP\n", " PP -> P NP\n", " NP -> Det N | 'I' | NP PP\n", " VP -> V NP | VP PP\n", " Det -> 'an' | 'my'\n", " N -> 'elephant' | 'pajamas'\n", " V -> 'shot'\n", " P -> 'in'\n", " \"\"\")" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "# leere Chart bauen\n", "chart = []\n", "for i in range(n+1):\n", " # Erzeuge eine leere Zeile\n", " row = []\n", " # Haenge n+1 leere Mengen (set()) an\n", " for j in range(n+1):\n", " row.append(set())\n", " # Haenge die neue Zeile an die Chart an\n", " chart.append(row)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "grid = BlockGrid(n+1, n+1)\n", "known_terminals = set()\n", "\n", "# Terminalproduktionen\n", "# Iteriere ueber alle Positionen des Eingabesatzes\n", "for i in range(n):\n", " # Iteriere ueber alle Regeln der Grammatik, die das Wort words[i] als rechte Seite haben\n", " # z.B. words[4] = 'elephant' und N -> 'elephant'\n", " print \"productions for terminal '{}':\".format(words[i])\n", " for prod in grammar.productions(rhs=words[i]):\n", " print prod\n", " # Fuege in die Zelle fuer die Spanne von i bis i+1 die linke Regel (z.B. N) ein\n", " print \"\\t{0} added to chart[{1}][{2}]\\n\".format(prod.lhs(), i, i+1)\n", " grid[i, i+1] = colors['White']\n", " grid.flash(display_time=1.0)\n", " chart[i][i+1].add(prod.lhs())\n", " known_terminals.add((i, i+1))\n", " \n", "grid.show()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
" ], "metadata": {}, "output_type": "display_data", "text": [ "" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "grid = BlockGrid(n+1, n+1)\n", "seconds_per_step = 1.0\n", "known_lhs = set()\n", "\n", "# Iteriere ueber alle Spannen ab Laenge 2\n", "for width in range(2, n+1):\n", " # Iteriere ueber alle moeglichen Anfaenge einer Spanne\n", " for i in range(0, n-width+1): # i + width <= n\n", " # Iteriere ueber alle moeglichen Teilungspunkte dieser Spanne\n", " for j in range(1, width): # 1 <= j <= width-1\n", " # Jetzt wollen wir Nichtterminale aus 2 gegebenen Chartzellen kombinieren\n", " nts1 = chart[i][i+j]\n", " grid[i, i+j] = colors['Yellow'] # Nichtterminal 1: gelb\n", " \n", " # j ist der Teilungspunkt: eine Spanne geht von i bis i+j, die andere von\n", " # i+j bis i+width\n", " nts2 = chart[i+j][i+width]\n", " grid[i+j, i+width] = colors['Orange'] # Nichtterminal 2: orange\n", " \n", " print (\"\\ntry to combine nonterminals from [{0}][{1}] ({2})\"\n", " \" and [{3}][{4}] ({5})\".format(i, i+j, list(nts1), i+j, i+width, list(nts2))) \n", " \n", " # Iteriere nun ueber alle Nichtterminale der ersten Zelle\n", " for nt1 in nts1:\n", " # Bestimme die Kandidatenregeln, die ein Nichtterminal aus nts1 als\n", " # erstes Element auf ihrer rechten Regelseite haben\n", " # Beispiel: nts1 enthaelt NP => suche alle Regeln der Form X => NP Y,\n", " # z.B. S -> NP VP\n", " productions = grammar.productions(rhs=nt1)\n", " if productions:\n", " print \"\\tproductions, where NT '{0}' is 1st element of RHS: {1}\".format(nt1, productions)\n", " else:\n", " print \"\\tNO productions where nonterminal '{}' is the 1st element of RHS!\".format(nt1)\n", " # Gehe nun diese Kandidatenregeln durch\n", " for production in productions:\n", " # Pruefe, ob das 2. Element der rechten Regelseite (production.rhs()[1]),\n", " # (also im Beispiel oben VP) im anderen Kaestchen enthalten ist\n", " if production.rhs()[1] in nts2:\n", " # ok, fuege dann die linke Regelseite (also S) in das Zielkaestchen\n", " # chart[i][i+width] ein. Dieses enthaelt alle Nichtterminale, die die\n", " # Eingabe zwischen den Positionen i und i+width ableitet.\n", " print (\"\\t\\t2nd element of production RHS '{0}' is also in '{1}'\"\n", " \"\\n\\t\\tLHS will be added to [{2}][{3}]\".format(production, list(nts2), i, i+width))\n", " chart[i][i+width].add(production.lhs())\n", " grid[i, i+width] = colors['Green']\n", " known_lhs.add((i, i+width)) # wir merken uns schon gefundene LHS\n", " else:\n", " print \"\\t\\t2nd element of production RHS '{0}' NOT in '{1}'!\".format(production, list(nts2))\n", "\n", " grid.flash(display_time=seconds_per_step) # Nichtterminale aufleuchten f\u00fcr X Sekunden\n", " # Farben zuruecksetzen\n", " for (row, column) in ((i, i+j), (i+j, i+width)):\n", " if (row, column) in known_terminals:\n", " grid[row, column] = colors['White'] # wenn schon ein Nichtterminal in diesem K\u00e4stchen gespeichert ist\n", " elif (row, column) in known_lhs:\n", " grid[row, column] = colors['Green'] # wenn schon eine RHS in diesem K\u00e4stchen gespeichert ist\n", " else:\n", " grid[row, column] = colors['Black']\n", " \n", "grid.show()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
" ], "metadata": {}, "output_type": "display_data", "text": [ "" ] } ], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "print_chart(chart)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\t[]\t[NP]\t[]\t[]\t[S]\t[]\t[]\t[S]\n", "\t[]\t[]\t[V]\t[]\t[VP]\t[]\t[]\t[VP]\n", "\t[]\t[]\t[]\t[Det]\t[NP]\t[]\t[]\t[NP]\n", "\t[]\t[]\t[]\t[]\t[N]\t[]\t[]\t[]\n", "\t[]\t[]\t[]\t[]\t[]\t[P]\t[]\t[PP]\n", "\t[]\t[]\t[]\t[]\t[]\t[]\t[Det]\t[NP]\n", "\t[]\t[]\t[]\t[]\t[]\t[]\t[]\t[N]\n", "\t[]\t[]\t[]\t[]\t[]\t[]\t[]\t[]\n" ] } ], "prompt_number": 16 } ], "metadata": {} } ] }