{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "from nltk.corpus.reader import CategorizedPlaintextCorpusReader\n",
    "\n",
    "# Machine-specific location of the *.txt corpus files; edit this one constant to relocate the data.\n",
    "CORPUS_ROOT = '/media/storage/dpla-data/new/words-6mill/colls/'\n",
    "\n",
    "# cat_pattern captures each file's base name as its category (gpo.txt -> 'gpo').\n",
    "reader = CategorizedPlaintextCorpusReader(CORPUS_ROOT, r'.*\\.txt', cat_pattern=r'(\\w+)\\.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": { "collapsed": false },
   "outputs": [
    {
     "data": { "text/plain": [ "['gpo']" ] },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: the category assigned to gpo.txt should be its base name.\n",
    "reader.categories('gpo.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "gpowords = reader.words('gpo.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": { "collapsed": false },
   "outputs": [
    {
     "data": { "text/plain": [ "36646650" ] },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total number of tokens in the gpo collection.\n",
    "len(gpowords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": { "collapsed": false },
   "outputs": [
    {
     "data": { "text/plain": [ "438479" ] },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Vocabulary size: distinct lower-cased tokens. The set comprehension avoids\n",
    "# materialising a throwaway list of every token before de-duplicating.\n",
    "len({w.lower() for w in gpowords})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "# Needed by every later cell that references nltk.ConditionalFreqDist / nltk.FreqDist.\n",
    "import nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
},
   "outputs": [],
   "source": [
    "# Count each token under the category it came from so the collections can be compared.\n",
    "cfd = nltk.ConditionalFreqDist(\n",
    "    (genre, word)\n",
    "    for genre in ['gpo', 'artstor']\n",
    "    for word in reader.words(categories=genre))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "# ConditionalFreqDist itself has no most_common(); each condition maps to a FreqDist, which does.\n",
    "# Limit to the top 20 per category so the output stays readable.\n",
    "{genre: cfd[genre].most_common(20) for genre in cfd.conditions()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": { "collapsed": false },
   "outputs": [],
   "source": [
    "fd = nltk.FreqDist(reader.words(categories='gpo'))\n",
    "# Samples are word strings, so an integer key such as fd[1] is never present and just yields 0.\n",
    "# Show the most frequent tokens instead.\n",
    "fd.most_common(20)"
   ]
  },
  { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "ename": "UnicodeDecodeError", "evalue": "'ascii' codec can't decode byte 0xb0 in position 0: ordinal not in range(128)", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mUnicodeDecodeError\u001b[0m Traceback (most recent call last)", "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mfd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 
"\u001b[1;32m/home/charper/anaconda/lib/python2.7/site-packages/nltk/probability.pyc\u001b[0m in \u001b[0;36mplot\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 313\u001b[0m \u001b[1;32mdel\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"title\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 314\u001b[0m \u001b[0mpylab\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfreqs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 315\u001b[1;33m \u001b[0mpylab\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mxticks\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msamples\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0municode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ms\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0ms\u001b[0m \u001b[1;32min\u001b[0m \u001b[0msamples\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrotation\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m90\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 316\u001b[0m \u001b[0mpylab\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mxlabel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Samples\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 317\u001b[0m \u001b[0mpylab\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mylabel\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mylabel\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;31mUnicodeDecodeError\u001b[0m: 'ascii' codec can't decode byte 0xb0 in position 0: ordinal not in range(128)" ] }, { "data": { "image/png": [ "iVBORw0KGgoAAAANSUhEUgAAAZ4AAAEACAYAAACDEBA8AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\n", 
"AAALEgAACxIB0t1+/AAAIABJREFUeJzt3X9slPdhBvDnLHtDVQpH8GqoD8nx/bADdmxvcEYTkea5\n", "vivtAlRgbLaBmzCBPQ2RKkI01dKwSvSgXVQlGUFVB8Kw4aOgBNBUG3sUK6zAZRBQN8hUL1wInA8r\n", "sc/MpjHn4Gd/OLxfiAkQfrw+7p6PhMT7vfd973uPsZ+833vPcZAkREREbJI13hMQEZHMouIRERFb\n", "qXhERMRWKh4REbGVikdERGyl4hEREVvdtniGhoZQWVmJ8vJyzJgxAy+++CIAYP369XC5XKioqEBF\n", "RQVaW1utY0KhELxeL4qLi9He3m6Nnzx5EqWlpfB6vVizZo01fvXqVdTV1cHr9WLOnDk4f/689Vhz\n", "czN8Ph98Ph927NhhjUejUVRWVsLr9aK+vh7Dw8P3n4SIiNiDd3DlyhWS5PDwMCsrK3nkyBGuX7+e\n", "r7zyyph9z5w5w7KyMiaTSUajUbrdbo6MjJAkZ8+ezUgkQpKcN28eW1tbSZKbN29mU1MTSTIcDrOu\n", "ro4k2dvby8LCQiYSCSYSCRYWFrK/v58kWVtby927d5MkGxsbuWXLlju9DBERSRF3XGr7yle+AgBI\n", "JpO4du0aJk+efL2wxuy7f/9+LF26FDk5OSgoKIDH40EkEkE8HsfAwAD8fj8AYPny5di3bx8A4MCB\n", "A2hoaAAALFq0CIcOHQIAHDx4EIFAAE6nE06nEzU1NWhtbQVJHD58GIsXLwYANDQ0WOcSEZHUd8fi\n", "GRkZQXl5OfLy8lBVVYWZM2cCAF5//XWUlZVhxYoV6O/vBwB0d3fD5XJZx7pcLsRisTHj+fn5iMVi\n", "AIBYLIbp06cDALKzszFp0iT09vZ+4bn6+vrgdDqRlZU15lwiIpL67lg8WVlZOH36NC5evIi3334b\n", "nZ2daGpqQjQaxenTpzFt2jS88MILdswVDofDlucREZGHJ/tud5w0aRK+/e1v48SJE/izP/sza/xv\n", "/uZv8MwzzwAYvfq4cOGC9djFixfhcrmQn5+Pixcvjhm/fsyHH36Ir3/96/j0009x+fJlTJkyBfn5\n", "+ejs7LSOuXDhAv78z/8cjz/+OPr7+zEyMoKsrCxcvHgR+fn5Y+br8Xjw/vvv33UQIiICuN1u/O//\n", "/u9DfY7bXvF8/PHH1jLaJ598go6ODlRUVODSpUvWPm+99RZKS0sBAPPnz0c4HEYymUQ0GkVXVxf8\n", "fj+mTp2KiRMnIhKJgCR27tyJBQsWWMc0NzcDAPbu3Yvq6moAQCAQQHt7O/r7+5FIJNDR0YFgMAiH\n", "w4Gqqirs2bMHwOidbwsXLhwz9/fffx8k9YfEyy+/PO5zSJU/ykJZKIvb/7HjP9hve8UTj8fR0NCA\n", "kZERjIyMYNmyZaiursby5ctx+vRpOBwOPPHEE/j5z38OAJgxYwaWLFmCGTNmIDs7G2+88Ya1PPbG\n", "G2/gu9/9Lj755BN861vfwje/+U0AwIoVK7Bs2TJ4vV5MmTIF4XAYAPD444/jpZdewuzZswEAL7/8\n", "MpxOJwBg06ZNqK+vx9///d/jj//4j7FixYqHk06aGBoaGu8ppAxlYSgLQ1nY67bFU1painfffXfM\n", "+I2fqfm8H/zgB/jBD34wZvxP/uRP8F//9V9jxv/wD/8Qv/zlL295rmeffRbPPvvsmPEnnngCkUjk\n", "dlMXEZEUpd9ckAGuX12KsriRsjCUhb0cJNPyfwTncDiQpi9NROShseNnp654MsD1G0REWdxIWRjK\n", "wl4qHhERsZWW2kRExKKlNhERSTsqngyg9WtDWRjKwlAW9lLxiIiIrfQej4iIWPQej4iIpB0VTwbQ\n", 
"+rWhLAxlYSgLe6l4RETEVnqPR0RELHqPR0RE0o6KJwNo/dpQFoayMJSFvVQ8IiJiK73HIyIiFr3H\n", "IyIiaSeti+fDD8d7BqlB69eGsjCUhaEs7JXWxXPs2HjPQEREPi+ti0dGOZ3O8Z5CylAWhrIwlIW9\n", "VDwiImIrFU8G0Pq1oSwMZWEoC3vdtniGhoZQWVmJ8vJyzJgxAy+++CIAoK+vDzU1NfD5fAgEAjd9\n", "0UKhELxeL4qLi9He3m6Nnzx5EqWlpfB6vVizZo01fvXqVdTV1cHr9WLOnDk4f/689VhzczN8Ph98\n", "Ph927NhhjUejUVRWVsLr9aK+vh7Dw8P3n4SIiNiDd3DlyhWS5PDwMCsrK3nkyBGuXbuWmzZtIklu\n", "3LiR69atI0meOXOGZWVlTCaTjEajdLvdHBkZIUnOnj2bkUiEJDlv3jy2traSJDdv3sympiaSZDgc\n", "Zl1dHUmyt7eXhYWFTCQSTCQSLCwsZH9/P0mytraWu3fvJkk2NjZyy5YtY+YNgOHwnV6diIjc6C5q\n", "4b7dcantK1/5CgAgmUzi2rVrmDx5Mg4cOICGhgYAQENDA/bt2wcA2L9/P5YuXYqcnBwUFBTA4/Eg\n", "EokgHo9jYGAAfr8fALB8+XLrmBvPtWjRIhw6dAgAcPDgQQQCATidTjidTtTU1KC1tRUkcfjwYSxe\n", "vHjM84uISOq7Y/GMjIygvLwceXl5qKqqwsyZM9HT04O8vDwAQF5eHnp6egAA3d3dcLlc1rEulwux\n", "WGzMeH5+PmKxGAAgFoth+vTpAIDs7GxMmjQJvb29X3iuvr4+OJ1OZGVljTmX3JrWrw1lYSgLQ1nY\n", "K/tOO2RlZeH06dO4fPkygsEgDh8+fNPjDocDDofjoU3w88/1Zbz55vfx3nsTAACzZs3C3Llzrdsm\n", "r/9D03ZmbV+XKvMZz+3BwcGUms94bg8ODqbUfOzc7uzsRFtbGwBgwoTRn5cP3ZdZl/vRj37En/70\n", "pywqKmI8HidJdnd3s6ioiCQZCoUYCoWs/YPBII8fP854PM7i4mJrfNeuXWxsbLT2OXbsGMnR95Fy\n", "c3NJki0tLVy1apV1zMqVKxkOhzkyMsLc3Fxeu3aNJHn06FEGg8Exc4Xe4xER+dK+ZC3ck9sutX38\n", "8cdWM37yySfo6OhARUUF5s+fj+bmZgCjd54tXLgQADB//nyEw2Ekk0lEo1F0dXXB7/dj6tSpmDhx\n", "IiKRCEhi586dWLBggXXM9XPt3bsX1dXVAIBAIID29nb09/cjkUigo6MDwWAQDocDVVVV2LNnz5jn\n", "FxGRR8DtWum3v/0tKyoqWFZWxtLSUv7kJz8hOXrHWXV1Nb1eL2tqaphIJKxjNmzYQLfbzaKiIra1\n", "tVnjJ06cYElJCd1uN1evXm2NDw0Nsba2lh6Ph5WVlYxGo9Zj27Zto8fjocfj4fbt263xc+fO0e/3\n", "0+PxcMmSJUwmk2PmDl3xWG78+mQ6ZWEoC0NZGHeohQcirf+3COEwUVc33jMZf/39/fqVIJ9RFoay\n", "MJSFof8tgjwQ+oYylIWhLAxlYS8Vj4iI2ErFkwH0GQVDWRjKwlAW9lLxiIiIrVQ8GUDr14ayMJSF\n", "oSzspeIRERFbqXgygNavDWVhKAtDWdhLxSMiIrZS8WQArV8bysJQFoaysJeKR0REbKXiyQBavzaU\n", "haEsDGVhLxWPiIjYSsWTAbR+bSgLQ1kYysJeKh4REbGViicDaP3aUBaGsjCUhb1UPCIiYisVTwbQ\n", "+rWhLAxlYSgLe6l4RETEViqeDKD1a0NZGMrCUBb2UvGIiIitVDwZQOvXhrIwlIWhLOyl4hEREVup\n", 
"eDKA1q8NZWEoC0NZ2Ou2xXPhwgVUVVVh5syZKCkpwWuvvQYAWL9+PVwuFyoqKlBRUYHW1lbrmFAo\n", "BK/Xi+LiYrS3t1vjJ0+eRGlpKbxeL9asWWONX716FXV1dfB6vZgzZw7Onz9vPdbc3Ayfzwefz4cd\n", "O3ZY49FoFJWVlfB6vaivr8fw8PD9JyEiIvbgbcTjcZ46dYokOTAwQJ/Px7Nnz3L9+vV85ZVXxux/\n", "5swZlpWVMZlMMhqN0u12c2RkhCQ5e/ZsRiIRkuS8efPY2tpKkty8eTObmppIkuFwmHV1dSTJ3t5e\n", "FhYWMpFIMJFIsLCwkP39/STJ2tpa7t69myTZ2NjILVu2jJkLAIbDt3t1IiLyeXeohQfitlc8U6dO\n", "RXl5OQDgsccew5NPPolYLHa9sMbsv3//fixduhQ5OTkoKCiAx+NBJBJBPB7HwMAA/H4/AGD58uXY\n", "t28fAODAgQNoaGgAACxatAiHDh0CABw8eBCBQABOpxNOpxM1NTVobW0FSRw+fBiLFy8GADQ0NFjn\n", "EhGR1HfX7/F88MEHOHXqFObMmQMAeP3111FWVoYVK1ZY66Pd3d1wuVzWMS6XC7FYbMx4fn6+VWCx\n", "WAzTp08HAGRnZ2PSpEno7e39wnP19fXB6XQiKytrzLnk1rR+bSgLQ1kYysJe2Xez0+DgIBYvXoxX\n", "X30Vjz32GJqamvDDH/4QAPDSSy/hhRdewNatWx/qRAHA4XB8qf3ffPP7eO+9CQCAWbNmYe7cudZt\n", "k9f/oWk7s7avS5X5jOf24OBgSs1nPLcHBwdTaj52bnd2dqKtrQ0AMGHC6M/Lh+5Oa3HJZJKBQIA/\n", "+9nPbvl4NBplSUkJSTIUCjEUClmPBYNBHj9+nPF4nMXFxdb4rl272NjYaO1z7NgxkuTw8DBzc3NJ\n", "ki0tLVy1apV1zMqVKxkOhzkyMsLc3Fxeu3aNJHn06FEGg8Ex84Le4xER+dLuohbu222X2khixYoV\n", "mDFjBp5//nlrPB6PW39/6623UFpaCgCYP38+wuEwkskkotEourq64Pf7MXXqVEycOBGRSAQksXPn\n", "TixYsMA6prm5GQCwd+9eVFdXAwACgQDa29vR39+PRCKBjo4OBINBOBwOVFVVYc+ePQBG73xbuHDh\n", "AytiERF5yG7XSkeOHKHD4WBZWRnLy8tZXl7OX/3qV1y2bBlLS0v51FNPccGCBbx06ZJ1zIYNG+h2\n", "u1lUVMS2tjZr/MSJEywpKaHb7ebq1aut8aGhIdbW1tLj8bCyspLRaNR6bNu2bfR4PPR4PNy+fbs1\n", "fu7cOfr9fno8Hi5ZsoTJZHLM3KErHksikRjvKaQMZWEoC0NZGHeohQfC8dkTpR2Hw4FwmKirG++Z\n", "jL/+/n79SpDPKAtDWRjKwnA4HLe8a/lB0m8uyAD6hjKUhaEsDGVhLxWPiIjYSsWTAfQZBUNZGMrC\n", "UBb2UvGIiIitVDwZQOvXhrIwlIWhLOyl4hEREVupeDKA1q8NZWEoC0NZ2EvFIyIitlLxZACtXxvK\n", "wlAWhrKwl4pHRERspeLJAFq/NpSFoSwMZWEvFY+IiNhKxZMBtH5tKAtDWRjKwl4qHhERsZWKJwNo\n", "/dpQFoayMJSFvVQ8IiJiKxVPBtD6taEsDGVhKAt7qXhERMRWKp4MoPVrQ1kYysJQFvZS8YiIiK1U\n", "PBlA69eGsjCUhaEs7KXiERERW6l4MoDWrw1lYSgLQ1nY67bFc+HCBVRVVWHmzJkoKSnBa6+9BgDo\n", "6+tDTU0NfD4fAoHATV+0UCgEr9eL4uJitLe3W+MnT55EaWkpvF4v1qxZY41fvXoVdXV18Hq9mDNn\n", 
"Ds6fP2891tzcDJ/PB5/Phx07dljj0WgUlZWV8Hq9qK+vx/Dw8P0nISIi9uBtxONxnjp1iiQ5MDBA\n", "n8/Hs2fPcu3atdy0aRNJcuPGjVy3bh1J8syZMywrK2MymWQ0GqXb7ebIyAhJcvbs2YxEIiTJefPm\n", "sbW1lSS5efNmNjU1kSTD4TDr6upIkr29vSwsLGQikWAikWBhYSH7+/tJkrW1tdy9ezdJsrGxkVu2\n", "bBkzdwAMh2/36kRE5PPuUAsPxG2veKZOnYry8nIAwGOPPYYnn3wSsVgMBw4cQENDAwCgoaEB+/bt\n", "AwDs378fS5cuRU5ODgoKCuDxeBCJRBCPxzEwMAC/3w8AWL58uXXMjedatGgRDh06BAA4ePAgAoEA\n", "nE4nnE4nampq0NraCpI4fPgwFi9ePOb5RUQk9d31ezwffPABTp06hcrKSvT09CAvLw8AkJeXh56e\n", "HgBAd3c3XC6XdYzL5UIsFhsznp+fj1gsBgCIxWKYPn06ACA7OxuTJk1Cb2/vF56rr68PTqcTWVlZ\n", "Y84lt6b1a0NZGMrCUBb2yr6bnQYHB7Fo0SK8+uqr+OpXv3rTYw6HAw6H46FM7vO+7PO8+eb38d57\n", "EwAAs2bNwty5c63bJq//Q9N2Zm1flyrzGc/twcHBlJrPeG4PDg6m1Hzs3O7s7ERbWxsAYMKE0Z+X\n", "D92d1uKSySQDgQB/9rOfWWNFRUWMx+Mkye7ubhYVFZEkQ6EQQ6GQtV8wGOTx48cZj8dZXFxsje/a\n", "tYuNjY3WPseOHSNJDg8PMzc3lyTZ0tLCVatWWcesXLmS4XCYIyMjzM3N5bVr10iSR48eZTAYHDNv\n", "6D0eEZEv7S5q4b7ddqmNJFasWIEZM2bg+eeft8bnz5+P5uZmAKN3ni1cuNAaD4fDSCaTiEaj6Orq\n", "gt/vx9SpUzFx4kREIhGQxM6dO7FgwYIx59q7dy+qq6sBAIFAAO3t7ejv70cikUBHRweCwSAcDgeq\n", "qqqwZ8+eMc8vIiKPgNu10pEjR+hwOFhWVsby8nKWl5eztbWVvb29rK6uptfrZU1NDROJhHXMhg0b\n", "6Ha7WVRUxLa2Nmv8xIkTLCkpodvt5urVq63xoaEh1tbW0uPxsLKyktFo1Hps27Zt9Hg89Hg83L59\n", "uzV+7tw5+v1+ejweLlmyhMlkcszcoSsey41fn0ynLAxlYSgL4w618EA4PnuitONwOBAOE3V14z2T\n", "8dff369fCfIZZWEoC0NZGA6HAw+7FvSbCzKAvqEMZWEoC0NZ2EvFIyIitlLxZAB9RsFQFoayMJSF\n", "vVQ8IiJiKxVPBtD6taEsDGVhKAt7qXhERMRWKp4MoPVrQ1kYysJQFvZS8YiIiK1UPBlA69eGsjCU\n", "haEs7KXiERERW6l4MoDWrw1lYSgLQ1nYS8UjIiK2UvFkAK1fG8rCUBaGsrCXikdERGyl4skAWr82\n", "lIWhLAxlYS8Vj4iI2ErFkwG0fm0oC0NZGMrCXioeERGxlYonA2j92lAWhrIwlIW9VDwiImIrFU8G\n", "0Pq1oSwMZWEoC3upeERExFYqngyg9WtDWRjKwlAW9rpj8Tz33HPIy8tDaWmpNbZ+/Xq4XC5UVFSg\n", "oqICra2t1mOhUAherxfFxcVob2+3xk+ePInS0lJ4vV6sWbPGGr969Srq6urg9XoxZ84cnD9/3nqs\n", "ubkZPp8PPp8PO3bssMaj0SgqKyvh9XpRX1+P4eHhe09ARETsxTt4++23+e6777KkpMQaW79+PV95\n", "5ZUx+545c4ZlZWVMJpOMRqN0u90cGRkhSc6ePZuRSIQkOW/ePLa2tpIkN2/ezKamJpJkOBxmXV0d\n", 
"SbK3t5eFhYVMJBJMJBIsLCxkf38/SbK2tpa7d+8mSTY2NnLLli1j5gKA4fCdXp2IiNzoLmrhvt3x\n", "iufpp5/G5MmTb1VYY8b279+PpUuXIicnBwUFBfB4PIhEIojH4xgYGIDf7wcALF++HPv27QMAHDhw\n", "AA0NDQCARYsW4dChQwCAgwcPIhAIwOl0wul0oqamBq2trSCJw4cPY/HixQCAhoYG61wiIpL67vk9\n", "ntdffx1lZWVYsWKFtT7a3d0Nl8tl7eNyuRCLxcaM5+fnIxaLAQBisRimT58OAMjOzsakSZPQ29v7\n", "hefq6+uD0+lEVlbWmHPJrWn92lAWhrIwlIW9su/loKamJvzwhz8EALz00kt44YUXsHXr1gc6sVtx\n", "OBxfav833/w+3ntvAgBg1qxZmDt3rnXb5PV/aNrOrO3rUmU+47k9ODiYUvMZz+3BwcGUmo+d252d\n", "nWhrawMATJgw+vPyobub9bhoNHrTezxf9FgoFGIoFLIeCwaDPH78OOPxOIuLi63xXbt2sbGx0drn\n", "2LFjJMnh4WHm5uaSJFtaWrhq1SrrmJUrVzIcDnNkZIS5ubm8du0aSfLo0aMMBoNj5gW9xyMi8qXd\n", "ZS3cl3taaovH49bf33rrLeuOt/nz5yMcDiOZTCIajaKrqwt+vx9Tp07FxIkTEYlEQBI7d+7EggUL\n", "rGOam5sBAHv37kV1dTUAIBAIoL29Hf39/UgkEujo6EAwGITD4UBVVRX27NkDYPTOt4ULF95r74qI\n", "iN3u1Ez19fWcNm0ac3Jy6HK5uHXrVi5btoylpaV86qmnuGDBAl66dMnaf8OGDXS73SwqKmJbW5s1\n", "fuLECZaUlNDtdnP16tXW+NDQEGtra+nxeFhZWcloNGo9tm3bNno8Hno8Hm7fvt0aP3fuHP1+Pz0e\n", "D5csWcJkMjlm3tAVjyWRSIz3FFKGsjCUhaEsjLuohfvm+OyJ0o7D4UA4TNTVjfdMxl9/f79+Jchn\n", "lIWhLAxlYTgcjlvetfwg6TcXZAB9QxnKwlAWhrKwl4pHRERspeLJAPqMgqEsDGVhKAt7qXhERMRW\n", "Kp4MoPVrQ1kYysJQFvZS8YiIiK1UPBlA69eGsjCUhaEs7KXiERERW6l4MoDWrw1lYSgLQ1nYS8Uj\n", "IiK2UvFkAK1fG8rCUBaGsrCXikdERGyl4skAWr82lIWhLAxlYS8Vj4iI2ErFkwG0fm0oC0NZGMrC\n", "XioeERGxlYonA2j92lAWhrIwlIW9VDwiImIrFU8G0Pq1oSwMZWEoC3upeERExFYqngyg9WtDWRjK\n", "wlAW9lLxiIiIrdK6eMjxnkFq0Pq1oSwMZWEoC3vdsXiee+455OXlobS01Brr6+tDTU0NfD4fAoHA\n", "TV+0UCgEr9eL4uJitLe3W+MnT55EaWkpvF4v1qxZY41fvXoVdXV18Hq9mDNnDs6fP2891tzcDJ/P\n", "B5/Phx07dljj0WgUlZWV8Hq9qK+vx/Dw8L0nICIitrpj8Tz77LNoa2u7aWzjxo2oqanB7373O1RX\n", "V2Pjxo0AgLNnz2L37t04e/Ys2tra8Ld/+7fgZ5cdTU1N2Lp1K7q6utDV1WWdc+vWrZgyZQq6urrw\n", "ve99D+vWrQMwWm4/+tGP8M477+Cdd97BP/zDP+Dy5csAgHXr1uGFF15AV1cXJk+ejK1bt95y7rri\n", "GaX1a0NZGMrCUBb2umPxPP3005g8efJNYwcOHEBDQwMAoKGhAfv27QMA7N+/H0uXLkVOTg4KCgrg\n", "8XgQiUQQj8cxMDAAv98PAFi+fLl1zI3nWrRoEQ4dOgQAOHjwIAKBAJxOJ5xOJ2pqatDa2gqSOHz4\n", 
"MBYvXjzm+UVEJPXd03s8PT09yMvLAwDk5eWhp6cHANDd3Q2Xy2Xt53K5EIvFxozn5+cjFosBAGKx\n", "GKZPnw4AyM7OxqRJk9Db2/uF5+rr64PT6URWVtaYc32ernhGaf3aUBaGsjCUhb2y7/cEDocDDofj\n", "Qczlrp7ry9i37/v43e8mAABmzZqFuXPnWpfU1/+haTuztq9LlfmM5/bg4GBKzWc8twcHB1NqPnZu\n", "d3Z2Wm99TJgw+vPyoeNdiEajLCkpsbaLiooYj8dJkt3d3SwqKiJJhkIhhkIha79gMMjjx48zHo+z\n", "uLjYGt+1axcbGxutfY4dO0aSHB4eZm5uLkmypaWFq1atso5ZuXIlw+EwR0ZGmJuby2vXrpEkjx49\n", "ymAwOGbOAPiv/3o3r05ERK67y1q4L/e01DZ//nw0NzcDGL3zbOHChdZ4OBxGMplENBpFV1cX/H4/\n", "pk6diokTJyISiYAkdu7ciQULFow51969e1FdXQ0ACAQCaG9vR39/PxKJBDo6OhAMBuFwOFBVVYU9\n", "e/aMef6xpXovr05ERB6qOzVTfX09p02bxpycHLpcLm7bto29vb2srq6m1+tlTU0NE4mEtf+GDRvo\n", "drtZVFTEtrY2a/zEiRMsKSmh2+3m6tWrrfGhoSHW1tbS4/GwsrKS0WjUemzbtm30eDz0eDzcvn27\n", "NX7u3Dn6/X56PB4uWbKEyWRyzLwB8F/+5cv2cHq68euT6ZSFoSwMZWHcRS3cN8dnT5R2HA4Hdu4k\n", "/vqvx3sm46+/v1+3i35GWRjKwlAWhsPhwMOuhbT+zQUySt9QhrIwlIWhLOyl4hEREVuldfGk5yLi\n", "l6fPKBjKwlAWhrKwV1oXj4iIpJ60Lh5d8YzS+rWhLAxlYSgLe6V18YiISOpJ6+LRFc8orV8bysJQ\n", "FoaysFdaF4+IiKQeFU8G0Pq1oSwMZWEoC3uldfFoqU1EJPWkdfHIKK1fG8rCUBaGsrBXWhePrnhE\n", "RFJPWhePjNL6taEsDGVhKAt7qXhERMRWaV08WmobpfVrQ1kYysJQFvZK6+IREZHUk9bFoyueUVq/\n", "NpSFoSwMZWGvtC4eERFJPWldPLriGaX1a0NZGMrCUBb2SuviERGR1KPiyQBavzaUhaEsDGVhr7Qu\n", "Hi21iYiknrQuHhml9WtDWRjKwlAW9rqv4ikoKMBTTz2FiooK+P1+AEBfXx9qamrg8/kQCARu+oKG\n", "QiF4vV4UFxejvb3dGj958iRKS0vh9XqxZs0aa/zq1auoq6uD1+vFnDlzcP78eeux5uZm+Hw++Hw+\n", "7Nix45bz0xWPiEgK4n0oKChgb2/vTWNr167lpk2bSJIbN27kunXrSJJnzpxhWVkZk8kko9Eo3W43\n", "R0ZGSJKzZ89mJBIhSc6bN4+tra0kyc2bN7OpqYkkGQ6HWVdXR5Ls7e1lYWEhE4kEE4mE9fcbAeDP\n", "f34/r05EJPPcZy3clfteauPnLisOHDiAhoYGAEBDQwP27dsHANi/fz+WLl2KnJwcFBQUwOPxIBKJ\n", "IB6PY2BgwLpiWr58uXXMjedatGgRDh06BAA4ePAgAoEAnE4nnE4nampq0NbWdr8vRUREbHBfxeNw\n", "OPCNb3wDs2bNwi9+8QsAQE9PD/Ly8gAAeXl56OnpAQB0d3fD5XJZx7pcLsRisTHj+fn5iMViAIBY\n", "LIbp06cDALKzszFp0iT09vZ+4bk+T0tto7R+bSgLQ1kYysJe2fdz8G9+8xtMmzYNH330EWpqalBc\n", "XHzT4w6HAw6H474meD/a2r6PeHwCAGDWrFmYO3euddvk9X9o2s6s7etSZT7juT04OJhS8xnP7cHB\n", 
"wZSaj52Ab3Z2AAAJgElEQVTbnZ2d1orRhAmjPy8fuge1Zrd+/Xr+4z/+I4uKihiPx0mS3d3dLCoq\n", "IkmGQiGGQiFr/2AwyOPHjzMej7O4uNga37VrFxsbG619jh07RpIcHh5mbm4uSbKlpYWrVq2yjlm5\n", "ciXD4fBN8wHALVse1KsTEckMD7AWvtA9L7X9/ve/x8DAAADgypUraG9vR2lpKebPn4/m5mYAo3ee\n", "LVy4EAAwf/58hMNhJJNJRKNRdHV1we/3Y+rUqZg4cSIikQhIYufOnViwYIF1zPVz7d27F9XV1QCA\n", "QCCA9vZ29Pf3I5FIoKOjA8Fg8F5fioiI2Oiel9p6enrwne98BwDw6aef4q/+6q8QCAQwa9YsLFmy\n", "BFu3bkVBQQF++ctfAgBmzJiBJUuWYMaMGcjOzsYbb7xhLcO98cYb+O53v4tPPvkE3/rWt/DNb34T\n", "ALBixQosW7YMXq8XU6ZMQTgcBgA8/vjjeOmllzB79mwAwMsvv3zLTx7rPZ5R/f39+mT2Z5SFoSwM\n", "ZWEvx2eXVmnH4XDgjTeIpqbxnsn40zeVoSwMZWEoC8PhcIy5W/lB028uyAD6hjKUhaEsDGVhr7Qu\n", "nvS8lhMRebSldfHIKH1GwVAWhrIwlIW90rp4dMUjIpJ60rp4ZJTWrw1lYSgLQ1nYS8UjIiK2Suvi\n", "0VLbKK1fG8rCUBaGsrBXWhePiIiknrQuHl3xjNL6taEsDGVhKAt7pXXxiIhI6knr4tEVzyitXxvK\n", "wlAWhrKwV1oXj4iIpB4VTwbQ+rWhLAxlYSgLe6V18WipTUQk9aR18cgorV8bysJQFoaysFdaF4+u\n", "eEREUk9aF4+M0vq1oSwMZWEoC3uldfHoikdEJPWkdfHIKK1fG8rCUBaGsrCXikdERGyV1sWjpbZR\n", "Wr82lIWhLAxlYa+0Lh4REUk9j2zxtLW1obi4GF6vF5s2bbrlPrriGaX1a0NZGMrCUBb2eiSL59q1\n", "a/i7v/s7tLW14ezZs2hpacF777033tNKWf/xH/8x3lNIGcrCUBaGsrDXI1k877zzDjweDwoKCpCT\n", "k4P6+nrs379/vKeVsk6cODHeU0gZysJQFoaysFf2eE/gXsRiMUyfPt3adrlciEQiY/Y7eBCYPRv4\n", "gz8AHA47Z5haLl4EbhHPbaVrXrEY8M47D/68j2JesRjwn/85fs+fSpl1dwOp3D0TJgAlJeM9iwfn\n", "kSwex13+i+3oGP2T6fLyhrB163jPIjXk5Q3hn/95vGeRGpSFkZc3hF/8Yrxn8cWKi4F0ejfhkSye\n", "/Px8XLhwwdq+cOECXC7XTfu43W68/34K/SfVOOrpAYBb34CRaZSFoSyMVM/if/7HvitEt9v90J/D\n", "QT569359+umnKCoqwqFDh/D1r38dfr8fLS0tePLJJ8d7aiIicgeP5BVPdnY2/umf/gnBYBDXrl3D\n", "ihUrVDoiIo+IR/KKR0REHl2P5O3Ud3I3Hy59FDz33HPIy8tDaWmpNdbX14eamhr4fD4EAoGbPvgW\n", "CoXg9XpRXFyM9vZ2a/zkyZMoLS2F1+vFmjVrrPGrV6+irq4OXq8Xc+bMwfnz563Hmpub4fP54PP5\n", "sGPHjof8Su/swoULqKqqwsyZM1FSUoLXXnsNQGbmMTQ0hMrKSpSXl2PGjBl48cUXAWRmFtddu3YN\n", "FRUVeOaZZwBkbhYFBQV46qmnUFFRAb/fDyBFs2Ca+fTTT+l2uxmNRplMJllWVsazZ8+O97Tuydtv\n", "v813332XJSUl1tjatWu5adMmkuTGjRu5bt06kuSZM2dYVlbGZDLJaDRKt9vNkZERkuTs2bMZiURI\n", 
"kvPmzWNraytJcvPmzWxqaiJJhsNh1tXVkSR7e3tZWFjIRCLBRCJh/X08xeNxnjp1iiQ5MDBAn8/H\n", "s2fPZmweV65cIUkODw+zsrKSR44cydgsSPKVV17hX/7lX/KZZ54hmbnfJwUFBezt7b1pLBWzSLvi\n", "OXr0KIPBoLUdCoUYCoXGcUb3JxqN3lQ8RUVFvHTpEsnRH8ZFRUUkyR//+MfcuHGjtV8wGOSxY8fY\n", "3d3N4uJia7ylpYWrVq2y9jl+/DjJ0R9gubm5JMldu3axsbHROmbVqlVsaWl5SK/w3ixYsIAdHR0Z\n", "n8eVK1c4a9Ys/vd//3fGZnHhwgVWV1fz17/+Nf/iL/6CZOZ+nxQUFPDjjz++aSwVs0i7pbZbfbg0\n", "FouN44werJ6eHuTl5QEA8vLy0DN6Hyi6u7tvuqX8+uv+/Hh+fr6Vx41ZZWdnY9KkSejt7f3Cc6WK\n", "Dz74AKdOnUJlZWXG5jEyMoLy8nLk5eVZS5CZmsX3vvc9/PSnP0VWlvlxlqlZOBwOfOMb38CsWbPw\n", "i88+mJSKWTySd7Xdzt1+uDQdOByOjHq9ADA4OIhFixbh1VdfxVe/+tWbHsukPLKysnD69GlcvnwZ\n", "wWAQhw8fvunxTMni3/7t3/C1r30NFRUV6OzsvOU+mZIFAPzmN7/BtGnT8NFHH6GmpgbFxcU3PZ4q\n", "WaTdFc/dfLj0UZaXl4dLly4BAOLxOL72ta8BGPu6L168CJfLhfz8fFy8eHHM+PVjPvzwQwCjn426\n", "fPkypkyZkrIZDg8PY9GiRVi2bBkWLlwIILPzAIBJkybh29/+Nk6ePJmRWRw9ehQHDhzAE088gaVL\n", "l+LXv/41li1blpFZAMC0adMAAH/0R3+E73znO3jnnXdSM4v7WU9MRcPDwywsLGQ0GuXVq1cf6ZsL\n", "yLHv8axdu9Zalw2FQmPeKLx69SrPnTvHwsJC641Cv9/P48ePc2RkZMwbhdfXZVtaWm56o/CJJ55g\n", "IpFgX1+f9ffxNDIywmXLlvH555+/aTwT8/joo4+s5//973/Pp59+mv/+7/+ekVncqLOz03qPJxOz\n", "uHLlCv/v//6PJDk4OMg//dM/5cGDB1Myi7QrHpL81a9+RZ/PR7fbzR//+MfjPZ17Vl9fz2nTpjEn\n", "J4cul4vbtm1jb28vq6ur6fV6WVNTc9MXd8OGDXS73SwqKmJbW5s1fuLECZaUlNDtdnP16tXW+NDQ\n", "EGtra+nxeFhZWcloNGo9tm3bNno8Hno8Hm7fvt2W13s7R44cocPhYFlZGcvLy1leXs7W1taMzOO3\n", "v/0tKyoqWFZWxtLSUv7kJz8hyYzM4kadnZ3WXW2ZmMW5c+dYVlbGsrIyzpw50/rZl4pZ6AOkIiJi\n", "q7R7j0dERFKbikdERGyl4hEREVupeERExFYqHhERsZWKR0REbKXiERERW6l4RETEVv8P2jD4NZfK\n", "31IAAAAASUVORK5CYII=\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fd.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "cfd = nltk.ConditionalFreqDist(\n", "(genre, word)\n", "for genre in ['gpo', 'artstor']\n", "for word in reader.words(categories=genre))" ] } ], "metadata": { "kernelspec": { "display_name": 
"Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.3.5" } }, "nbformat": 4, "nbformat_minor": 0 }