{
 "metadata": {
  "name": "",
  "signature": "sha256:6a31859521d79339646e541eb5b57c38ebb33ff1b2110c1472ebb028bf97d337"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Illustrations:\n",
      "\n",
      "* Accuracy compared with features used\n",
      "* Format and benefits of a 'corpus'\n"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import nltk.classify.util\n",
      "from nltk.classify import NaiveBayesClassifier\n",
      "from nltk.corpus import movie_reviews\n",
      " "
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print(movie_reviews.readme())"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def word_feats(words):\n",
      "    return dict([(word, True) for word in words])\n",
      " \n",
      "negids = movie_reviews.fileids('neg')\n",
      "posids = movie_reviews.fileids('pos')\n",
      " \n",
      "negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]\n",
      "posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]\n",
      " \n",
      "negcutoff = len(negfeats)*1/16\n",
      "poscutoff = len(posfeats)*1/16\n",
      " "
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "negcutoff"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]\n",
      "testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]\n",
      "print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))\n",
      " \n",
      "classifier = NaiveBayesClassifier.train(trainfeats)\n",
      "print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)\n",
      "classifier.show_most_informative_features()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}