{
 "metadata": {
  "name": "",
  "signature": "sha256:6a31859521d79339646e541eb5b57c38ebb33ff1b2110c1472ebb028bf97d337"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Illustrations:\n",
      "\n",
      "* Accuracy compared with features used\n",
      "* Format and benefits of a 'corpus'\n"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import nltk.classify.util\n",
      "from nltk.classify import NaiveBayesClassifier\n",
      "from nltk.corpus import movie_reviews"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "print(movie_reviews.readme())"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def word_feats(words):\n",
      "    \"\"\"Return a dict mapping each distinct word in `words` to True.\n",
      "\n",
      "    The dict is used below as the featureset of one review.\n",
      "    \"\"\"\n",
      "    return {word: True for word in words}\n",
      "\n",
      "negids = movie_reviews.fileids('neg')\n",
      "posids = movie_reviews.fileids('pos')\n",
      "\n",
      "# One (featureset, label) pair per review file.\n",
      "negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]\n",
      "posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]\n",
      "\n",
      "# The cutoffs are used as slice indices, so they must be integers.\n",
      "# Floor division keeps them ints on Python 3 as well; plain `/` yields a\n",
      "# float there and the slices in the training cell raise TypeError.\n",
      "negcutoff = len(negfeats) // 16\n",
      "poscutoff = len(posfeats) // 16"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "negcutoff"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# The first 1/16 of each class is the training set; the rest is held out for testing.\n",
      "trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]\n",
      "testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]\n",
      "# Single-argument print(...) calls behave the same on Python 2 and Python 3.\n",
      "print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))\n",
      "\n",
      "classifier = NaiveBayesClassifier.train(trainfeats)\n",
      "print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testfeats))\n",
      "classifier.show_most_informative_features()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}