{
 "metadata": {
  "name": "Text Classification"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Text classification on k-means projected features\n",
      "\n",
      "Cluster the TF-IDF vectors of the 20 newsgroups training set with `MiniBatchKMeans`, project every document onto the cluster centers, and compare two classifiers trained on that dense representation.\n",
      "\n",
      "Numbers recorded by an earlier, unseeded run of this notebook (a fresh run may differ):\n",
      "\n",
      "* `ExtraTreesClassifier` (100 trees): out-of-bag score of about 0.677\n",
      "* `PassiveAggressiveClassifier`: grid search picked `C = 1` with a 3-fold cross-validation score of about 0.832"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "%matplotlib inline\n",
      "import numpy as np\n",
      "import pylab as pl\n",
      "\n",
      "from sklearn.cluster import MiniBatchKMeans\n",
      "from sklearn.datasets import fetch_20newsgroups\n",
      "from sklearn.ensemble import ExtraTreesClassifier\n",
      "from sklearn.feature_extraction.text import TfidfVectorizer\n",
      "from sklearn.linear_model import PassiveAggressiveClassifier\n",
      "from sklearn.utils.extmath import safe_sparse_dot\n",
      "\n",
      "try:\n",
      "    from sklearn.model_selection import GridSearchCV\n",
      "except ImportError:  # older scikit-learn releases only ship sklearn.grid_search\n",
      "    from sklearn.grid_search import GridSearchCV"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Everything a reader might want to tune lives here.\n",
      "SEED = 42          # shared random_state so Restart & Run All is repeatable\n",
      "N_CLUSTERS = 1000  # number of k-means centers = width of the projected feature matrix\n",
      "BATCH_SIZE = 1000  # documents per MiniBatchKMeans update"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Load and vectorize the data\n",
      "\n",
      "Terms that occur in more than half of the documents are dropped (`max_df=0.5`)."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "twenty_train = fetch_20newsgroups(subset='train')\n",
      "vec = TfidfVectorizer(max_df=0.5)\n",
      "X_train = vec.fit_transform(twenty_train.data)\n",
      "y_train = twenty_train.target\n",
      "X_train.shape"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Learn the cluster centers\n",
      "\n",
      "Labels are not needed here (`compute_labels=False`); only `cluster_centers_` is used below. In the earlier run this fit took about 2 min 44 s of wall time."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "kmeans = MiniBatchKMeans(n_clusters=N_CLUSTERS, batch_size=BATCH_SIZE, reassignment_ratio=0,\n",
      "                         n_init=1, max_iter=10, compute_labels=False, verbose=0,\n",
      "                         random_state=SEED)\n",
      "%time kmeans.fit(X_train)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Project the documents onto the centers\n",
      "\n",
      "`X_train` is a SciPy sparse matrix, and `np.dot(X_train, kmeans.cluster_centers_.T)` fails on it with `ValueError: setting an array element with a sequence.`, so the product is computed with `safe_sparse_dot` instead."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "X_train_proj = safe_sparse_dot(X_train, kmeans.cluster_centers_.T)\n",
      "# One row per document, one column per cluster center.\n",
      "assert X_train_proj.shape == (X_train.shape[0], N_CLUSTERS)\n",
      "X_train_proj.shape"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Extra-trees baseline\n",
      "\n",
      "Bootstrapped trees give an out-of-bag score without a separate validation split. The earlier run needed about 2 min 5 s of wall time for this fit."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "extra_trees = ExtraTreesClassifier(n_estimators=100, max_depth=100, n_jobs=-1,\n",
      "                                   oob_score=True, bootstrap=True, random_state=SEED)\n",
      "%time clf = extra_trees.fit(X_train_proj, y_train)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "clf.oob_score_"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Linear model with a grid search over `C`\n",
      "\n",
      "The earlier run finished this search in about 17.7 s of wall time."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "param_grid = {'C': [0.01, 0.1, 1, 10, 100]}\n",
      "gs = GridSearchCV(PassiveAggressiveClassifier(random_state=SEED), param_grid, cv=3, n_jobs=-1)\n",
      "%time gs.fit(X_train_proj, y_train)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "gs.best_params_"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "gs.best_score_"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}