{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Incremental learning with `partial_fit`\n",
      "=======================================\n",
      "\n",
      "Each section below trains an `SGDClassifier` one batch at a time: a batch is read from disk, passed to `partial_fit`, and the final model is scored on data that was not used for training."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Write out some toy data: split the digits dataset into ten pickled batches.\n",
      "from sklearn.datasets import load_digits\n",
      "import cPickle\n",
      "\n",
      "digits = load_digits()\n",
      "\n",
      "X, y = digits.data, digits.target\n",
      "\n",
      "for i in range(10):\n",
      "    # Batch i takes every tenth sample starting at offset i.\n",
      "    # Protocol -1 selects a binary pickle format, so the file is opened in\n",
      "    # binary mode; the with-block closes the handle after each dump.\n",
      "    with open(\"data/batch_%02d.pickle\" % i, \"wb\") as f:\n",
      "        cPickle.dump((X[i::10], y[i::10]), f, -1)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.linear_model import SGDClassifier\n",
      "\n",
      "\n",
      "def load_batch(i):\n",
      "    \"\"\"Return the (X, y) tuple pickled in data/batch_<i>.pickle.\"\"\"\n",
      "    # Read in binary mode to match how the batches were written, and close\n",
      "    # the file as soon as the batch has been loaded.\n",
      "    with open(\"data/batch_%02d.pickle\" % i, \"rb\") as f:\n",
      "        return cPickle.load(f)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "sgd = SGDClassifier()\n",
      "\n",
      "# Train on batches 0-8 one at a time; batch 9 is held out for scoring.\n",
      "for i in range(9):\n",
      "    X_batch, y_batch = load_batch(i)\n",
      "    # classes hands partial_fit the complete label set (0-9) on every call.\n",
      "    sgd.partial_fit(X_batch, y_batch, classes=range(10))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "X_test, y_test = load_batch(9)\n",
      "\n",
      "sgd.score(X_test, y_test)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 4,
       "text": [
        "0.88268156424581001"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Text\n",
      "====="
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas as pd\n",
      "from sklearn.feature_extraction.text import HashingVectorizer\n",
      "\n",
      "sgd = SGDClassifier()\n",
      "hashing_vectorizer = HashingVectorizer()\n",
      "\n",
      "for i in range(10):\n",
      "    data_batch = pd.read_csv(\"data/train_%d.csv\" % i)\n",
      "    text_batch = data_batch.Comment.tolist()\n",
      "    y_batch = data_batch.Insult.values\n",
      "    # The hashing vectorizer is only ever asked to transform (never fit),\n",
      "    # so each batch is vectorized on its own.\n",
      "    X_batch = hashing_vectorizer.transform(text_batch)\n",
      "    # NOTE(review): classes=range(10) matches the digits example above; if\n",
      "    # Insult only takes the values 0 and 1, classes=[0, 1] would be enough\n",
      "    # -- confirm against the data.\n",
      "    sgd.partial_fit(X_batch, y_batch, classes=range(10))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "data_test = pd.read_csv(\"data/test_with_solutions.csv\")\n",
      "X_test = hashing_vectorizer.transform(data_test.Comment.tolist())\n",
      "y_test = data_test.Insult.values\n",
      "sgd.score(X_test, y_test)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 6,
       "text": [
        "0.82546278806195694"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Kernel Approximations\n",
      "======================="
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.kernel_approximation import RBFSampler\n",
      "\n",
      "sgd = SGDClassifier()\n",
      "kernel_approximation = RBFSampler(gamma=.001, n_components=400)\n",
      "\n",
      "for i in range(9):\n",
      "    X_batch, y_batch = load_batch(i)\n",
      "    if i == 0:\n",
      "        # Fit the sampler once, on the first batch only; the same fitted\n",
      "        # transform is then applied to every batch and to the test data.\n",
      "        kernel_approximation.fit(X_batch)\n",
      "    X_transformed = kernel_approximation.transform(X_batch)\n",
      "    sgd.partial_fit(X_transformed, y_batch, classes=range(10))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "X_test, y_test = load_batch(9)\n",
      "\n",
      "sgd.score(kernel_approximation.transform(X_test), y_test)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 8,
       "text": [
        "0.94413407821229045"
       ]
      }
     ],
     "prompt_number": 8
    }
   ],
   "metadata": {}
  }
 ]
}