{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Vowpal Wabbit scikit-learn wrapper examples\n",
    "\n",
    "Short examples of the `sklearn_vw` estimators: binary classification with `VWClassifier`, a randomized hyper-parameter search, and linear regression with `VWRegressor`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from operator import itemgetter\n",
    "from time import time\n",
    "\n",
    "import numpy as np\n",
    "from scipy.stats.distributions import uniform\n",
    "from sklearn import datasets\n",
    "\n",
    "# scikit-learn 0.18 moved these helpers to sklearn.model_selection and\n",
    "# 0.20 removed the old modules; fall back for older installations.\n",
    "try:\n",
    "    from sklearn.model_selection import RandomizedSearchCV, train_test_split\n",
    "except ImportError:\n",
    "    from sklearn.grid_search import RandomizedSearchCV\n",
    "    from sklearn.cross_validation import train_test_split\n",
    "\n",
    "from sklearn_vw import VWClassifier, VWRegressor"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Binary classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training score: 0.992\n",
      "testing score: 0.49\n"
     ]
    }
   ],
   "source": [
    "# get some data\n",
    "X, y = datasets.make_hastie_10_2(n_samples=10000, random_state=1)\n",
    "X = X.astype(np.float32)\n",
    "\n",
    "# split train and test set\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=256)\n",
    "\n",
    "# build vowpal wabbit model\n",
    "model = VWClassifier()\n",
    "model.fit(X_train, y_train)\n",
    "\n",
    "# evaluate\n",
    "print('training score: {}'.format(model.score(X_train, y_train)))\n",
    "print('testing score: {}'.format(model.score(X_test, y_test)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Randomized parameter search\n",
    "\n",
    "Reference: http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html#example-model-selection-randomized-search-py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def candidate_scores(search):\n",
    "    \"\"\"Return one (mean score, std, params) tuple per evaluated candidate.\n",
    "\n",
    "    Reads ``cv_results_`` when the fitted search provides it\n",
    "    (scikit-learn >= 0.18) and falls back to the older ``grid_scores_``.\n",
    "    \"\"\"\n",
    "    if hasattr(search, 'cv_results_'):\n",
    "        results = search.cv_results_\n",
    "        return list(zip(results['mean_test_score'],\n",
    "                        results['std_test_score'],\n",
    "                        results['params']))\n",
    "    return [(score.mean_validation_score,\n",
    "             np.std(score.cv_validation_scores),\n",
    "             score.parameters)\n",
    "            for score in search.grid_scores_]\n",
    "\n",
    "\n",
    "def report(scores, n_top=3):\n",
    "    \"\"\"Print the n_top candidates with the highest mean validation score.\n",
    "\n",
    "    scores: list of (mean score, std, params) tuples, as returned by\n",
    "    candidate_scores().\n",
    "    \"\"\"\n",
    "    top_scores = sorted(scores, key=itemgetter(0), reverse=True)[:n_top]\n",
    "    for i, (mean, std, parameters) in enumerate(top_scores):\n",
    "        print('Model with rank: {0}'.format(i + 1))\n",
    "        print('Mean validation score: {0:.3f} (std: {1:.3f})'.format(mean, std))\n",
    "        print('Parameters: {0}'.format(parameters))\n",
    "        print('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# candidate settings are sampled from these lists / distributions\n",
    "np.random.seed(0)\n",
    "n_iter = 20\n",
    "params = {'l2': uniform(0.0001, 0.01),\n",
    "          'l': [0.01, 0.1, 1.0],\n",
    "          'power_t': uniform()}\n",
    "\n",
    "# run the search on the training split only, so the held-out test set\n",
    "# stays unseen during model selection\n",
    "search = RandomizedSearchCV(VWClassifier(), param_distributions=params, n_iter=n_iter)\n",
    "start = time()\n",
    "search.fit(X_train, y_train)\n",
    "elapsed = time() - start\n",
    "\n",
    "scores = candidate_scores(search)\n",
    "print('Parameter search took %.2f seconds for %d candidate parameter settings.'\n",
    "      % (elapsed, len(scores)))\n",
    "report(scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training score: 0.594375\n",
      "testing score: 0.509\n"
     ]
    }
   ],
   "source": [
    "# evaluate with fixed hyper-parameters (hard-coded here, not read from `search`)\n",
    "model = VWClassifier(loss_function='logistic', l=0.01, l2=0.1)\n",
    "model.fit(X_train, y_train)\n",
    "\n",
    "print('training score: {}'.format(model.score(X_train, y_train)))\n",
    "print('testing score: {}'.format(model.score(X_test, y_test)))\n",
    "\n",
    "# cleanup\n",
    "del model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Linear regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "intercept: 145.317504883\n",
      "predictions: [ 191.21879578 168.42489624 186.91046143 193.12036133 172.59655762\n",
      " 138.17372131 172.12760925 113.33664703 153.11889648 199.30044556]\n",
      "training R2 score: 0.692918377646\n"
     ]
    }
   ],
   "source": [
    "# Load the diabetes dataset; separate names keep the classification\n",
    "# X / y from the cells above intact\n",
    "diabetes = datasets.load_diabetes()\n",
    "X_diabetes = diabetes.data\n",
    "y_diabetes = diabetes.target\n",
    "\n",
    "regressor = VWRegressor(l=100)\n",
    "regressor.fit(X_diabetes, y_diabetes)\n",
    "\n",
    "print('intercept: {}'.format(regressor.get_intercept()))\n",
    "print('predictions: {}'.format(regressor.predict(X_diabetes[:10])))\n",
    "print('training R2 score: {}'.format(regressor.score(X_diabetes, y_diabetes)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2.0
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}