{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Pipelining becomes powerful with GridSearchCV\n",
      "-----------------------------------------------"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.svm import LinearSVC\n",
      "from sklearn.pipeline import make_pipeline\n",
      "from sklearn.feature_extraction.text import TfidfVectorizer\n",
      "from sklearn.grid_search import GridSearchCV\n",
      "import numpy as np"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.datasets import load_iris\n",
      "from sklearn.cross_validation import train_test_split\n",
      "\n",
      "\n",
      "# Hold out a test set; the grid searches below are fit on the training part only.\n",
      "iris = load_iris()\n",
      "X, y = iris.data, iris.target\n",
      "X_train, X_test, y_train, y_test = train_test_split(X, y)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "The wrong way to do GridSearchCV with preprocessing:\n",
      "\n",
      "The scaler is fit on all of `X_train` before the search. If the search is then run on `X_preprocessed`, every validation fold has already influenced the scaling statistics."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.preprocessing import StandardScaler\n",
      "from sklearn.svm import SVC\n",
      "\n",
      "# Scaling happens once, outside cross-validation, on the whole training set.\n",
      "scaler = StandardScaler()\n",
      "X_preprocessed = scaler.fit_transform(X_train)\n",
      "param_grid = {'C': 10. ** np.arange(-3, 3), 'gamma': 10. ** np.arange(-3, 3)}\n",
      "\n",
      "# The search only wraps the SVC, so it cannot refit the scaler per fold.\n",
      "grid = GridSearchCV(SVC(), param_grid=param_grid, cv=5)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "The right way to do GridSearchCV with preprocessing:\n",
      "\n",
      "Put the scaler inside the pipeline so that the search treats scaling and classification as one estimator. Parameters of a pipeline step are addressed as `<step name>__<parameter>`; `make_pipeline` names each step after its lowercased class name."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.pipeline import make_pipeline\n",
      "\n",
      "# 'svc' is the step name make_pipeline derives from the SVC class.\n",
      "param_grid_pipeline = {'svc__C': 10. ** np.arange(-3, 3), 'svc__gamma': 10. ** np.arange(-3, 3)}\n",
      "\n",
      "scaler_pipe = make_pipeline(StandardScaler(), SVC())\n",
      "grid = GridSearchCV(scaler_pipe, param_grid=param_grid_pipeline, cv=5)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "grid.fit(X_train, y_train)\n",
      "print(grid.best_params_)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "{'svc__gamma': 0.10000000000000001, 'svc__C': 10.0}\n"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.pipeline import make_pipeline\n",
      "from sklearn.svm import SVC\n",
      "from sklearn.feature_selection import SelectKBest\n",
      "\n",
      "\n",
      "# The number of selected features (k) is searched together with the SVC parameters.\n",
      "param_grid = {'selectkbest__k': [1, 2, 3, 4], 'svc__C': 10. ** np.arange(-3, 3), 'svc__gamma': 10. ** np.arange(-3, 3)}\n",
      "\n",
      "# NOTE: despite its name, this pipeline contains no scaler; it chains SelectKBest and SVC.\n",
      "scaler_pipe = make_pipeline(SelectKBest(), SVC())\n",
      "grid = GridSearchCV(scaler_pipe, param_grid=param_grid, cv=5)\n",
      "grid.fit(X_train, y_train)\n",
      "print(grid.best_params_)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "{'svc__gamma': 0.10000000000000001, 'selectkbest__k': 4, 'svc__C': 10.0}\n"
       ]
      }
     ],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "text_pipe = make_pipeline(TfidfVectorizer(), LinearSVC())\n",
      "# Step names are the lowercased class names: 'tfidfvectorizer' and 'linearsvc'.\n",
      "param_grid = {'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)], 'linearsvc__C': 10. ** np.arange(-3, 3)}\n",
      "\n",
      "grid = GridSearchCV(text_pipe, param_grid=param_grid, cv=5)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 7
    }
   ],
   "metadata": {}
  }
 ]
}