{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "More process building tools!\n",
      "----------------------------"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Use `FeatureUnion` (built here with `make_union`) to run multiple feature extractors on the same data and concatenate their results."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "\n",
      "from sklearn.pipeline import make_union, make_pipeline\n",
      "from sklearn.feature_extraction.text import TfidfVectorizer\n",
      "from sklearn.svm import LinearSVC\n",
      "\n",
      "# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20.\n",
      "# Prefer sklearn.model_selection and fall back only on old installations.\n",
      "try:\n",
      "    from sklearn.model_selection import GridSearchCV\n",
      "except ImportError:\n",
      "    from sklearn.grid_search import GridSearchCV"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.datasets import fetch_20newsgroups"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Load the training subset of the 20 newsgroups corpus (the loader's default subset).\n",
      "news = fetch_20newsgroups(subset=\"train\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Documents are the model inputs; the targets are the labels to predict.\n",
      "data = news.data\n",
      "y = news.target"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20.\n",
      "# Prefer sklearn.model_selection and fall back only on old installations.\n",
      "try:\n",
      "    from sklearn.model_selection import train_test_split\n",
      "except ImportError:\n",
      "    from sklearn.cross_validation import train_test_split\n",
      "\n",
      "# Fix the seed so the train/test split is the same on every run.\n",
      "data_train, data_test, y_train, y_test = train_test_split(data, y, random_state=0)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Two TF-IDF views of the same documents, concatenated by the union.\n",
      "# make_union names the steps 'tfidfvectorizer-1' (char) and 'tfidfvectorizer-2' (word),\n",
      "# so the order of the arguments matters for parameter names used later.\n",
      "char_and_word = make_union(\n",
      "    TfidfVectorizer(analyzer=\"char\"),\n",
      "    TfidfVectorizer(analyzer=\"word\")\n",
      ")\n",
      "text_pipe = make_pipeline(char_and_word, LinearSVC(dual=False))\n",
      "\n",
      "# Candidate values of C: 1e-3, 1e-2, ..., 1e2.\n",
      "param_grid = {'linearsvc__C': 10. ** np.arange(-3, 3)}\n",
      "\n",
      "# 5-fold cross-validated search over the whole pipeline; verbose=10 logs every fit.\n",
      "grid = GridSearchCV(estimator=text_pipe, param_grid=param_grid, cv=5, verbose=10)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "grid.fit(data_train, y_train)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Fitting 5 folds for each of 6 candidates, totalling 30 fits\n",
        "[CV] linearsvc__C=0.001 ..............................................\n",
        "[CV] ..................... linearsvc__C=0.001, score=0.650645 -  15.3s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=0.001 ..............................................\n",
        "[CV] ..................... linearsvc__C=0.001, score=0.638660 -  15.5s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": [
        "[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:   15.3s\n",
        "[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:   30.8s\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=0.001 ..............................................\n",
        "[CV] ..................... linearsvc__C=0.001, score=0.643320 -  15.4s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=0.001 ..............................................\n",
        "[CV] ..................... linearsvc__C=0.001, score=0.623891 -  15.7s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=0.001 ..............................................\n",
        "[CV] ..................... linearsvc__C=0.001, score=0.627149 -  14.9s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=0.01 ...............................................\n",
        "[CV] ...................... linearsvc__C=0.01, score=0.811254 -  17.0s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=0.01 ...............................................\n",
        "[CV] ...................... linearsvc__C=0.01, score=0.801998 -  16.9s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=0.01 ...............................................\n",
        "[CV] ...................... linearsvc__C=0.01, score=0.805180 -  17.3s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": [
        "[Parallel(n_jobs=1)]: Done   5 jobs       | elapsed:  1.3min\n",
        "[Parallel(n_jobs=1)]: Done   8 jobs       | elapsed:  2.1min\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=0.01 ...............................................\n",
        "[CV] ...................... linearsvc__C=0.01, score=0.781786 -  19.9s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=0.01 ...............................................\n",
        "[CV] ...................... linearsvc__C=0.01, score=0.802015 -  17.0s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=0.1 ................................................\n",
        "[CV] ....................... linearsvc__C=0.1, score=0.893318 -  20.8s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=0.1 ................................................\n",
        "[CV] ....................... linearsvc__C=0.1, score=0.901293 -  20.6s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=0.1 ................................................\n",
        "[CV] ....................... linearsvc__C=0.1, score=0.885815 -  19.6s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=0.1 ................................................\n",
        "[CV] ....................... linearsvc__C=0.1, score=0.884092 -  20.2s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=0.1 ................................................\n",
        "[CV] ....................... linearsvc__C=0.1, score=0.887374 -  21.2s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=1.0 ................................................\n",
        "[CV] ....................... linearsvc__C=1.0, score=0.915592 -  32.5s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=1.0 ................................................\n",
        "[CV] ....................... linearsvc__C=1.0, score=0.925969 -  24.1s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=1.0 ................................................\n",
        "[CV] ....................... linearsvc__C=1.0, score=0.916421 -  23.7s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": [
        "[Parallel(n_jobs=1)]: Done  13 jobs       | elapsed:  3.8min\n",
        "[Parallel(n_jobs=1)]: Done  18 jobs       | elapsed:  5.8min\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=1.0 ................................................\n",
        "[CV] ....................... linearsvc__C=1.0, score=0.912478 -  22.4s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=1.0 ................................................\n",
        "[CV] ....................... linearsvc__C=1.0, score=0.911678 -  22.8s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=10.0 ...............................................\n",
        "[CV] ...................... linearsvc__C=10.0, score=0.913834 -  39.0s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=10.0 ...............................................\n",
        "[CV] ...................... linearsvc__C=10.0, score=0.918331 -  40.0s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=10.0 ...............................................\n",
        "[CV] ...................... linearsvc__C=10.0, score=0.908770 -  37.9s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=10.0 ...............................................\n",
        "[CV] ...................... linearsvc__C=10.0, score=0.912478 -  38.4s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=10.0 ...............................................\n",
        "[CV] ...................... linearsvc__C=10.0, score=0.912270 -  39.5s"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=100.0 ..............................................\n",
        "[CV] ..................... linearsvc__C=100.0, score=0.913247 - 1.3min"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=100.0 ..............................................\n",
        "[CV] ..................... linearsvc__C=100.0, score=0.917744 - 1.4min"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=100.0 ..............................................\n",
        "[CV] ..................... linearsvc__C=100.0, score=0.908770 - 1.3min"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=100.0 ..............................................\n",
        "[CV] ..................... linearsvc__C=100.0, score=0.910704 - 1.3min"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "[CV] linearsvc__C=100.0 ..............................................\n",
        "[CV] ..................... linearsvc__C=100.0, score=0.912863 - 1.4min"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stderr",
       "text": [
        "[Parallel(n_jobs=1)]: Done  25 jobs       | elapsed:  9.8min\n",
        "[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 16.5min finished\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n"
       ]
      },
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 7,
       "text": [
        "GridSearchCV(cv=5, error_score='raise',\n",
        "       estimator=Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,\n",
        "       transformer_list=[('tfidfvectorizer-1', TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict',\n",
        "        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',\n",
        "        lowercase=True, max_df=1.0, max_features=None, min_df=...2', max_iter=1000, multi_class='ovr',\n",
        "     penalty='l2', random_state=None, tol=0.0001, verbose=0))]),\n",
        "       fit_params={}, iid=True, loss_func=None, n_jobs=1,\n",
        "       param_grid={'linearsvc__C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,\n",
        "         1.00000e+01,   1.00000e+02])},\n",
        "       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,\n",
        "       verbose=10)"
       ]
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Larger grid that also tunes the n-gram range of each vectorizer in the union.\n",
      "# 'tfidfvectorizer-1' is the char analyzer and 'tfidfvectorizer-2' the word analyzer.\n",
      "# NOTE(review): no visible cell passes this grid to a GridSearchCV; `grid` above\n",
      "# still holds the earlier, C-only grid.\n",
      "param_grid = {\n",
      "    'featureunion__tfidfvectorizer-1__ngram_range': [(1, 3), (1, 5), (2, 5)],\n",
      "    'featureunion__tfidfvectorizer-2__ngram_range': [(1, 1), (1, 2), (2, 2)],\n",
      "    'linearsvc__C': 10. ** np.arange(-3, 3)\n",
      "}"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 8
    }
   ],
   "metadata": {}
  }
 ]
}