{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "More process building tools!\n", "----------------------------" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Use multiple feature extractors on the same data and concatenate their results with `make_union` (which builds a `FeatureUnion`)." ] }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.pipeline import make_union, make_pipeline\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.svm import LinearSVC\n", "from sklearn.grid_search import GridSearchCV\n", "import numpy as np" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.datasets import fetch_20newsgroups" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "news = fetch_20newsgroups()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "data, y = news.data, news.target" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.cross_validation import train_test_split\n", "data_train, data_test, y_train, y_test = train_test_split(data, y)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "char_and_word = make_union(TfidfVectorizer(analyzer=\"char\"), TfidfVectorizer(analyzer=\"word\"))\n", "\n", "text_pipe = make_pipeline(char_and_word, LinearSVC(dual=False))\n", "param_grid = {'linearsvc__C': 10. 
** np.arange(-3, 3)}\n", "\n", "grid = GridSearchCV(text_pipe, param_grid=param_grid, cv=5, verbose=10)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "grid.fit(data_train, y_train)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Fitting 5 folds for each of 6 candidates, totalling 30 fits\n", "[CV] linearsvc__C=0.001 ..............................................\n", "[CV] ..................... linearsvc__C=0.001, score=0.650645 - 15.3s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=0.001 ..............................................\n", "[CV] ..................... linearsvc__C=0.001, score=0.638660 - 15.5s" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "[Parallel(n_jobs=1)]: Done 1 jobs | elapsed: 15.3s\n", "[Parallel(n_jobs=1)]: Done 2 jobs | elapsed: 30.8s\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=0.001 ..............................................\n", "[CV] ..................... linearsvc__C=0.001, score=0.643320 - 15.4s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=0.001 ..............................................\n", "[CV] ..................... linearsvc__C=0.001, score=0.623891 - 15.7s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=0.001 ..............................................\n", "[CV] ..................... linearsvc__C=0.001, score=0.627149 - 14.9s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=0.01 ...............................................\n", "[CV] ...................... 
linearsvc__C=0.01, score=0.811254 - 17.0s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=0.01 ...............................................\n", "[CV] ...................... linearsvc__C=0.01, score=0.801998 - 16.9s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=0.01 ...............................................\n", "[CV] ...................... linearsvc__C=0.01, score=0.805180 - 17.3s" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "[Parallel(n_jobs=1)]: Done 5 jobs | elapsed: 1.3min\n", "[Parallel(n_jobs=1)]: Done 8 jobs | elapsed: 2.1min\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=0.01 ...............................................\n", "[CV] ...................... linearsvc__C=0.01, score=0.781786 - 19.9s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=0.01 ...............................................\n", "[CV] ...................... linearsvc__C=0.01, score=0.802015 - 17.0s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=0.1 ................................................\n", "[CV] ....................... linearsvc__C=0.1, score=0.893318 - 20.8s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=0.1 ................................................\n", "[CV] ....................... linearsvc__C=0.1, score=0.901293 - 20.6s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=0.1 ................................................\n", "[CV] ....................... linearsvc__C=0.1, score=0.885815 - 19.6s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=0.1 ................................................\n", "[CV] ....................... 
linearsvc__C=0.1, score=0.884092 - 20.2s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=0.1 ................................................\n", "[CV] ....................... linearsvc__C=0.1, score=0.887374 - 21.2s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=1.0 ................................................\n", "[CV] ....................... linearsvc__C=1.0, score=0.915592 - 32.5s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=1.0 ................................................\n", "[CV] ....................... linearsvc__C=1.0, score=0.925969 - 24.1s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=1.0 ................................................\n", "[CV] ....................... linearsvc__C=1.0, score=0.916421 - 23.7s" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "[Parallel(n_jobs=1)]: Done 13 jobs | elapsed: 3.8min\n", "[Parallel(n_jobs=1)]: Done 18 jobs | elapsed: 5.8min\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=1.0 ................................................\n", "[CV] ....................... linearsvc__C=1.0, score=0.912478 - 22.4s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=1.0 ................................................\n", "[CV] ....................... linearsvc__C=1.0, score=0.911678 - 22.8s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=10.0 ...............................................\n", "[CV] ...................... linearsvc__C=10.0, score=0.913834 - 39.0s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=10.0 ...............................................\n", "[CV] ...................... 
linearsvc__C=10.0, score=0.918331 - 40.0s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=10.0 ...............................................\n", "[CV] ...................... linearsvc__C=10.0, score=0.908770 - 37.9s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=10.0 ...............................................\n", "[CV] ...................... linearsvc__C=10.0, score=0.912478 - 38.4s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=10.0 ...............................................\n", "[CV] ...................... linearsvc__C=10.0, score=0.912270 - 39.5s" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=100.0 ..............................................\n", "[CV] ..................... linearsvc__C=100.0, score=0.913247 - 1.3min" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=100.0 ..............................................\n", "[CV] ..................... linearsvc__C=100.0, score=0.917744 - 1.4min" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=100.0 ..............................................\n", "[CV] ..................... linearsvc__C=100.0, score=0.908770 - 1.3min" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=100.0 ..............................................\n", "[CV] ..................... linearsvc__C=100.0, score=0.910704 - 1.3min" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "[CV] linearsvc__C=100.0 ..............................................\n", "[CV] ..................... 
linearsvc__C=100.0, score=0.912863 - 1.4min" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "[Parallel(n_jobs=1)]: Done 25 jobs | elapsed: 9.8min\n", "[Parallel(n_jobs=1)]: Done 30 out of 30 | elapsed: 16.5min finished\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] }, { "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ "GridSearchCV(cv=5, error_score='raise',\n", " estimator=Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,\n", " transformer_list=[('tfidfvectorizer-1', TfidfVectorizer(analyzer='char', binary=False, decode_error=u'strict',\n", " dtype=, encoding=u'utf-8', input=u'content',\n", " lowercase=True, max_df=1.0, max_features=None, min_df=...2', max_iter=1000, multi_class='ovr',\n", " penalty='l2', random_state=None, tol=0.0001, verbose=0))]),\n", " fit_params={}, iid=True, loss_func=None, n_jobs=1,\n", " param_grid={'linearsvc__C': array([ 1.00000e-03, 1.00000e-02, 1.00000e-01, 1.00000e+00,\n", " 1.00000e+01, 1.00000e+02])},\n", " pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,\n", " verbose=10)" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "param_grid = {'featureunion__tfidfvectorizer-1__ngram_range': [(1, 3), (1, 5), (2, 5)],\n", " 'featureunion__tfidfvectorizer-2__ngram_range': [(1, 1), (1, 2), (2, 2)],\n", " 'linearsvc__C': 10. ** np.arange(-3, 3)}" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 } ], "metadata": {} } ] }