{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.base import TransformerMixin,BaseEstimator\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "# binary task: keep only two of the 20 newsgroups categories\n",
    "cats = ['alt.atheism', 'sci.space']\n",
    "newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)\n",
    "newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# raw documents (X) and their category labels (y)\n",
    "X_train = newsgroups_train.data\n",
    "X_test = newsgroups_test.data\n",
    "y_train = newsgroups_train.target\n",
    "y_test = newsgroups_test.target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class DenseTransformer(BaseEstimator,TransformerMixin):\n",
    "    \"\"\"Stateless pipeline step that turns a scipy sparse matrix into a dense array.\n",
    "\n",
    "    Placed between TfidfVectorizer and PCA in the pipeline below,\n",
    "    presumably because PCA needs dense input here.\n",
    "    \"\"\"\n",
    "\n",
    "    def transform(self, X, y=None, **fit_params):\n",
    "        \"\"\"Return X as a dense numpy.ndarray; y and fit_params are ignored.\"\"\"\n",
    "        # toarray() yields a plain ndarray, whereas todense() returns the\n",
    "        # legacy numpy.matrix type that newer scikit-learn releases reject\n",
    "        return X.toarray()\n",
    "\n",
    "    def fit(self, X, y=None, **fit_params):\n",
    "        \"\"\"Nothing to learn; return self so the step can be chained.\"\"\"\n",
    "        return self"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=3, error_score='raise',\n",
       " estimator=Pipeline(memory=None,\n",
       " steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       " dtype=, encoding='utf-8', input='content',\n",
       " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
       " ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,\n",
       " ... min_weight_fraction_leaf=0.0, presort=False, random_state=None,\n",
       " splitter='best'))]),\n",
       " fit_params=None, iid=True, n_jobs=1,\n",
       " param_grid=[{'tfidf__max_df': [0.8, 0.9, 1.0]}],\n",
       " pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n",
       " scoring='f1_micro', verbose=0)"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# tf-idf features -> dense array -> PCA -> decision tree\n",
    "# NOTE(review): no random_state is set on PCA / DecisionTreeClassifier,\n",
    "# so scores may differ slightly between runs\n",
    "pipeline = Pipeline([\n",
    "    ('tfidf',TfidfVectorizer()),\n",
    "    ('to_dense',DenseTransformer()),\n",
    "    ('pca',PCA()),\n",
    "    ('clf',DecisionTreeClassifier())\n",
    "])\n",
    "\n",
    "# this is where you define the values for\n",
    "# GridSearchCV to iterate over\n",
    "\n",
    "# only the tf-idf max_df cut-off is tuned\n",
    "param_grid = [\n",
    "    {\n",
    "        'tfidf__max_df':[0.8,0.9,1.0]\n",
    "    }\n",
    "]\n",
    "\n",
    "# do 3-fold cross validation for each of the 3\n",
    "# candidate values above, scored by micro-averaged F1\n",
    "grid = GridSearchCV(pipeline, cv=3, param_grid=param_grid,scoring='f1_micro')\n",
    "grid.fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best: 0.946878 using {'tfidf__max_df': 0.9}\n",
      "0.917987 (0.051901) with: {'tfidf__max_df': 0.8}\n",
      "0.946878 (0.003989) with: {'tfidf__max_df': 0.9}\n",
      "0.945946 (0.009163) with: {'tfidf__max_df': 1.0}\n"
     ]
    }
   ],
   "source": [
    "# summarize results: best candidate first, then the mean (std)\n",
    "# cross-validated score of every candidate\n",
    "print(\"Best: %f using %s\" % (grid.best_score_, \n",
    "                             grid.best_params_))\n",
    "means = grid.cv_results_['mean_test_score']\n",
    "stds = grid.cv_results_['std_test_score']\n",
    "params = grid.cv_results_['params']\n",
    "for mean, stdev, param in zip(means, stds, params):\n",
    "    print(\"%f (%f) with: %r\" % (mean, stdev, param))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# now train and predict test instances\n",
    "# using the best configs found by the grid search above\n",
    "# (the previously hard-coded clf__penalty / vect__max_df / clf__dual\n",
    "# names do not exist in this pipeline, so set_params rejected them)\n",
    "pipeline.set_params(**grid.best_params_)\n",
    "pipeline.fit(X_train,y_train)\n",
    "y_preds = pipeline.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.97615708274894808"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# calculate f1\n",
    "# NOTE(review): the saved output of this cell has execution count 6,\n",
    "# i.e. it predates the pipeline defined above (counts 54-55);\n",
    "# re-run the notebook top to bottom to refresh it\n",
    "f1_score(y_test, y_preds, average='micro')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Global TF Kernel (Python 3)",
   "language": "python",
   "name": "global-tf-python-3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}