{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import f1_score\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.base import TransformerMixin,BaseEstimator\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "cats = ['alt.atheism', 'sci.space']\n",
    "newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)\n",
    "newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train = newsgroups_train.data\n",
    "X_test = newsgroups_test.data\n",
    "y_train = newsgroups_train.target\n",
    "y_test = newsgroups_test.target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class DenseTransformer(BaseEstimator,TransformerMixin):\n",
    "\n",
    "    def transform(self, X, y=None, **fit_params):\n",
    "        return X.todense()\n",
    "\n",
    "    def fit(self, X, y=None, **fit_params):\n",
    "        return self"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=3, error_score='raise',\n",
       "       estimator=Pipeline(memory=None,\n",
       "     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "        lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
       "        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,\n",
       " ...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,\n",
       "            splitter='best'))]),\n",
       "       fit_params=None, iid=True, n_jobs=1,\n",
       "       param_grid=[{'tfidf__max_df': [0.8, 0.9, 1.0]}],\n",
       "       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n",
       "       scoring='f1_micro', verbose=0)"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipeline = Pipeline([\n",
    "    ('tfidf',TfidfVectorizer()),\n",
    "    ('to_dense',DenseTransformer()),\n",
    "    ('pca',PCA()),\n",
    "    ('clf',DecisionTreeClassifier())\n",
    "])\n",
    "\n",
    "# this is where you define the values for\n",
    "# GridSearchCV to iterate over\n",
    "\n",
    "# l1 penalty is incompatible with other configs\n",
    "param_grid = [\n",
    "    {\n",
    "        'tfidf__max_df':[0.8,0.9,1.0]\n",
    "    }\n",
    "]\n",
    "\n",
    "# do 3-fold cross validation for each of the 6 possible\n",
    "# combinations of the parameter values above\n",
    "grid = GridSearchCV(pipeline, cv=3, param_grid=param_grid,scoring='f1_micro')\n",
    "grid.fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best: 0.946878 using {'tfidf__max_df': 0.9}\n",
      "0.917987 (0.051901) with: {'tfidf__max_df': 0.8}\n",
      "0.946878 (0.003989) with: {'tfidf__max_df': 0.9}\n",
      "0.945946 (0.009163) with: {'tfidf__max_df': 1.0}\n"
     ]
    }
   ],
   "source": [
    "# summarize results\n",
    "print(\"Best: %f using %s\" % (grid.best_score_, \n",
    "    grid.best_params_))\n",
    "means = grid.cv_results_['mean_test_score']\n",
    "stds = grid.cv_results_['std_test_score']\n",
    "params = grid.cv_results_['params']\n",
    "for mean, stdev, param in zip(means, stds, params):\n",
    "    print(\"%f (%f) with: %r\" % (mean, stdev, param))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# now train and predict test instances\n",
    "# using the best configs\n",
    "pipeline.set_params(clf__penalty='l2',vect__max_df=0.9,clf__dual=True)\n",
    "pipeline.fit(X_train,y_train)\n",
    "y_preds = pipeline.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.97615708274894808"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# calculate f1\n",
    "f1_score(y_test, y_preds, average='micro')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Global TF Kernel (Python 3)",
   "language": "python",
   "name": "global-tf-python-3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}