{ "cells": [ { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# imports: keep every import in this top cell so the notebook survives Restart & Run All\n", "import pandas as pd\n", "import numpy as np\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "# public import path: sklearn.linear_model.logistic is a private module (removed in 0.22)\n", "from sklearn.linear_model import LogisticRegression\n", "# sklearn.grid_search was deprecated in 0.18 and removed in 0.20; GridSearchCV lives in model_selection\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.model_selection import train_test_split, cross_val_score\n", "# recall_score added: it is used in the evaluation cell below\n", "from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score, recall_score\n", "\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's revise" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345678
0-0.330.6901100.800.88
1-0.330.9410100.810.31
2-0.330.5010001.0-10.50
3-0.330.7501101.0-10.38
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8\n", "0 -0.33 0.69 0 1 1 0 0.8 0 0.88\n", "1 -0.33 0.94 1 0 1 0 0.8 1 0.31\n", "2 -0.33 0.50 1 0 0 0 1.0 -1 0.50\n", "3 -0.33 0.75 0 1 1 0 1.0 -1 0.38" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# reading the data\n", "df = pd.read_csv(\"data/fertility_Diagnosis.txt\", delimiter=',', header=None)\n", "df.iloc[:4,0:9]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:9], df[9], test_size=0.1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 5 folds for each of 8 candidates, totalling 40 fits\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=5)]: Done 31 out of 40 | elapsed: 5.8s remaining: 1.6s\n", "[Parallel(n_jobs=5)]: Done 40 out of 40 | elapsed: 5.8s finished\n" ] }, { "data": { "text/plain": [ "GridSearchCV(cv=5, error_score='raise',\n", " estimator=Pipeline(steps=[('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", " verbose=0, warm_start=False))]),\n", " fit_params={}, iid=True, n_jobs=5,\n", " param_grid={'clf__C': (0.01, 0.1, 1, 10), 'clf__penalty': ('l1', 'l2')},\n", " pre_dispatch='2*n_jobs', refit=True, scoring='accuracy',\n", " verbose=True)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipeline = Pipeline([('clf', LogisticRegression())])\n", "\n", "parameters = {\n", " 'clf__penalty': ('l1', 'l2'),\n", " 'clf__C': (0.01, 0.1, 1, 10) \n", " }\n", "\n", "grid_search = GridSearchCV(pipeline, parameters, n_jobs=5, verbose=True, scoring='accuracy', cv = 5)\n", 
"grid_search.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Best score: 0.933\n", "Best parameters set:\n", "\tclf__C: 0.01\n", "\tclf__penalty: 'l1'\n" ] } ], "source": [ "print( 'Best score: %0.3f' % grid_search.best_score_)\n", "print( 'Best parameters set:')\n", "\n", "best_parameters = grid_search.best_estimator_.get_params()\n", "\n", "for param_name in sorted(parameters.keys()):\n", " print( '\\t%s: %r' % (param_name, best_parameters[param_name]))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.8\n", "Precision: 0.0\n", "Recall: 0.0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.\n", " 'precision', 'predicted', average, warn_for)\n" ] } ], "source": [ "# recall_score was never imported at the top of the notebook; import it here\n", "# so this cell runs on a fresh kernel without a NameError\n", "from sklearn.metrics import recall_score\n", "\n", "y_pred = grid_search.predict(X_test)\n", "\n", "# map the string labels to integers for the binary metrics: 'N' -> 2, anything else -> 1\n", "y_test = [2 if x=='N' else 1 for x in y_test]\n", "y_pred = [2 if x=='N' else 1 for x in y_pred]\n", "\n", "print( 'Accuracy:', accuracy_score(y_test, y_pred))\n", "print( 'Precision:', precision_score(y_test, y_pred))\n", "print( 'Recall:', recall_score(y_test, y_pred))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Multi-class classification \n", "\n", "The goal of multi-class classification\n", "is to assign an instance to one of the set of classes. scikit-learn uses a strategy\n", "called one-vs.-all, or one-vs.-the-rest, to support multi-class classification. One-vs.-\n", "all classification uses one binary classifier for each of the possible classes. 
The\n", "class that is predicted with the greatest confidence is assigned to the instance." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PhraseIdSentenceIdPhraseSentiment
011A series of escapades demonstrating the adage ...1
121A series of escapades demonstrating the adage ...2
231A series2
341A2
451series2
561of escapades demonstrating the adage that what...2
671of2
781escapades demonstrating the adage that what is...2
891escapades2
9101demonstrating the adage that what is good for ...2
\n", "
" ], "text/plain": [ " PhraseId SentenceId Phrase \\\n", "0 1 1 A series of escapades demonstrating the adage ... \n", "1 2 1 A series of escapades demonstrating the adage ... \n", "2 3 1 A series \n", "3 4 1 A \n", "4 5 1 series \n", "5 6 1 of escapades demonstrating the adage that what... \n", "6 7 1 of \n", "7 8 1 escapades demonstrating the adage that what is... \n", "8 9 1 escapades \n", "9 10 1 demonstrating the adage that what is good for ... \n", "\n", " Sentiment \n", "0 1 \n", "1 2 \n", "2 2 \n", "3 2 \n", "4 2 \n", "5 2 \n", "6 2 \n", "7 2 \n", "8 2 \n", "9 2 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movie = pd.read_csv(\"data/movie_train.tsv\", delimiter=\"\\t\")\n", "movie[:10]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "count 156060.000000\n", "mean 2.063578\n", "std 0.893832\n", "min 0.000000\n", "25% 2.000000\n", "50% 2.000000\n", "75% 3.000000\n", "max 4.000000\n", "Name: Sentiment, dtype: float64\n" ] } ], "source": [ "print(movie['Sentiment'].describe())" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2 79582\n", "3 32927\n", "1 27273\n", "4 9206\n", "0 7072\n", "Name: Sentiment, dtype: int64\n" ] } ], "source": [ "print(movie['Sentiment'].value_counts())" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 3 folds for each of 24 candidates, totalling 72 fits\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=2)]: Done 46 tasks | elapsed: 2.5min\n", "[Parallel(n_jobs=2)]: Done 72 out of 72 | elapsed: 5.7min finished\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Best score: 0.631\n", "Best parameters set:\n", "\tclf__C: 
10\n", "\tvect__max_df: 0.25\n", "\tvect__ngram_range: (1, 2)\n", "\tvect__use_idf: False\n", "Accuracy: 0.651159810329\n", "Confusion Matrix: [[ 740 1022 287 24 2]\n", " [ 526 3854 3568 289 10]\n", " [ 120 1869 19712 2057 82]\n", " [ 10 248 3740 5096 780]\n", " [ 4 11 248 1435 1084]]\n", "Classification Report: precision recall f1-score support\n", "\n", " 0 0.53 0.36 0.43 2075\n", " 1 0.55 0.47 0.51 8247\n", " 2 0.72 0.83 0.77 23840\n", " 3 0.57 0.52 0.54 9874\n", " 4 0.55 0.39 0.46 2782\n", "\n", "avg / total 0.64 0.65 0.64 46818\n", "\n" ] } ], "source": [ "def movie_rank():\n", " \"\"\"Grid-search a TF-IDF + logistic-regression pipeline on the movie-review\n", " sentiment data and print accuracy, confusion matrix and per-class metrics.\"\"\"\n", " pipeline = Pipeline([('vect', TfidfVectorizer(stop_words='english')),\n", " ('clf', LogisticRegression()) \n", " ])\n", " \n", " parameters = {'vect__max_df': (0.25, 0.5),\n", " 'vect__ngram_range': ((1, 1), (1, 2)),\n", " 'vect__use_idf': (True, False),\n", " 'clf__C': (0.1, 1, 10),}\n", " \n", " movie = pd.read_csv('data/movie_train.tsv', header=0, delimiter='\\t')\n", " # .values replaces the deprecated Series.as_matrix() (removed in pandas 1.0)\n", " X, y = movie['Phrase'], movie['Sentiment'].values\n", "\n", " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 19)\n", "\n", " grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1, scoring='accuracy')\n", " grid_search.fit(X_train, y_train)\n", " \n", " print( 'Best score: %0.3f' % grid_search.best_score_)\n", " print( 'Best parameters set:')\n", " best_parameters = grid_search.best_estimator_.get_params()\n", " for param_name in sorted(parameters.keys()):\n", " print( '\\t%s: %r' % (param_name, best_parameters[param_name]))\n", " \n", " predictions = grid_search.predict(X_test)\n", "\n", " print ('Accuracy:', accuracy_score(y_test, predictions))\n", " print ('Confusion Matrix:', confusion_matrix(y_test, predictions))\n", " print ('Classification Report:', classification_report(y_test, predictions))\n", "\n", "movie_rank()" ] }, { "cell_type": "markdown", 
"metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }