{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bagging trees on Titanic\n",
    "\n",
    "\n",
    "sklearn.tree.DecisionTreeClassifier\n",
    "\n",
    "* Creer les train et test sets\n",
    "* comme baseline: arbre de decision simple, not pruned, quel accuracy sur le test set ?\n",
    "* maintenant prendre 20 arbres, en limitant la taille a 2 niveaux\n",
    "* pour chaque arbre, predire les probas des echantillons du test set\n",
    "* puis moyenner les proba et utiliser le resultat pour determiner la classe predite.\n",
    "* quel accuracy sur le test set ?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import re\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.metrics import confusion_matrix, classification_report\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import roc_auc_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1309, 12)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pclass</th>\n",
       "      <th>survived</th>\n",
       "      <th>name</th>\n",
       "      <th>sex</th>\n",
       "      <th>age</th>\n",
       "      <th>sibsp</th>\n",
       "      <th>parch</th>\n",
       "      <th>ticket</th>\n",
       "      <th>fare</th>\n",
       "      <th>cabin</th>\n",
       "      <th>embarked</th>\n",
       "      <th>home.dest</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Allen, Miss. Elisabeth Walton</td>\n",
       "      <td>female</td>\n",
       "      <td>29.00</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>24160</td>\n",
       "      <td>211.3375</td>\n",
       "      <td>B5</td>\n",
       "      <td>S</td>\n",
       "      <td>St Louis, MO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Allison, Master. Hudson Trevor</td>\n",
       "      <td>male</td>\n",
       "      <td>0.92</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>113781</td>\n",
       "      <td>151.5500</td>\n",
       "      <td>C22 C26</td>\n",
       "      <td>S</td>\n",
       "      <td>Montreal, PQ / Chesterville, ON</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Allison, Miss. Helen Loraine</td>\n",
       "      <td>female</td>\n",
       "      <td>2.00</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>113781</td>\n",
       "      <td>151.5500</td>\n",
       "      <td>C22 C26</td>\n",
       "      <td>S</td>\n",
       "      <td>Montreal, PQ / Chesterville, ON</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Allison, Mr. Hudson Joshua Creighton</td>\n",
       "      <td>male</td>\n",
       "      <td>30.00</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>113781</td>\n",
       "      <td>151.5500</td>\n",
       "      <td>C22 C26</td>\n",
       "      <td>S</td>\n",
       "      <td>Montreal, PQ / Chesterville, ON</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>Allison, Mrs. Hudson J C (Bessie Waldo Daniels)</td>\n",
       "      <td>female</td>\n",
       "      <td>25.00</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>113781</td>\n",
       "      <td>151.5500</td>\n",
       "      <td>C22 C26</td>\n",
       "      <td>S</td>\n",
       "      <td>Montreal, PQ / Chesterville, ON</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pclass  survived                                             name     sex  \\\n",
       "0       1         1                    Allen, Miss. Elisabeth Walton  female   \n",
       "1       1         1                   Allison, Master. Hudson Trevor    male   \n",
       "2       1         0                     Allison, Miss. Helen Loraine  female   \n",
       "3       1         0             Allison, Mr. Hudson Joshua Creighton    male   \n",
       "4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   \n",
       "\n",
       "     age  sibsp  parch  ticket      fare    cabin embarked  \\\n",
       "0  29.00      0      0   24160  211.3375       B5        S   \n",
       "1   0.92      1      2  113781  151.5500  C22 C26        S   \n",
       "2   2.00      1      2  113781  151.5500  C22 C26        S   \n",
       "3  30.00      1      2  113781  151.5500  C22 C26        S   \n",
       "4  25.00      1      2  113781  151.5500  C22 C26        S   \n",
       "\n",
       "                         home.dest  \n",
       "0                     St Louis, MO  \n",
       "1  Montreal, PQ / Chesterville, ON  \n",
       "2  Montreal, PQ / Chesterville, ON  \n",
       "3  Montreal, PQ / Chesterville, ON  \n",
       "4  Montreal, PQ / Chesterville, ON  "
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv('../data/classification/titanic.csv', sep = ';')\n",
    "print(df.shape)\n",
    "df.sample(frac = 1)\n",
    "df.head()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Age\n",
    "df.loc[df.age.isnull(), 'age'] = np.mean(df.age)\n",
    "\n",
    "# Fare\n",
    "df.loc[df.fare.isnull(), 'fare'] = np.mean(df.fare)\n",
    "\n",
    "\n",
    "# Choix arbitraire\n",
    "df.loc[df.embarked.isnull(), 'embarked'] = 'C'\n",
    "\n",
    "# Choix arbitraire\n",
    "df.loc[df['home.dest'].isnull(), 'home.dest'] = 'Autre'\n",
    "\n",
    "\n",
    "# extract title\n",
    "df['title'] = df.name.apply(lambda x: re.search(' ([A-Z][a-z]+)\\.', x).group(1))\n",
    "\n",
    "for col in ['sex', 'embarked', 'home.dest', 'title']:\n",
    "    le = LabelEncoder()\n",
    "    df[col] = le.fit_transform(df[col])\n",
    "    \n",
    "\n",
    "# drop useless columns    \n",
    "df.drop(columns = ['name','cabin', 'ticket'], inplace = True)\n",
    "\n",
    "X = df.drop(columns = ['survived'], inplace = False)\n",
    "y = df.survived\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Baseline\n",
    "\n",
    "comme baseline: arbre de decision simple, not pruned, quel accuracy sur le test set \n",
    "On voit bien l'overfitting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " -- ROC AUC Score test 0.7463\n",
      "\n",
      " -- ROC AUC Score train 0.9991\n"
     ]
    }
   ],
   "source": [
    "clf = DecisionTreeClassifier()\n",
    "clf.fit(X_train, y_train)\n",
    "\n",
    "\n",
    "\n",
    "yhat_proba_test  = clf.predict_proba(X_test)[:,1]\n",
    "yhat_proba_train = clf.predict_proba(X_train)[:,1]\n",
    "\n",
    "\n",
    "print(\"\\n -- ROC AUC Score test {:.4f}\".format(roc_auc_score(y_test, yhat_proba_test)))\n",
    "print(\"\\n -- ROC AUC Score train {:.4f}\".format(roc_auc_score(y_train, yhat_proba_train)))\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# N arbres max_depth = 2\n",
    "\n",
    "maintenant prendre 20 arbres, en limitant la taille a 2 niveaux\n",
    "\n",
    "A chaque iteration "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      " -- ROC AUC Score test Bagging 0.8608\n",
      "\n",
      " -- ROC AUC Score train 0.5000  0.0000  \n",
      "\n",
      " -- ROC AUC Score test 0.5000  0.0000 \n"
     ]
    }
   ],
   "source": [
    "N = 200\n",
    "\n",
    "yhat_proba = [0] * len(y_test)\n",
    "\n",
    "roc_auc_train, roc_auc_test = [], []\n",
    "\n",
    "for n in range(N):\n",
    "    idx = X_train.sample(frac = 0.2, replace = True).index\n",
    "\n",
    "    clf = DecisionTreeClassifier(min_samples_leaf = 100)\n",
    "\n",
    "    clf.fit(X_train.loc[idx], y_train[idx])\n",
    "    \n",
    "    yhat_proba_test += clf.predict_proba(X_test)[:,1]\n",
    "    \n",
    "    roc_auc_train.append(roc_auc_score(y_train, clf.predict_proba(X_train)[:,1] ))\n",
    "    roc_auc_test.append(roc_auc_score(y_test, clf.predict_proba(X_test)[:,1] ))\n",
    "\n",
    "yhat_proba_test = yhat_proba_test / N    \n",
    "\n",
    "\n",
    "print(\"\\n -- ROC AUC Score test Bagging {:.4f}\".format(roc_auc_score(y_test, yhat_proba_test)))\n",
    "print(\"\\n -- ROC AUC Score train {:.4f}  {:.4f}  \".format(np.mean(roc_auc_train), np.std(roc_auc_train)))\n",
    "print(\"\\n -- ROC AUC Score test {:.4f}  {:.4f} \".format(np.mean(roc_auc_test), np.std(roc_auc_test)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "fig, ax = plt.subplots(1,1)\n",
    "plt.boxplot(roc_auc_train);\n",
    "fig, ax = plt.subplots(1,1)\n",
    "plt.boxplot(roc_auc_test);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![](https://gph.is/2c69qZB)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "![](https://gph.is/2c69qZB)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
      "            max_depth=3, max_features='auto', max_leaf_nodes=None,\n",
      "            min_impurity_decrease=0.0, min_impurity_split=None,\n",
      "            min_samples_leaf=1, min_samples_split=2,\n",
      "            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,\n",
      "            oob_score=False, random_state=None, verbose=0,\n",
      "            warm_start=False)\n",
      "\n",
      " -- ROC AUC Score test 0.8725\n",
      "\n",
      " -- ROC AUC Score train 0.8504\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "clf = RandomForestClassifier(max_depth = 3,n_estimators=100, bootstrap=True )\n",
    "\n",
    "clf.fit(X_train, y_train)\n",
    "print(clf)\n",
    "yhat_proba_test = clf.predict_proba(X_test)[:,1]\n",
    "yhat_proba_train = clf.predict_proba(X_train)[:,1]\n",
    "\n",
    "print(\"\\n -- ROC AUC Score test {:.4f}\".format(roc_auc_score(y_test, yhat_proba_test)))\n",
    "print(\"\\n -- ROC AUC Score train {:.4f}\".format(roc_auc_score(y_train, yhat_proba_train)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.13569291,  0.43813153,  0.03770966,  0.02240128,  0.01700829,\n",
       "        0.10011347,  0.01994288,  0.0378396 ,  0.19116038])"
      ]
     },
     "execution_count": 175,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.feature_importances_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked',\n",
       "       'home.dest', 'title'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 176,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}