{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Bagging trees on Titanic\n", "\n", "\n", "Modèle utilisé : `sklearn.tree.DecisionTreeClassifier`\n", "\n", "* Créer les train et test sets\n", "* comme baseline : arbre de décision simple, not pruned, quelle accuracy sur le test set ?\n", "* maintenant prendre 20 arbres, en limitant la taille à 2 niveaux\n", "* pour chaque arbre, prédire les probas des échantillons du test set\n", "* puis moyenner les probas et utiliser le résultat pour déterminer la classe prédite.\n", "* quelle accuracy sur le test set ?" ] }, { "cell_type": "code", "execution_count": 112, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import pandas as pd\n", "import numpy as np\n", "import re\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.metrics import confusion_matrix, classification_report\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import roc_auc_score" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1309, 12)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pclasssurvivednamesexagesibspparchticketfarecabinembarkedhome.dest
011Allen, Miss. Elisabeth Waltonfemale29.000024160211.3375B5SSt Louis, MO
111Allison, Master. Hudson Trevormale0.9212113781151.5500C22 C26SMontreal, PQ / Chesterville, ON
210Allison, Miss. Helen Lorainefemale2.0012113781151.5500C22 C26SMontreal, PQ / Chesterville, ON
310Allison, Mr. Hudson Joshua Creightonmale30.0012113781151.5500C22 C26SMontreal, PQ / Chesterville, ON
410Allison, Mrs. Hudson J C (Bessie Waldo Daniels)female25.0012113781151.5500C22 C26SMontreal, PQ / Chesterville, ON
\n", "
" ], "text/plain": [ " pclass survived name sex \\\n", "0 1 1 Allen, Miss. Elisabeth Walton female \n", "1 1 1 Allison, Master. Hudson Trevor male \n", "2 1 0 Allison, Miss. Helen Loraine female \n", "3 1 0 Allison, Mr. Hudson Joshua Creighton male \n", "4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female \n", "\n", " age sibsp parch ticket fare cabin embarked \\\n", "0 29.00 0 0 24160 211.3375 B5 S \n", "1 0.92 1 2 113781 151.5500 C22 C26 S \n", "2 2.00 1 2 113781 151.5500 C22 C26 S \n", "3 30.00 1 2 113781 151.5500 C22 C26 S \n", "4 25.00 1 2 113781 151.5500 C22 C26 S \n", "\n", " home.dest \n", "0 St Louis, MO \n", "1 Montreal, PQ / Chesterville, ON \n", "2 Montreal, PQ / Chesterville, ON \n", "3 Montreal, PQ / Chesterville, ON \n", "4 Montreal, PQ / Chesterville, ON " ] }, "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('../data/classification/titanic.csv', sep = ';')\n", "print(df.shape)\n", "df.sample(frac = 1)\n", "df.head()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Preprocessing\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 114, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Age\n", "df.loc[df.age.isnull(), 'age'] = np.mean(df.age)\n", "\n", "# Fare\n", "df.loc[df.fare.isnull(), 'fare'] = np.mean(df.fare)\n", "\n", "\n", "# Choix arbitraire\n", "df.loc[df.embarked.isnull(), 'embarked'] = 'C'\n", "\n", "# Choix arbitraire\n", "df.loc[df['home.dest'].isnull(), 'home.dest'] = 'Autre'\n", "\n", "\n", "# extract title\n", "df['title'] = df.name.apply(lambda x: re.search(' ([A-Z][a-z]+)\\.', x).group(1))\n", "\n", "for col in ['sex', 'embarked', 'home.dest', 'title']:\n", " le = LabelEncoder()\n", " df[col] = le.fit_transform(df[col])\n", " \n", "\n", "# drop useless columns \n", "df.drop(columns = ['name','cabin', 'ticket'], inplace = True)\n", "\n", "X = df.drop(columns = ['survived'], inplace = False)\n", "y = df.survived\n" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y" ] }, { "cell_type": "code", "execution_count": 117, "metadata": { "collapsed": true }, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Baseline\n", "\n", "comme baseline: arbre de decision simple, not pruned, quel accuracy sur le test set \n", "On voit bien l'overfitting" ] }, { "cell_type": "code", "execution_count": 118, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " -- ROC AUC Score test 0.7463\n", "\n", " -- ROC AUC Score train 0.9991\n" ] } ], "source": [ "clf = DecisionTreeClassifier()\n", "clf.fit(X_train, y_train)\n", "\n", "\n", "\n", "yhat_proba_test = clf.predict_proba(X_test)[:,1]\n", "yhat_proba_train = clf.predict_proba(X_train)[:,1]\n", "\n", "\n", "print(\"\\n -- ROC AUC Score test {:.4f}\".format(roc_auc_score(y_test, yhat_proba_test)))\n", "print(\"\\n -- ROC AUC Score train {:.4f}\".format(roc_auc_score(y_train, yhat_proba_train)))\n", "\n", "\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# N arbres max_depth = 2\n", "\n", "maintenant prendre 20 arbres, en limitant la taille a 2 niveaux\n", "\n", "A chaque iteration " ] }, { "cell_type": "code", "execution_count": 152, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " -- ROC AUC Score test Bagging 0.8608\n", "\n", " -- ROC AUC Score train 0.5000 0.0000 \n", "\n", " -- ROC AUC Score test 0.5000 0.0000 \n" ] } ], "source": [ "N = 200\n", "\n", "yhat_proba = [0] * len(y_test)\n", "\n", "roc_auc_train, roc_auc_test = [], []\n", "\n", "for n in range(N):\n", " idx = X_train.sample(frac = 0.2, replace = True).index\n", "\n", " clf = DecisionTreeClassifier(min_samples_leaf = 100)\n", "\n", " clf.fit(X_train.loc[idx], y_train[idx])\n", " \n", " yhat_proba_test += 
clf.predict_proba(X_test)[:,1]\n", " \n", " roc_auc_train.append(roc_auc_score(y_train, clf.predict_proba(X_train)[:,1] ))\n", " roc_auc_test.append(roc_auc_score(y_test, clf.predict_proba(X_test)[:,1] ))\n", "\n", "yhat_proba_test = yhat_proba_test / N \n", "\n", "\n", "print(\"\\n -- ROC AUC Score test Bagging {:.4f}\".format(roc_auc_score(y_test, yhat_proba_test)))\n", "print(\"\\n -- ROC AUC Score train {:.4f} {:.4f} \".format(np.mean(roc_auc_train), np.std(roc_auc_train)))\n", "print(\"\\n -- ROC AUC Score test {:.4f} {:.4f} \".format(np.mean(roc_auc_test), np.std(roc_auc_test)))\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", "fig, ax = plt.subplots(1,1)\n", "plt.boxplot(roc_auc_train);\n", "fig, ax = plt.subplots(1,1)\n", "plt.boxplot(roc_auc_test);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "![](https://gph.is/2c69qZB)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "![](https://gph.is/2c69qZB)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Random Forest" ] }, { "cell_type": "code", "execution_count": 174, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", " max_depth=3, max_features='auto', max_leaf_nodes=None,\n", " min_impurity_decrease=0.0, min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,\n", " oob_score=False, random_state=None, verbose=0,\n", " warm_start=False)\n", "\n", " -- ROC AUC Score test 0.8725\n", "\n", " -- ROC AUC Score train 0.8504\n" ] } ], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "\n", "clf = RandomForestClassifier(max_depth = 3,n_estimators=100, bootstrap=True )\n", "\n", "clf.fit(X_train, y_train)\n", "print(clf)\n", "yhat_proba_test = 
clf.predict_proba(X_test)[:,1]\n", "yhat_proba_train = clf.predict_proba(X_train)[:,1]\n", "\n", "print(\"\\n -- ROC AUC Score test {:.4f}\".format(roc_auc_score(y_test, yhat_proba_test)))\n", "print(\"\\n -- ROC AUC Score train {:.4f}\".format(roc_auc_score(y_train, yhat_proba_train)))\n" ] }, { "cell_type": "code", "execution_count": 175, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0.13569291, 0.43813153, 0.03770966, 0.02240128, 0.01700829,\n", " 0.10011347, 0.01994288, 0.0378396 , 0.19116038])" ] }, "execution_count": 175, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clf.feature_importances_" ] }, { "cell_type": "code", "execution_count": 176, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked',\n", " 'home.dest', 'title'],\n", " dtype='object')" ] }, "execution_count": 176, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 2 }