{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Titanic with Decision Tree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "code_folding": [],
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "DATA_HOME_DIR = \"/home/tsu-nera/repo/kaggle/input/titanic/\"\n",
    "row_data = pd.read_csv(DATA_HOME_DIR + 'train.csv', index_col=0)\n",
    "test_data = pd.read_csv(DATA_HOME_DIR + 'test.csv', index_col=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Preprocess"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1309, 6), (891, 1))"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_ind = test_data.index\n",
    "\n",
    "train_X = row_data[['Pclass','Sex','Age','SibSp','Parch','Cabin']]\n",
    "train_y = row_data[['Survived']]\n",
    "test_X= test_data[['Pclass','Sex','Age','SibSp','Parch', 'Cabin']]\n",
    "\n",
    "all_data = pd.concat([train_X, test_X])\n",
    "\n",
    "all_data.shape, train_y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "## クラスごとに分割\n",
    "Pclass = pd.get_dummies(all_data['Pclass'])\n",
    "Pclass.columns=['1st','2nd','3rd']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "## 女性、男性、子供ごとに分割\n",
    "Sex = pd.get_dummies(all_data['Sex'])\n",
    "\n",
    "def male_female_child(passenger):\n",
    "    age,sex = passenger\n",
    "    if np.isnan(age):\n",
    "        age = 30\n",
    "    if age < 16:\n",
    "        return 'child'\n",
    "    else:\n",
    "        return sex\n",
    "\n",
    "Person = all_data[['Age','Sex']].apply(male_female_child,axis=1)\n",
    "Person = pd.get_dummies(Person)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# 独身かそうでないかで分類\n",
    "Alone = all_data.Parch + all_data.SibSp\n",
    "\n",
    "def is_alone(alone):\n",
    "    if alone > 0:\n",
    "        return 0\n",
    "    else:\n",
    "        return 1\n",
    "\n",
    "Alone = Alone.apply(is_alone)\n",
    "Alone = pd.DataFrame(Alone)\n",
    "Alone.columns = ['Alone']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {
    "code_folding": [],
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def get_level(deck):\n",
    "    if pd.isnull(deck):\n",
    "        deck = 'CXX'\n",
    "    return deck[0]\n",
    "\n",
    "Level = all_data.Cabin.apply(get_level)\n",
    "Level = pd.get_dummies(Level)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {
    "code_folding": [],
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((891, 15), (891,), (418, 15))"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "merge_data = pd.merge(Alone,Pclass,right_index=True,left_index=True)\n",
    "merge_data = pd.merge(merge_data,Person,right_index=True,left_index=True)\n",
    "merge_data = pd.merge(merge_data,Level,right_index=True,left_index=True)\n",
    "\n",
    "X = merge_data[:train_X.shape[0]]\n",
    "y = train_y.values.ravel()\n",
    "\n",
    "test_X = merge_data[train_X.shape[0]:]\n",
    "\n",
    "X.shape, y.shape, test_X.shape\n",
    "# tx"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Build Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "code_folding": [],
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# create model\n",
    "from sklearn import tree\n",
    "clf = tree.DecisionTreeClassifier(random_state=17)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n",
       "            max_features=None, max_leaf_nodes=None,\n",
       "            min_impurity_split=1e-07, min_samples_leaf=1,\n",
       "            min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "            presort=False, random_state=17, splitter='best')"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.fit(X, y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import cross_val_score\n",
    "scores = cross_val_score(clf, X, y, cv=5, scoring=\"accuracy\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.80248958520625846"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scores.mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate with KFold¶"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import KFold\n",
    "from sklearn import metrics\n",
    "from sklearn.metrics import accuracy_score\n",
    "K = 5\n",
    "kf = KFold(n_splits=K, shuffle=True, random_state=17)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "score_train_tmp = 0\n",
    "score_test_tmp = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X = np.array(X)\n",
    "y = np.array(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for train_index, test_index in kf.split(X):\n",
    "    X_train, X_test = X[train_index], X[test_index]\n",
    "    y_train, y_test = y[train_index], y[test_index]\n",
    "\n",
    "    # 構築データでモデル構築\n",
    "    clf.fit(X_train, y_train)\n",
    "\n",
    "    # 構築データの予測値\n",
    "    pred_train = clf.predict(X_train)\n",
    "\n",
    "    # 構築データのaccuracy\n",
    "    auccuracy = accuracy_score(pred_train, y_train)\n",
    "\n",
    "    #構築データのaccuracyを足していく\n",
    "    score_train_tmp+=auccuracy\n",
    "\n",
    "    #検証データの予測値\n",
    "    pred_test = clf.predict(X_test)\n",
    "\n",
    "    #検証データのaccuracy\n",
    "    auccuracy = accuracy_score(pred_test, y_test)\n",
    "\n",
    "    #検証データのaccuracyを足していく\n",
    "    score_test_tmp+=auccuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.82463676190176005"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "score_train_tmp/K"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.80247944259619608"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "score_test_tmp/K"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## GridSearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best Model Parameter:  {'criterion': 'gini', 'max_depth': 6, 'max_features': 'log2', 'min_samples_leaf': 8, 'min_samples_split': 2}\n",
      "Best Model Score    :  0.812570145903\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "# use a full grid over all parameters\n",
    "param_grid = {\"max_depth\": [2,4,6,8,10],\n",
    "              \"max_features\": ['log2', 'sqrt','auto'],\n",
    "              \"min_samples_split\": [2, 3, 5],\n",
    "              \"min_samples_leaf\": [1,5,8],\n",
    "              \"criterion\": [\"gini\", \"entropy\"]}\n",
    "\n",
    "tree_grid = GridSearchCV(estimator=clf,\n",
    "                 param_grid = param_grid,   \n",
    "                 scoring=\"accuracy\",  #metrics\n",
    "                 cv = K,              #cross-validation\n",
    "                 n_jobs =-1)          #number of core\n",
    "\n",
    "tree_grid.fit(X,y) #fit\n",
    "\n",
    "tree_grid_best = tree_grid.best_estimator_ #best estimator\n",
    "print(\"Best Model Parameter: \",tree_grid.best_params_)\n",
    "print(\"Best Model Score    : \",tree_grid.best_score_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model = tree.DecisionTreeClassifier(criterion=\"gini\", max_depth=6, max_features=\"log2\", min_samples_leaf=8, min_samples_split=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,\n",
       "            max_features='log2', max_leaf_nodes=None,\n",
       "            min_impurity_split=1e-07, min_samples_leaf=8,\n",
       "            min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "            presort=False, random_state=None, splitter='best')"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "p_survived = model.predict(test_X.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "submission = pd.DataFrame()\n",
    "submission['PassengerId'] = test_ind\n",
    "submission['Survived'] = p_survived"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "submission.to_csv('submission_1001_1.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  },
  "notify_time": "10",
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "navigate_num": "#000000",
    "navigate_text": "#333333",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700",
    "sidebar_border": "#EEEEEE",
    "wrapper_background": "#FFFFFF"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "90px",
    "width": "253px"
   },
   "navigate_menu": true,
   "number_sections": false,
   "sideBar": true,
   "threshold": 4,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": false,
   "widenNotebook": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}