{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# warningsを無視する\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5.1.1 欠損値の処理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_train = pd.read_csv(\"./titanic_csv/train.csv\")\n",
    "df_test = pd.read_csv(\"./titanic_csv/test.csv\")\n",
    "df_gender_submission = pd.read_csv(\"./titanic_csv/gender_submission.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import matplotlib as mpl\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.1.1.1 年齢 (Age)の補完"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "29.69911764705882"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train['Age'].mean() # 年齢の平均値を算出"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 'Age'の欠損値に30を代入する。\n",
    "df_train['Age'] = df_train['Age'].fillna(30)\n",
    "df_test['Age'] = df_test['Age'].fillna(30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>62</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Icard, Miss. Amelie</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>113572</td>\n",
       "      <td>80.0</td>\n",
       "      <td>B28</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>829</th>\n",
       "      <td>830</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Stone, Mrs. George Nelson (Martha Evelyn)</td>\n",
       "      <td>female</td>\n",
       "      <td>62.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>113572</td>\n",
       "      <td>80.0</td>\n",
       "      <td>B28</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass                                       Name  \\\n",
       "61            62         1       1                        Icard, Miss. Amelie   \n",
       "829          830         1       1  Stone, Mrs. George Nelson (Martha Evelyn)   \n",
       "\n",
       "        Sex   Age  SibSp  Parch  Ticket  Fare Cabin Embarked  \n",
       "61   female  38.0      0      0  113572  80.0   B28      NaN  \n",
       "829  female  62.0      0      0  113572  80.0   B28      NaN  "
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# df_trainでEmbarkedが欠損のデータを表示\n",
    "df_train[df_train['Embarked'].isnull()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>62</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Icard, Miss. Amelie</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>113572</td>\n",
       "      <td>80.0</td>\n",
       "      <td>B28</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>829</th>\n",
       "      <td>830</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Stone, Mrs. George Nelson (Martha Evelyn)</td>\n",
       "      <td>female</td>\n",
       "      <td>62.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>113572</td>\n",
       "      <td>80.0</td>\n",
       "      <td>B28</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass                                       Name  \\\n",
       "61            62         1       1                        Icard, Miss. Amelie   \n",
       "829          830         1       1  Stone, Mrs. George Nelson (Martha Evelyn)   \n",
       "\n",
       "        Sex   Age  SibSp  Parch  Ticket  Fare Cabin Embarked  \n",
       "61   female  38.0      0      0  113572  80.0   B28      NaN  \n",
       "829  female  62.0      0      0  113572  80.0   B28      NaN  "
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train[df_train['Ticket'] == '113572'] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [PassengerId, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked]\n",
       "Index: []"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test[df_test['Ticket'] == '113572']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>61</th>\n",
       "      <td>62</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Icard, Miss. Amelie</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>113572</td>\n",
       "      <td>80.0</td>\n",
       "      <td>B28</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>829</th>\n",
       "      <td>830</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Stone, Mrs. George Nelson (Martha Evelyn)</td>\n",
       "      <td>female</td>\n",
       "      <td>62.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>113572</td>\n",
       "      <td>80.0</td>\n",
       "      <td>B28</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass                                       Name  \\\n",
       "61            62         1       1                        Icard, Miss. Amelie   \n",
       "829          830         1       1  Stone, Mrs. George Nelson (Martha Evelyn)   \n",
       "\n",
       "        Sex   Age  SibSp  Parch  Ticket  Fare Cabin Embarked  \n",
       "61   female  38.0      0      0  113572  80.0   B28        C  \n",
       "829  female  62.0      0      0  113572  80.0   B28        C  "
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 欠損値を'C'で埋め、表示して確認\n",
    "df_train.loc[df_train['PassengerId'].isin([62, 830]), 'Embarked'] = 'C'\n",
    "df_train.loc[df_train['PassengerId'].isin([62, 830])]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.1.1.3 運賃 (Fare)の補完"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Fare</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Pclass</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>84.154687</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20.662183</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>13.675550</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             Fare\n",
       "Pclass           \n",
       "1       84.154687\n",
       "2       20.662183\n",
       "3       13.675550"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# PclassごとにFareの平均値を表示\n",
    "df_train[['Pclass','Fare']].groupby('Pclass').mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>152</th>\n",
       "      <td>1044</td>\n",
       "      <td>3</td>\n",
       "      <td>Storey, Mr. Thomas</td>\n",
       "      <td>male</td>\n",
       "      <td>60.5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3701</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Pclass                Name   Sex   Age  SibSp  Parch Ticket  \\\n",
       "152         1044       3  Storey, Mr. Thomas  male  60.5      0      0   3701   \n",
       "\n",
       "     Fare Cabin Embarked  \n",
       "152   NaN   NaN        S  "
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 欠損値があるレコードを確認\n",
    "df_test[df_test['Fare'].isnull()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>152</th>\n",
       "      <td>1044</td>\n",
       "      <td>3</td>\n",
       "      <td>Storey, Mr. Thomas</td>\n",
       "      <td>male</td>\n",
       "      <td>60.5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3701</td>\n",
       "      <td>13.67555</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Pclass                Name   Sex   Age  SibSp  Parch Ticket  \\\n",
       "152         1044       3  Storey, Mr. Thomas  male  60.5      0      0   3701   \n",
       "\n",
       "         Fare Cabin Embarked  \n",
       "152  13.67555   NaN        S  "
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test.loc[df_test['PassengerId'] == 1044, 'Fare'] = 13.675550\n",
    "df_test[df_test['PassengerId'] == 1044]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--df_trainの欠損値--\n",
      "PassengerId      0\n",
      "Survived         0\n",
      "Pclass           0\n",
      "Name             0\n",
      "Sex              0\n",
      "Age              0\n",
      "SibSp            0\n",
      "Parch            0\n",
      "Ticket           0\n",
      "Fare             0\n",
      "Cabin          687\n",
      "Embarked         0\n",
      "dtype: int64\n",
      "----------\n",
      "--df_testの欠損値--\n",
      "PassengerId      0\n",
      "Pclass           0\n",
      "Name             0\n",
      "Sex              0\n",
      "Age              0\n",
      "SibSp            0\n",
      "Parch            0\n",
      "Ticket           0\n",
      "Fare             0\n",
      "Cabin          327\n",
      "Embarked         0\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "print('--df_trainの欠損値--')\n",
    "print(df_train.isnull().sum()) # df_trainの欠損値を表示\n",
    "print('-'*10 )\n",
    "print('--df_testの欠損値--')\n",
    "print(df_test.isnull().sum()) # df_testの欠損値を表示"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5.1.2 カテゴリ変数への変換"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.1.2.1 Sex (性別)の変換"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "genders = {'male': 0, 'female': 1} # 辞書を作成 ",
    "\n",
    "# Sexをgendersを用いて変換\n",
    "df_train['Sex'] = df_train['Sex'].map(genders)\n",
    "df_test['Sex'] = df_test['Sex'].map(genders)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.1.2.2 Embarked (乗船した港)の変換"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# ダミー変数化\n",
    "df_train = pd.get_dummies(df_train, columns=['Embarked'])\n",
    "df_test = pd.get_dummies(df_test, columns = ['Embarked'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5.1.3 不要な列の削除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "df_train.drop(['Name', 'Cabin', 'Ticket'], axis=1, inplace=True)\n",
    "df_test.drop(['Name', 'Cabin', 'Ticket'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch     Fare  \\\n",
       "0            1         0       3    0  22.0      1      0   7.2500   \n",
       "1            2         1       1    1  38.0      1      0  71.2833   \n",
       "2            3         1       3    1  26.0      0      0   7.9250   \n",
       "3            4         1       1    1  35.0      1      0  53.1000   \n",
       "4            5         0       3    0  35.0      0      0   8.0500   \n",
       "\n",
       "   Embarked_C  Embarked_Q  Embarked_S  \n",
       "0         0.0         0.0         1.0  \n",
       "1         1.0         0.0         0.0  \n",
       "2         0.0         0.0         1.0  \n",
       "3         0.0         0.0         1.0  \n",
       "4         0.0         0.0         1.0  "
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5.2識別器に学習させて予測"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train = df_train.drop([\"PassengerId\", \"Survived\"], axis=1) # 不要な列を削除\n",
    "Y_train = df_train['Survived'] # Y_trainは、df_trainのSurvived列\n",
    "X_test  = df_test.drop('PassengerId', axis=1).copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "96.07 %\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "# ランダムフォレストのインスタンスを作成\n",
    "forest = RandomForestClassifier(random_state=1)\n",
    "\n",
    "# X_trainからY_trainを予測するように学習\n",
    "forest.fit(X_train,Y_train)\n",
    "\n",
    "# 正解率を表示\n",
    "acc_log = round(forest.score(X_train, Y_train) * 100, 2)\n",
    "print(round(acc_log,2,), '%')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.metrics import accuracy_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "79.12\n",
      "81.14\n",
      "81.14\n"
     ]
    }
   ],
   "source": [
    "# 3分割交差検証を指定し、インスタンス化\n",
    "skf = StratifiedKFold(n_splits=3)\n",
    "\n",
    "# skf.split(X_train.Ytrain)で、X_trainとY_trainを3分割し、交差検証をする\n",
    "for train_index, test_index in skf.split(X_train, Y_train):\n",
    "    X_cv_train = X_train.iloc[train_index]\n",
    "    X_cv_test = X_train.iloc[test_index]\n",
    "    y_cv_train = Y_train.iloc[train_index]\n",
    "    y_cv_test = Y_train.iloc[test_index]\n",
    "    forest = RandomForestClassifier(random_state=1)\n",
    "    forest.fit(X_cv_train, y_cv_train) # 学習\n",
    "    predictions = forest.predict(X_cv_test) # 予測\n",
    "    # acuuracyを表示\n",
    "    print(round(accuracy_score(y_cv_test,forest.predict(X_cv_test))*100,2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 学習と予測を行う\n",
    "forest = RandomForestClassifier(random_state=1)\n",
    "forest.fit(X_train, Y_train)\n",
    "Y_prediction = forest.predict(X_test)\n",
    "submission = pd.DataFrame({\n",
    "        'PassengerId': df_test['PassengerId'],\n",
    "        'Survived': Y_prediction\n",
    "    })\n",
    "submission.to_csv('submission.csv', index=False)\n"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  },
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "318px",
    "width": "252px"
   },
   "navigate_menu": true,
   "number_sections": false,
   "sideBar": true,
   "threshold": 4,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": true,
   "widenNotebook": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}