{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# warningsを無視する\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 5.1.1 欠損値の処理"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Load train.csv, test.csv and gender_submission.csv from ./titanic_csv\n",
"# (the paths are relative to the notebook's working directory).\n",
"df_train, df_test, df_gender_submission = [\n",
"    pd.read_csv(csv_path)\n",
"    for csv_path in (\n",
"        \"./titanic_csv/train.csv\",\n",
"        \"./titanic_csv/test.csv\",\n",
"        \"./titanic_csv/gender_submission.csv\",\n",
"    )\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5.1.1.1 年齢 (Age)の補完"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"29.69911764705882"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train['Age'].mean() # compute the mean of the Age column in df_train"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Fill the missing 'Age' values with 30 in both the training and test frames.\n",
"for frame in (df_train, df_test):\n",
"    frame['Age'] = frame['Age'].fillna(30)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" PassengerId | \n",
" Survived | \n",
" Pclass | \n",
" Name | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Ticket | \n",
" Fare | \n",
" Cabin | \n",
" Embarked | \n",
"
\n",
" \n",
" \n",
" \n",
" 61 | \n",
" 62 | \n",
" 1 | \n",
" 1 | \n",
" Icard, Miss. Amelie | \n",
" female | \n",
" 38.0 | \n",
" 0 | \n",
" 0 | \n",
" 113572 | \n",
" 80.0 | \n",
" B28 | \n",
" NaN | \n",
"
\n",
" \n",
" 829 | \n",
" 830 | \n",
" 1 | \n",
" 1 | \n",
" Stone, Mrs. George Nelson (Martha Evelyn) | \n",
" female | \n",
" 62.0 | \n",
" 0 | \n",
" 0 | \n",
" 113572 | \n",
" 80.0 | \n",
" B28 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PassengerId Survived Pclass Name \\\n",
"61 62 1 1 Icard, Miss. Amelie \n",
"829 830 1 1 Stone, Mrs. George Nelson (Martha Evelyn) \n",
"\n",
" Sex Age SibSp Parch Ticket Fare Cabin Embarked \n",
"61 female 38.0 0 0 113572 80.0 B28 NaN \n",
"829 female 62.0 0 0 113572 80.0 B28 NaN "
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the df_train rows whose Embarked value is missing\n",
"df_train.loc[df_train['Embarked'].isnull()]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PassengerId | \n",
" Survived | \n",
" Pclass | \n",
" Name | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Ticket | \n",
" Fare | \n",
" Cabin | \n",
" Embarked | \n",
"
\n",
" \n",
" \n",
" \n",
" 61 | \n",
" 62 | \n",
" 1 | \n",
" 1 | \n",
" Icard, Miss. Amelie | \n",
" female | \n",
" 38.0 | \n",
" 0 | \n",
" 0 | \n",
" 113572 | \n",
" 80.0 | \n",
" B28 | \n",
" NaN | \n",
"
\n",
" \n",
" 829 | \n",
" 830 | \n",
" 1 | \n",
" 1 | \n",
" Stone, Mrs. George Nelson (Martha Evelyn) | \n",
" female | \n",
" 62.0 | \n",
" 0 | \n",
" 0 | \n",
" 113572 | \n",
" 80.0 | \n",
" B28 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PassengerId Survived Pclass Name \\\n",
"61 62 1 1 Icard, Miss. Amelie \n",
"829 830 1 1 Stone, Mrs. George Nelson (Martha Evelyn) \n",
"\n",
" Sex Age SibSp Parch Ticket Fare Cabin Embarked \n",
"61 female 38.0 0 0 113572 80.0 B28 NaN \n",
"829 female 62.0 0 0 113572 80.0 B28 NaN "
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show every df_train row that has ticket number '113572'\n",
"df_train.loc[df_train['Ticket'] == '113572']"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PassengerId | \n",
" Pclass | \n",
" Name | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Ticket | \n",
" Fare | \n",
" Cabin | \n",
" Embarked | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [PassengerId, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked]\n",
"Index: []"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show every df_test row that has ticket number '113572'\n",
"df_test.loc[df_test['Ticket'] == '113572']"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PassengerId | \n",
" Survived | \n",
" Pclass | \n",
" Name | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Ticket | \n",
" Fare | \n",
" Cabin | \n",
" Embarked | \n",
"
\n",
" \n",
" \n",
" \n",
" 61 | \n",
" 62 | \n",
" 1 | \n",
" 1 | \n",
" Icard, Miss. Amelie | \n",
" female | \n",
" 38.0 | \n",
" 0 | \n",
" 0 | \n",
" 113572 | \n",
" 80.0 | \n",
" B28 | \n",
" C | \n",
"
\n",
" \n",
" 829 | \n",
" 830 | \n",
" 1 | \n",
" 1 | \n",
" Stone, Mrs. George Nelson (Martha Evelyn) | \n",
" female | \n",
" 62.0 | \n",
" 0 | \n",
" 0 | \n",
" 113572 | \n",
" 80.0 | \n",
" B28 | \n",
" C | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PassengerId Survived Pclass Name \\\n",
"61 62 1 1 Icard, Miss. Amelie \n",
"829 830 1 1 Stone, Mrs. George Nelson (Martha Evelyn) \n",
"\n",
" Sex Age SibSp Parch Ticket Fare Cabin Embarked \n",
"61 female 38.0 0 0 113572 80.0 B28 C \n",
"829 female 62.0 0 0 113572 80.0 B28 C "
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Fill the Embarked value of passengers 62 and 830 with 'C', then display those rows to confirm\n",
"is_target_passenger = df_train['PassengerId'].isin([62, 830])\n",
"df_train.loc[is_target_passenger, 'Embarked'] = 'C'\n",
"df_train.loc[is_target_passenger]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5.1.1.3 運賃 (Fare)の補完"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Fare | \n",
"
\n",
" \n",
" Pclass | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 84.154687 | \n",
"
\n",
" \n",
" 2 | \n",
" 20.662183 | \n",
"
\n",
" \n",
" 3 | \n",
" 13.675550 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Fare\n",
"Pclass \n",
"1 84.154687\n",
"2 20.662183\n",
"3 13.675550"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the mean Fare for each Pclass\n",
"df_train.groupby('Pclass')[['Fare']].mean()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PassengerId | \n",
" Pclass | \n",
" Name | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Ticket | \n",
" Fare | \n",
" Cabin | \n",
" Embarked | \n",
"
\n",
" \n",
" \n",
" \n",
" 152 | \n",
" 1044 | \n",
" 3 | \n",
" Storey, Mr. Thomas | \n",
" male | \n",
" 60.5 | \n",
" 0 | \n",
" 0 | \n",
" 3701 | \n",
" NaN | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PassengerId Pclass Name Sex Age SibSp Parch Ticket \\\n",
"152 1044 3 Storey, Mr. Thomas male 60.5 0 0 3701 \n",
"\n",
" Fare Cabin Embarked \n",
"152 NaN NaN S "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check which df_test records have a missing Fare\n",
"df_test.loc[df_test['Fare'].isnull()]"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PassengerId | \n",
" Pclass | \n",
" Name | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Ticket | \n",
" Fare | \n",
" Cabin | \n",
" Embarked | \n",
"
\n",
" \n",
" \n",
" \n",
" 152 | \n",
" 1044 | \n",
" 3 | \n",
" Storey, Mr. Thomas | \n",
" male | \n",
" 60.5 | \n",
" 0 | \n",
" 0 | \n",
" 3701 | \n",
" 13.67555 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PassengerId Pclass Name Sex Age SibSp Parch Ticket \\\n",
"152 1044 3 Storey, Mr. Thomas male 60.5 0 0 3701 \n",
"\n",
" Fare Cabin Embarked \n",
"152 13.67555 NaN S "
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Set the Fare of passenger 1044 in df_test to 13.675550 (the value matches the\n",
"# Pclass 3 mean Fare shown by the groupby cell above), then display that row.\n",
"is_passenger_1044 = df_test['PassengerId'] == 1044\n",
"df_test.loc[is_passenger_1044, 'Fare'] = 13.675550\n",
"df_test[is_passenger_1044]"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--df_trainの欠損値--\n",
"PassengerId 0\n",
"Survived 0\n",
"Pclass 0\n",
"Name 0\n",
"Sex 0\n",
"Age 0\n",
"SibSp 0\n",
"Parch 0\n",
"Ticket 0\n",
"Fare 0\n",
"Cabin 687\n",
"Embarked 0\n",
"dtype: int64\n",
"----------\n",
"--df_testの欠損値--\n",
"PassengerId 0\n",
"Pclass 0\n",
"Name 0\n",
"Sex 0\n",
"Age 0\n",
"SibSp 0\n",
"Parch 0\n",
"Ticket 0\n",
"Fare 0\n",
"Cabin 327\n",
"Embarked 0\n",
"dtype: int64\n"
]
}
],
"source": [
"# Count the missing values per column in each frame, then print both reports.\n",
"train_missing = df_train.isnull().sum()\n",
"test_missing = df_test.isnull().sum()\n",
"\n",
"print('--df_trainの欠損値--')\n",
"print(train_missing)\n",
"print('-' * 10)\n",
"print('--df_testの欠損値--')\n",
"print(test_missing)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 5.1.2 カテゴリ変数への変換"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5.1.2.1 Sex (性別)の変換"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"genders = {'male': 0, 'female': 1}  # mapping from Sex label to integer code\n",
"\n",
"# Convert the Sex column of both frames using the genders mapping.\n",
"# NOTE: this cell is not idempotent. Re-running it maps the already-converted\n",
"# 0/1 values to NaN because they are not keys of genders; reload the CSVs first.\n",
"df_train['Sex'] = df_train['Sex'].map(genders)\n",
"df_test['Sex'] = df_test['Sex'].map(genders)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5.1.2.2 Embarked (乗船した港)の変換"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# ダミー変数化\n",
"df_train = pd.get_dummies(df_train, columns=['Embarked'])\n",
"df_test = pd.get_dummies(df_test, columns = ['Embarked'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 5.1.3 不要な列の削除"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Drop the columns that are no longer needed, in place, from both frames.\n",
"unused_columns = ['Name', 'Cabin', 'Ticket']\n",
"for frame in (df_train, df_test):\n",
"    frame.drop(unused_columns, axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PassengerId | \n",
" Survived | \n",
" Pclass | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Fare | \n",
" Embarked_C | \n",
" Embarked_Q | \n",
" Embarked_S | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 3 | \n",
" 0 | \n",
" 22.0 | \n",
" 1 | \n",
" 0 | \n",
" 7.2500 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 38.0 | \n",
" 1 | \n",
" 0 | \n",
" 71.2833 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 1 | \n",
" 3 | \n",
" 1 | \n",
" 26.0 | \n",
" 0 | \n",
" 0 | \n",
" 7.9250 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 35.0 | \n",
" 1 | \n",
" 0 | \n",
" 53.1000 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 0 | \n",
" 3 | \n",
" 0 | \n",
" 35.0 | \n",
" 0 | \n",
" 0 | \n",
" 8.0500 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PassengerId Survived Pclass Sex Age SibSp Parch Fare \\\n",
"0 1 0 3 0 22.0 1 0 7.2500 \n",
"1 2 1 1 1 38.0 1 0 71.2833 \n",
"2 3 1 3 1 26.0 0 0 7.9250 \n",
"3 4 1 1 1 35.0 1 0 53.1000 \n",
"4 5 0 3 0 35.0 0 0 8.0500 \n",
"\n",
" Embarked_C Embarked_Q Embarked_S \n",
"0 0.0 0.0 1.0 \n",
"1 1.0 0.0 0.0 \n",
"2 0.0 0.0 1.0 \n",
"3 0.0 0.0 1.0 \n",
"4 0.0 0.0 1.0 "
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_train.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 5.2 識別器に学習させて予測"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Build the model inputs from the preprocessed frames.\n",
"id_column, target_column = 'PassengerId', 'Survived'\n",
"\n",
"X_train = df_train.drop([id_column, target_column], axis=1)  # features: drop the ID and the target\n",
"Y_train = df_train[target_column]  # labels: the Survived column of df_train\n",
"X_test = df_test.drop(id_column, axis=1).copy()  # test features: drop the ID"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"96.07 %\n"
]
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"# Create the random forest classifier with a fixed random_state\n",
"forest = RandomForestClassifier(random_state=1)\n",
"\n",
"# Fit the model to predict Y_train from X_train\n",
"forest.fit(X_train, Y_train)\n",
"\n",
"# Accuracy measured on the training data itself (not a held-out estimate),\n",
"# expressed as a percentage rounded to two decimals\n",
"acc_random_forest = round(forest.score(X_train, Y_train) * 100, 2)\n",
"print(acc_random_forest, '%')"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"79.12\n",
"81.14\n",
"81.14\n"
]
}
],
"source": [
"# Create a stratified 3-fold cross-validation splitter\n",
"skf = StratifiedKFold(n_splits=3)\n",
"\n",
"# skf.split(X_train, Y_train) yields the train/test row positions of each of the 3 folds\n",
"for train_index, test_index in skf.split(X_train, Y_train):\n",
"    X_cv_train = X_train.iloc[train_index]\n",
"    X_cv_test = X_train.iloc[test_index]\n",
"    y_cv_train = Y_train.iloc[train_index]\n",
"    y_cv_test = Y_train.iloc[test_index]\n",
"    forest = RandomForestClassifier(random_state=1)\n",
"    forest.fit(X_cv_train, y_cv_train)  # train on this fold's training rows\n",
"    predictions = forest.predict(X_cv_test)  # predict this fold's held-out rows\n",
"    # Print the fold accuracy as a percentage, reusing the predictions computed above\n",
"    print(round(accuracy_score(y_cv_test, predictions) * 100, 2))"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Refit on the full training data (the cross-validation loop rebinds forest to a\n",
"# model trained on a single fold), predict the test set, and write the submission file.\n",
"forest = RandomForestClassifier(random_state=1)\n",
"forest.fit(X_train, Y_train)\n",
"Y_prediction = forest.predict(X_test)\n",
"\n",
"# Pass columns= explicitly so the CSV column order does not depend on dict ordering.\n",
"submission = pd.DataFrame(\n",
"    {'PassengerId': df_test['PassengerId'], 'Survived': Y_prediction},\n",
"    columns=['PassengerId', 'Survived']\n",
")\n",
"submission.to_csv('submission.csv', index=False)\n"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda root]",
"language": "python",
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
},
"toc": {
"colors": {
"hover_highlight": "#DAA520",
"running_highlight": "#FF0000",
"selected_highlight": "#FFD700"
},
"moveMenuLeft": true,
"nav_menu": {
"height": "318px",
"width": "252px"
},
"navigate_menu": true,
"number_sections": false,
"sideBar": true,
"threshold": 4,
"toc_cell": false,
"toc_section_display": "block",
"toc_window_display": true,
"widenNotebook": false
}
},
"nbformat": 4,
"nbformat_minor": 1
}