{ "cells": [
 { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Silence all library warnings so they do not clutter the cell outputs.\n", "import warnings\n", "warnings.filterwarnings('ignore')" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "# 5.1.1 欠損値の処理" ] },
 { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] },
 { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Load the Titanic CSV files from the local ./titanic_csv directory.\n", "df_train = pd.read_csv(\"./titanic_csv/train.csv\")\n", "df_test = pd.read_csv(\"./titanic_csv/test.csv\")\n", "df_gender_submission = pd.read_csv(\"./titanic_csv/gender_submission.csv\")" ] },
 { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## 5.1.1.1 年齢 (Age)の補完" ] },
 { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "29.69911764705882" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train['Age'].mean() # mean of the 'Age' column in the training data" ] },
 { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Fill missing 'Age' values with 30, approximately the mean age computed above (29.70).\n", "# One named constant keeps the training and test frames imputed with the same value.\n", "AGE_FILL_VALUE = 30\n", "df_train['Age'] = df_train['Age'].fillna(AGE_FILL_VALUE)\n", "df_test['Age'] = df_test['Age'].fillna(AGE_FILL_VALUE)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## 5.1.1.2 乗船した港 (Embarked)の補完" ] },
 { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
616211Icard, Miss. Ameliefemale38.00011357280.0B28NaN
82983011Stone, Mrs. George Nelson (Martha Evelyn)female62.00011357280.0B28NaN
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass Name \\\n", "61 62 1 1 Icard, Miss. Amelie \n", "829 830 1 1 Stone, Mrs. George Nelson (Martha Evelyn) \n", "\n", " Sex Age SibSp Parch Ticket Fare Cabin Embarked \n", "61 female 38.0 0 0 113572 80.0 B28 NaN \n", "829 female 62.0 0 0 113572 80.0 B28 NaN " ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# df_trainでEmbarkedが欠損のデータを表示\n", "df_train[df_train['Embarked'].isnull()]" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
616211Icard, Miss. Ameliefemale38.00011357280.0B28NaN
82983011Stone, Mrs. George Nelson (Martha Evelyn)female62.00011357280.0B28NaN
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass Name \\\n", "61 62 1 1 Icard, Miss. Amelie \n", "829 830 1 1 Stone, Mrs. George Nelson (Martha Evelyn) \n", "\n", " Sex Age SibSp Parch Ticket Fare Cabin Embarked \n", "61 female 38.0 0 0 113572 80.0 B28 NaN \n", "829 female 62.0 0 0 113572 80.0 B28 NaN " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train[df_train['Ticket'] == '113572'] " ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [PassengerId, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked]\n", "Index: []" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_test[df_test['Ticket'] == '113572']" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
616211Icard, Miss. Ameliefemale38.00011357280.0B28C
82983011Stone, Mrs. George Nelson (Martha Evelyn)female62.00011357280.0B28C
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass Name \\\n", "61 62 1 1 Icard, Miss. Amelie \n", "829 830 1 1 Stone, Mrs. George Nelson (Martha Evelyn) \n", "\n", " Sex Age SibSp Parch Ticket Fare Cabin Embarked \n", "61 female 38.0 0 0 113572 80.0 B28 C \n", "829 female 62.0 0 0 113572 80.0 B28 C " ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 欠損値を'C'で埋め、表示して確認\n", "df_train.loc[df_train['PassengerId'].isin([62, 830]), 'Embarked'] = 'C'\n", "df_train.loc[df_train['PassengerId'].isin([62, 830])]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5.1.1.3 運賃 (Fare)の補完" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Fare
Pclass
184.154687
220.662183
313.675550
\n", "
" ], "text/plain": [ " Fare\n", "Pclass \n", "1 84.154687\n", "2 20.662183\n", "3 13.675550" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# PclassごとにFareの平均値を表示\n", "df_train[['Pclass','Fare']].groupby('Pclass').mean()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
15210443Storey, Mr. Thomasmale60.5003701NaNNaNS
\n", "
" ], "text/plain": [ " PassengerId Pclass Name Sex Age SibSp Parch Ticket \\\n", "152 1044 3 Storey, Mr. Thomas male 60.5 0 0 3701 \n", "\n", " Fare Cabin Embarked \n", "152 NaN NaN S " ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 欠損値があるレコードを確認\n", "df_test[df_test['Fare'].isnull()]" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
15210443Storey, Mr. Thomasmale60.500370113.67555NaNS
\n", "
" ], "text/plain": [ " PassengerId Pclass Name Sex Age SibSp Parch Ticket \\\n", "152 1044 3 Storey, Mr. Thomas male 60.5 0 0 3701 \n", "\n", " Fare Cabin Embarked \n", "152 13.67555 NaN S " ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ],
 "source": [ "# Impute the single missing test-set Fare (PassengerId 1044, a 3rd-class passenger)\n", "# with the mean 3rd-class fare of the training data. The value is computed here\n", "# instead of being copied by hand from the table displayed earlier.\n", "pclass3_mean_fare = df_train.loc[df_train['Pclass'] == 3, 'Fare'].mean()\n", "df_test.loc[df_test['PassengerId'] == 1044, 'Fare'] = pclass3_mean_fare\n", "df_test[df_test['PassengerId'] == 1044]" ] },
 { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--df_trainの欠損値--\n", "PassengerId 0\n", "Survived 0\n", "Pclass 0\n", "Name 0\n", "Sex 0\n", "Age 0\n", "SibSp 0\n", "Parch 0\n", "Ticket 0\n", "Fare 0\n", "Cabin 687\n", "Embarked 0\n", "dtype: int64\n", "----------\n", "--df_testの欠損値--\n", "PassengerId 0\n", "Pclass 0\n", "Name 0\n", "Sex 0\n", "Age 0\n", "SibSp 0\n", "Parch 0\n", "Ticket 0\n", "Fare 0\n", "Cabin 327\n", "Embarked 0\n", "dtype: int64\n" ] } ], "source": [ "print('--df_trainの欠損値--')\n", "print(df_train.isnull().sum()) # number of missing values per column in df_train\n", "print('-'*10 )\n", "print('--df_testの欠損値--')\n", "print(df_test.isnull().sum()) # number of missing values per column in df_test" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "# 5.1.2 カテゴリ変数への変換" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## 5.1.2.1 Sex (性別)の変換" ] },
 { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Lookup table used to encode the 'Sex' column as integers.\n", "genders = {'male': 0, 'female': 1}\n", "\n", "# Replace the 'Sex' strings with their integer codes in both frames.\n", "# Run this cell once: mapping the already-encoded integers again yields NaN.\n", "df_train['Sex'] = df_train['Sex'].map(genders)\n", "df_test['Sex'] = df_test['Sex'].map(genders)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## 5.1.2.2 Embarked (乗船した港)の変換" ] },
 { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# One-hot encode 'Embarked' into Embarked_* indicator columns.\n", "df_train = pd.get_dummies(df_train, columns=['Embarked'])\n", "df_test = pd.get_dummies(df_test, columns=['Embarked'])\n", "\n", "# The two frames are encoded independently, so a port missing from one of them\n", "# would silently give the model different feature columns; fail loudly instead.\n", "assert list(df_train.columns.drop('Survived')) == list(df_test.columns)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "# 5.1.3 不要な列の削除" ] },
 { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Remove the columns that are not used from here on.\n", "# Reassign instead of using inplace=True so each frame's lineage stays explicit.\n", "df_train = df_train.drop(['Name', 'Cabin', 'Ticket'], axis=1)\n", "df_test = df_test.drop(['Name', 'Cabin', 'Ticket'], axis=1)" ] },
 { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassSexAgeSibSpParchFareEmbarked_CEmbarked_QEmbarked_S
0103022.0107.25000.00.01.0
1211138.01071.28331.00.00.0
2313126.0007.92500.00.01.0
3411135.01053.10000.00.01.0
4503035.0008.05000.00.01.0
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass Sex Age SibSp Parch Fare \\\n", "0 1 0 3 0 22.0 1 0 7.2500 \n", "1 2 1 1 1 38.0 1 0 71.2833 \n", "2 3 1 3 1 26.0 0 0 7.9250 \n", "3 4 1 1 1 35.0 1 0 53.1000 \n", "4 5 0 3 0 35.0 0 0 8.0500 \n", "\n", " Embarked_C Embarked_Q Embarked_S \n", "0 0.0 0.0 1.0 \n", "1 1.0 0.0 0.0 \n", "2 0.0 0.0 1.0 \n", "3 0.0 0.0 1.0 \n", "4 0.0 0.0 1.0 " ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.head()" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "# 5.2 識別器に学習させて予測" ] },
 { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Features: every column except the row identifier and the target.\n", "X_train = df_train.drop([\"PassengerId\", \"Survived\"], axis=1)\n", "# Target: the 'Survived' column of df_train.\n", "Y_train = df_train['Survived']\n", "# Test features: only the identifier is dropped. drop() already returns a new\n", "# frame, so no extra .copy() is needed.\n", "X_test = df_test.drop('PassengerId', axis=1)" ] },
 { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "96.07 %\n" ] } ], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "\n", "# Create the random forest with a fixed seed.\n", "forest = RandomForestClassifier(random_state=1)\n", "\n", "# Fit the model to predict Y_train from X_train.\n", "forest.fit(X_train, Y_train)\n", "\n", "# Accuracy (in percent) on the same rows the model was fitted on, so it is an\n", "# optimistic figure; the cross-validation below estimates held-out accuracy.\n", "train_accuracy = round(forest.score(X_train, Y_train) * 100, 2)\n", "print(train_accuracy, '%')" ] },
 { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.metrics import accuracy_score" ] },
 { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "79.12\n", "81.14\n", "81.14\n" ] } ], "source": [ "# Stratified 3-fold cross-validation splitter.\n", "skf = StratifiedKFold(n_splits=3)\n", "\n", "# skf.split(X_train, Y_train) yields the row positions of each training / held-out fold.\n", "for train_index, test_index in skf.split(X_train, Y_train):\n", "    X_cv_train = X_train.iloc[train_index]\n", "    X_cv_test = X_train.iloc[test_index]\n", "    y_cv_train = Y_train.iloc[train_index]\n", "    y_cv_test = Y_train.iloc[test_index]\n", "    # Fresh model per fold, kept separate from `forest` (fitted on all rows).\n", "    fold_forest = RandomForestClassifier(random_state=1)\n", "    fold_forest.fit(X_cv_train, y_cv_train)  # fit on the training folds\n", "    predictions = fold_forest.predict(X_cv_test)  # predict the held-out fold once\n", "    # Print the held-out accuracy in percent.\n", "    print(round(accuracy_score(y_cv_test, predictions) * 100, 2))" ] },
 { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Refit on the full training data, predict the test set and write the submission file.\n", "forest = RandomForestClassifier(random_state=1)\n", "forest.fit(X_train, Y_train)\n", "Y_prediction = forest.predict(X_test)\n", "submission = pd.DataFrame({\n", "    'PassengerId': df_test['PassengerId'],\n", "    'Survived': Y_prediction\n", "})\n", "submission.to_csv('submission.csv', index=False)\n" ] }
 ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" }, "toc": { "colors": { "hover_highlight": "#DAA520", "running_highlight": "#FF0000", "selected_highlight": "#FFD700" }, "moveMenuLeft": true, "nav_menu": { "height": "318px", "width": "252px" }, "navigate_menu": true, "number_sections": false, "sideBar": true, "threshold": 4, "toc_cell": false, "toc_section_display": "block", "toc_window_display": true, "widenNotebook": false } }, "nbformat": 4, "nbformat_minor": 1 }