{ "cells": [ { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# warningsを無視する\n", "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5.3.2 連続変数の離散化" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# カーネルの場合\n", "#df_train = pd.read_csv('../input/train.csv')\n", "#df_test = pd.read_csv('../input/test.csv')\n", "\n", "# 本レポジトリの場合\n", "df_train = pd.read_csv(\"./titanic_csv/train.csv\")\n", "df_test = pd.read_csv(\"./titanic_csv/test.csv\")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "% matplotlib inline\n", "\n", "# 日本語表示用 ##カーネルでは日本語表示できない\n", "plt.rcParams[\"font.size\"] = 18\n", "plt.rcParams['font.family'] = 'IPAPGothic' \n", "\n", "# サイズの設定\n", "plt.rcParams['figure.figsize'] = (8.0, 6.0)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Embarkedの補完\n", "df_train.loc[df_train['PassengerId'].isin([62, 830]), 'Embarked'] = 'C'\n", "\n", "# Fareの補完\n", "df_test.loc[df_test['PassengerId'] == 1044, 'Fare'] = 13.675550\n", "\n", "#Age変換のための関数\n", "def impute_age(cols):\n", " Age = cols[0]\n", " Pclass = cols[1]\n", " \n", " if pd.isnull(Age): \n", " if Pclass == 1:\n", " return 39\n", " elif Pclass == 2:\n", " return 30\n", " else:\n", " return 25 \n", " else:\n", " return Age\n", "\n", "\n", "data = [df_train, df_test]\n", "for df in data:\n", " # Ageの補完\n", " df['Age'] = df[['Age','Pclass']].apply(impute_age, axis = 1) \n", "\n", " # 性別の変換\n", " df['Sex'] = df['Sex'].map({\"male\": 0, \"female\": 1})\n", " \n", " # Embarked\n", " df['Embarked'] = df['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)\n", "\n", "df_train = pd.get_dummies(df_train, columns = ['Embarked'])\n", "df_test = pd.get_dummies(df_test, columns = ['Embarked']) " ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": true }, "outputs": [], "source": [ "data = [df_train, df_test]\n", "for df in data: \n", " # Fareのカテゴリ変数化\n", " df.loc[ df['Fare'] <= 7.91, 'Fare'] = 0\n", " df.loc[(df['Fare'] > 7.91) & (df['Fare'] <= 14.454), 'Fare'] = 1\n", " df.loc[(df['Fare'] > 14.454) & (df['Fare'] <= 31), 'Fare'] = 2\n", " df.loc[ df['Fare'] > 31, 'Fare'] = 3\n", " df['Fare'] = df['Fare'].astype(int)\n", "\n", " # Ageのカテゴリ変数化\n", " df.loc[ df['Age'] <= 16, 'Age'] = 0\n", " df.loc[(df['Age'] > 16) & (df['Age'] <= 32), 'Age'] = 1\n", " df.loc[(df['Age'] > 32) & (df['Age'] <= 48), 'Age'] = 2\n", " df.loc[ df['Age'] > 48, 'Age'] = 3\n", " df['Age'] = df['Age'].astype(int)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df_train.drop(['Name', 'Cabin', 'Ticket'], axis=1, inplace=True)\n", "df_test.drop(['Name', 'Cabin', 'Ticket'], axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": true }, "outputs": [], "source": [ "X_train = df_train.drop([\"PassengerId\", \"Survived\"], axis=1) # 不要な列を削除\n", "Y_train = df_train['Survived'] # Y_trainは、df_trainのSurvived列\n", "X_test = df_test.drop('PassengerId', axis=1).copy()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "\n", "# 学習と予測を行う\n", "forest = RandomForestClassifier(random_state=1)\n", "forest.fit(X_train, Y_train)\n", "Y_prediction = forest.predict(X_test)\n", "submission = pd.DataFrame({\n", " 'PassengerId': df_test['PassengerId'],\n", " 'Survived': Y_prediction\n", " })\n", "submission.to_csv('submission.csv', index=False)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5.3.3 特徴量の重要度を確認する" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0.13924783, 0.41004894, 0.11409453, 0.09676177, 0.07999719,\n", " 0.10232023, 0.02503838, 0.02067357, 0.01181757])" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "forest.feature_importances_" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pclass 0.1392\n", "Sex 0.41\n", "Age 0.1141\n", "SibSp 0.0968\n", "Parch 0.08\n", "Fare 0.1023\n", "Embarked_0 0.025\n", "Embarked_1 0.0207\n", "Embarked_2 0.0118\n" ] } ], "source": [ "for i,k in zip(X_train.columns,forest.feature_importances_):\n", " print(i,round(k,4))" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" }, "toc": { "colors": { "hover_highlight": "#DAA520", "running_highlight": "#FF0000", "selected_highlight": "#FFD700" }, "moveMenuLeft": true, "nav_menu": { "height": "318px", "width": "252px" }, "navigate_menu": true, "number_sections": false, "sideBar": true, "threshold": 4, "toc_cell": false, "toc_section_display": "block", "toc_window_display": true, "widenNotebook": false } }, "nbformat": 4, "nbformat_minor": 1 }