{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Titanic with Logistic Regression\n",
    "\n",
    "Builds indicator features (passenger class, man/woman/child, travelling alone, cabin deck) from the Titanic train and test CSVs, fits a logistic regression on the training rows, and writes the test-set predictions to `submission.csv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "code_folding": [],
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas import DataFrame\n",
    "from sklearn.linear_model import LogisticRegression"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Setup\n",
    "\n",
    "Set the `TITANIC_DATA_DIR` environment variable to the folder containing `train.csv` and `test.csv`; when it is unset the original author's local path is used."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Fallback location of the input CSVs (the original author's machine).\n",
    "DEFAULT_DATA_DIR = '/home/tsu-nera/repo/kaggle/input/titanic/'\n",
    "DATA_HOME_DIR = Path(os.environ.get('TITANIC_DATA_DIR', DEFAULT_DATA_DIR))\n",
    "\n",
    "# Raw columns that the engineered features are derived from.\n",
    "FEATURE_COLUMNS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin']\n",
    "\n",
    "# The first CSV column becomes the index; the test index is later written\n",
    "# out as the PassengerId column of the submission.\n",
    "train_data = pd.read_csv(DATA_HOME_DIR / 'train.csv', index_col=0)\n",
    "test_data = pd.read_csv(DATA_HOME_DIR / 'test.csv', index_col=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Preprocess"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((1309, 6), (891, 1))"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_ind = test_data.index\n",
    "\n",
    "train_X = train_data[FEATURE_COLUMNS]\n",
    "train_y = train_data[['Survived']]\n",
    "test_X = test_data[FEATURE_COLUMNS]\n",
    "\n",
    "# Stack train rows followed by test rows so that the dummy encodings below\n",
    "# produce the same columns for both sets.\n",
    "all_data = pd.concat([train_X, test_X])\n",
    "\n",
    "all_data.shape, train_y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# One indicator column per passenger class.\n",
    "# NOTE: the rename assumes get_dummies yields exactly three columns,\n",
    "# ordered 1, 2, 3.\n",
    "Pclass = pd.get_dummies(all_data['Pclass'])\n",
    "Pclass.columns = ['1st', '2nd', '3rd']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def male_female_child(passenger, default_age=30, child_age=16):\n",
    "    \"\"\"Label a passenger as 'child' or by their sex.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    passenger : sequence of (age, sex)\n",
    "        One row of the ['Age', 'Sex'] columns.\n",
    "    default_age : number, optional\n",
    "        Age substituted when the age is NaN.\n",
    "    child_age : number, optional\n",
    "        Passengers strictly younger than this are labelled 'child'.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    'child' when the (possibly substituted) age is below child_age,\n",
    "    otherwise the sex value unchanged.\n",
    "    \"\"\"\n",
    "    age, sex = passenger\n",
    "    if np.isnan(age):\n",
    "        age = default_age\n",
    "    return 'child' if age < child_age else sex\n",
    "\n",
    "\n",
    "# Indicator columns for child / female / male.\n",
    "Person = pd.get_dummies(\n",
    "    all_data[['Age', 'Sex']].apply(male_female_child, axis=1)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def is_alone(alone):\n",
    "    \"\"\"Return 0 when the relative count is positive, otherwise 1.\"\"\"\n",
    "    return 0 if alone > 0 else 1\n",
    "\n",
    "\n",
    "# Flag passengers travelling alone: relatives aboard = Parch + SibSp.\n",
    "family_size = all_data.Parch + all_data.SibSp\n",
    "Alone = family_size.apply(is_alone).to_frame('Alone')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "code_folding": [],
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "def get_level(deck):\n",
    "    \"\"\"Return the first character of a cabin string (the deck letter).\n",
    "\n",
    "    A missing cabin is replaced by 'CXX', so it is counted under 'C'\n",
    "    together with the cabins that really start with 'C'.\n",
    "    \"\"\"\n",
    "    if pd.isnull(deck):\n",
    "        deck = 'CXX'\n",
    "    return deck[0]\n",
    "\n",
    "\n",
    "# One indicator column per deck letter.\n",
    "Level = pd.get_dummies(all_data.Cabin.apply(get_level))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "code_folding": [],
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((891, 15), (891,), (418, 15))"
      ]
     },
     "execution_count": 94,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Every feature block was derived from all_data and shares its index, so a\n",
    "# column-wise concat lines the rows up one to one. Unlike chained index\n",
    "# merges it cannot multiply rows if an index value is ever duplicated.\n",
    "merge_data = pd.concat([Alone, Pclass, Person, Level], axis=1)\n",
    "\n",
    "# all_data was built as train rows followed by test rows: split by position.\n",
    "n_train = train_X.shape[0]\n",
    "X = merge_data.iloc[:n_train]\n",
    "y = train_y.values.ravel()\n",
    "X_test = merge_data.iloc[n_train:]\n",
    "\n",
    "assert len(X) == len(y), 'feature and label row counts differ'\n",
    "assert len(X_test) == len(test_ind), 'test features do not match test ids'\n",
    "\n",
    "X.shape, y.shape, X_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Build Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "code_folding": [],
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# C is the inverse of the regularisation strength in scikit-learn, so a\n",
    "# larger value means weaker regularisation.\n",
    "model = LogisticRegression(C=10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,\n",
       " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n",
       " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
       " verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.fit(X, y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Predict from the DataFrame (not .values) so the input has the same form\n",
    "# and column names as the data the model was fitted on.\n",
    "p_survived = model.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "submission = pd.DataFrame(\n",
    "    {'PassengerId': test_ind, 'Survived': p_survived},\n",
    "    columns=['PassengerId', 'Survived'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "submission.to_csv('submission.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  },
  "notify_time": "10",
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "navigate_num": "#000000",
    "navigate_text": "#333333",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700",
    "sidebar_border": "#EEEEEE",
    "wrapper_background": "#FFFFFF"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "90px",
    "width": "253px"
   },
   "navigate_menu": true,
   "number_sections": false,
   "sideBar": true,
   "threshold": 4,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": false,
   "widenNotebook": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}