{ "cells": [ { "cell_type": "code", "execution_count": 292, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Playing with the classic dataset, originally inspired by Andrey Lukyanenko's work at https://github.com/Erlemar/Erlemar.github.io/blob/master/Notebooks/Titanic.ipynb\n", "import os.path\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "%reload_ext autoreload\n", "%autoreload 2\n", "%matplotlib inline\n", "import seaborn as sns\n", "sns.set_style('whitegrid')" ] }, { "cell_type": "code", "execution_count": 293, "metadata": { "collapsed": true }, "outputs": [], "source": [ "base_path = 'E:\\\\Projects\\\\titanic\\\\data'\n", "# Train / test data - start with data exploration, then circle back with train/test split\n", "df_train = pd.read_csv(os.path.join(base_path, 'train.csv'))\n", "# Kaggle test data\n", "df_test = pd.read_csv(os.path.join(base_path, 'test.csv'))" ] }, { "cell_type": "code", "execution_count": 294, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S " ] }, "execution_count": 294, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.head()" ] }, { "cell_type": "code", "execution_count": 295, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([3, 1, 2], dtype=int64)" ] }, "execution_count": 295, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train['Pclass'].unique()" ] }, { "cell_type": "code", "execution_count": 296, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Does the number of family members a passenger have affect survivability? \n", "# e.g., would assistance from parents, spouse, etc. help improve your chances?\n", "\n", "def family(df):\n", " df['Family'] = df['SibSp'] + df['Parch']\n", " return None\n", "\n", "def family2(sz):\n", " # Courtesy https://www.cdc.gov/mmwr/preview/mmwrhtml/mm4847a1.htm , average family size around this time is \n", " # 3.5 children --> 5.5 \n", " if sz == 1:\n", " return 'single'\n", " if sz < 5:\n", " return 'small'\n", " if sz > 6:\n", " return 'large'\n", " return 'medium'" ] }, { "cell_type": "code", "execution_count": 297, "metadata": { "collapsed": false }, "outputs": [], "source": [ "family(df_train)\n", "family(df_test)\n", "df_train['Family'] = df_train['Family'].apply(lambda x: family2(x))\n", "df_test['Family'] = df_test['Family'].apply(lambda x: family2(x))" ] }, { "cell_type": "code", "execution_count": 298, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Pclass Family\n", "1 medium 0.500000\n", " single 0.728571\n", " small 0.584507\n", "2 medium 1.000000\n", " single 0.529412\n", " small 0.456376\n", "3 large 0.000000\n", " medium 0.137931\n", " single 0.350877\n", " small 0.242347\n", "Name: Survived, dtype: float64" ] }, "execution_count": 298, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train['Family'].value_counts()\n", "df_train.groupby(['Pclass', 'Family'])['Survived'].mean()" ] }, { "cell_type": "code", "execution_count": 299, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 299, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXEAAAD9CAYAAABKgkezAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAFO1JREFUeJzt3X9s1PXhx/HX567thP6gdgrSIdgiuBqGTWlazUo3FrRMw6qm0KK5DRXcLw9q3CgWWloFCpEVXRkgZEtmC2r5GZLvnEGENAVps3PgrCdGGEwoMygq7c22tHffPwwn5Udb8Xp37/b5SEzs5+4+n/fnPsczn3zu87mP5fP5fAIAGMkW6gEAAK4dEQcAgxFxADAYEQcAgxFxADBYRDAX5nK5grk4ABgwJk2adMXpQY24dPWBDARut1spKSmhHgauEdvPXAN92/W0A8zhFAAwGBEHAIP16XDK/fffr9jYWEnSqFGjlJ+fr2XLlslutysrK0tPPPGEvF6vysrKdOTIEUVFRWnp0qUaM2ZMvw4eAAa7XiPe3t4uSaqurvZPy83NVVVVlW6++WY9/vjjampq0qlTp9TR0aFXX31Vhw4d0ooVK7Ru3br+GzkAoPeIv//++/ryyy/16KOPqrOzU06nUx0dHRo9erQkKSsrS2+99ZbOnDmjyZMnS5JSU1P17rvv9u/IAQC9R/y6667TY489phkzZuj48eOaO3eu4uLi/I9HR0fro48+Umtrq2JiYvzT7Xa7Ojs7FRHRfRFutzuAww8vbW1tA3r9Bjq2n7kG87brNeJJSUkaM2aMLMtSUlKSYmNj9fnnn/sf93g8iouLU1tbmzwej3+61+u9LOCSBvRpQAP9NKeBju1nroG+7b7VKYZbt27VihUrJEkff/yxvvzySw0dOlT/+c9/5PP5VF9fr/T0dKWlpamurk6SdOjQIY0fPz5AwwcAXE2ve+J5eXl6+umnNWvWLFmWpeXLl8tms+l3v/udurq6lJWVpTvuuEM/+MEPtH//fhUUFMjn82n58uXfamAbNmzQgQMHZLPZZFmWnnzySU2YMOGa5rVs2TI98sgjSkxMvKbXP/nkkyooKFBmZuY1vR4A+kuvEY+KitIf/vCHy6bX1tZ2+9tms+mZZ54JyKA+/PBDvfnmm3r55ZdlWZbcbreKioq0a9eua5rfokWLAjIuAH1UNiyoiwv6gZSyL4K9xKsKy4t9EhIS1NzcrK1bt+rjjz9WSkqKtm7dKofDoaNHj0qSXn75ZVVVVenkyZOaPn26HA6HNm7cqJ/+9Ke6cLOi8vJy7d692/+6Bx98UCdPnpQkvfbaa1q6dKlaWlo0b948ORwOORwOHTlyRJK0adMm3X///Zo7d65OnDgRmjcCAHoRthFft26d3n77beXn52vatGnau3fvVZ9/5swZ/fnPf9bcuXN122236R//+Ic6OjrU2NioKVOm+J+Xl5ennTt3SpJ27NihmTNnav369brzzjtVXV2tZ599VmVlZWppadFLL72k2tparV27VufPn+/3dQaAaxH0H8DqixMnTigmJkYVFRWSpH/96196/PHHdcMNN/ifc/GtQUeNGqWoqChJ0syZM7Vjxw6dOXNGP/nJT7qdIfOzn/1Ms2bN0owZM9Ta2qrx48frgw8+0MGDB/Xaa69Jks6dO6djx47p1ltv9c9z4sSJ/b7OAHAtwnJP/MiRIyorK/NfLXrh1Mb4+HidOXNGkvTee+/5n2+zfb0ad911l9xut7Zt26a8vLxu842JidGECRNUUVGhBx98UJKUnJys2bNnq7q6Ws8//7ymT5+um2++WR9++KHa2trU1dU1aM8/BRD+wnJP/J577tHRo0c1Y8YMDR06VD6fTwsWLFBkZKSeeeYZjRw5UsOHD7/iay3LUk5Ojg4cOHDF326ZMWOG5syZ4z975le/+pUWLVqk2tpatba26oknnlBCQoLmz5+vgoICJSQkaMiQIf26vgBwrSzfxccl+pnL5eL3xBG22H4BFOSzU4IuyGen9NTOsDycAgDoGyIOAAYj4gBgMCIOAAYj4gBgsLA8xfCCWxb+X0Dnd3zFfQGdHwCEWlhHPNi4TygA03A45SJvvPGG/z6hTz31lP931AEgXBHxi7hcLu4TCsAoRPwiV7tPKACEKyJ+kZiYmD7dJxQAwgURvwj3CQVgmrDezQz2KYF33313QO8TCgD9LawjHmyBvE8oAAQDh1MAwGBEHAAMRsQBwGBEHAAMRsQBwGDhfXZKoO/TF+T74gFAf2NP/AoOHz4sh8MR6mEAQK/Ce088BDZu3Khdu3ZpyJAhoR4KAPSKPfFLjB49WlVVVaEeBgD0CRG/RE5ODj96BcAYRBwADEbEAcBg4X3cgFMCAaBH7IlfwahRo1RbWxvqYQBAr/oU8U8//VQ/+tGPdPToUZ04cUKzZs3SQw89pCVLlsjr9UqS1qxZo7y8PBUUFOidd97p10EDAL7Sa8TPnz+v0tJSXXfddZKkiooKFRYWavPmzfL5fNqzZ4+amprU2NioLVu2qLKyUuXl5f0+cABAHyK+cuVKFRQUaPjw4ZKkpqYmZWRkSJKys7N14MABuVwuZWVlybIsJSYmqqurS2fPnu3fkQMAev5ic/v27UpISNDkyZO1YcMGSZLP55NlWZKk6OhotbS0qLW1VfHx8f7XXZiekJBw2Tzdbncgxx9W2traBvT6DXRsv8BJCfUA+lk4fU56jPi2bdtkWZbeeustud1uFRUVddvD9ng8iouLu+wu8R6PR7GxsVecZ0rKwN28brd7QK/fQMf2Q18F+3Picrmu+liPh1M2bdqkmpoaVVdXKyUlRStXrlR2drYaGhokSXV1dUpPT1daWprq6+vl9XrV3Nwsr9d7xb1wAEBgfePzxIuKilRSUqLKykolJycrJydHdrtd6enpys/Pl9frVWlpaX+MFQBwiT5HvLq62v//NTU1lz3udDrldDoDMyoAQJ9wsQ8AGIyIA4DBiDgAGIyIA4DBiDgAGIyIA4DBiDgAGCy8bwoBwEi3tG0O9RD61fFQD+Ai7IkDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMEientCV1eXFi9erH//+9+y2+2qqKiQz+fTwoULZVmWxo0bpyVLlshms2nNmjXat2+fIiIiVFxcrIkTJwZjHQBg0Oo14nv37pUkvfLKK2poaPBHvLCwUJmZmSotLdWePXuUmJioxsZGbdmyRadPn5bT6dS2bdv6fQUAYDDrNeJTp07Vj3/8Y0lSc3OzbrjhBu3bt08ZGRmSpOzsbO3fv19JSUnKysqSZVlKTExUV1eXzp49q4SEhG7zc7vdgV+LMNHW1jag12+gY/uhr8Lpc9JrxCUpIiJCRUVF2r17t/74xz9q7969sixLkhQdHa2Wlha1trYqPj7e/5oL0y+NeEpKSgCHH17cbveAXr+Bju0XSMdCPYB+FezPicvluupjff5ic+XKlXr99ddVUlKi9vZ2/3SPx6O4uDjFxMTI4/F0mx4bG3uNQwYA9EWvEd+5c6defPFFSdKQIUNkWZYmTJighoYGSVJdXZ3S09OVlpam+vp6eb1eNTc3y+v1XrYXDgAIrF4Pp9xzzz16+umn9fDDD6uzs1PFxcUaO3asSkpKVFlZqeTkZOXk5Mhutys9PV35+fnyer0qLS0NxvgBYFDrNeJDhw7VCy+8cNn0mpqay6Y5nU45nc7AjAwA0Csu9gEAgxFxADAYEQcAgxFxADAYEQcAgxFxADAYEQcAgxFxADAYEQcAgxFxADAYEQcAgxFxADAYEQcAgxFxADAYEQcAgxFxADAYEQcAgxFxADAYEQcAgxFxADAYEQcAgxFxADAYEQcAgxFxADAYEQcAgxFxADAYEQcAgxFxADAYEQcAgxFxADBYRKgH0K/KhgV1cSlBXZqksi+CvUQAYYY9cQAwGBEHAIMRcQAwWI/HxM+fP6/i4mKdOnVKHR0d+vWvf61bb71VCxculGVZGjdunJYsWSKbzaY1a9Zo3759ioiIUHFxsSZOnBisdQCAQavHiO/atUvx8fF67rnn9Nlnn+mBBx7Q97//fRUWFiozM1OlpaXas2ePEhMT1djYqC1btuj06dNyOp3atm1bsNYBAAatHiM+bdo05eTk+P+22+1qampSRkaGJCk7O1v79+9XUlKSsrKyZFmWEhMT1dXVpbNnzyohIeGyebrd7gCvwtUF/WyRIAvmezkYtLW18Z6iT8Lpc9JjxKOjoyVJra2tmjdvngoLC7Vy5UpZluV/vKWlRa2trYqPj+/2upaWlitGPCVloKc1eHgvA8vtdvOeBsyxUA+gXwX7c+Jyua76WK9fbJ4+fVo///nPlZubq+nTp8tm+/olHo9HcXFxiomJkcfj6TY9Njb2Ww4bANCbHiP+ySef6NFHH9Xvf/975eXlSZJuv/12NTQ0SJLq6uqUnp6utLQ01dfXy+v1qrm5WV6v94p74QCAwOrxcMr69et17tw5rV27VmvXrpUkLVq0SEuXLlVlZaWSk5OVk5Mju92u9PR05efny+v1qrS0NCiDB4DBzvL5fL5gLczlcmnSpEnBWlzQL7sPOi67DyiOiQfOLQv/L9RD6FfHV9wX1OX11E4u9gEAgw3sH8CC2fgBM6BX7IkDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYjIgDgMGIOAAYrE8RP3z4sBwOhyTpxIkTmjVrlh566CEtWbJEXq9XkrRmzRrl5eWpoKBA77zzTv+NGADg12vEN27cqMWLF6u9vV2SVFFRocLCQm3evFk+n0979uxRU1OTGhsbtWXLFlVWVqq8vLzfBw4AkCJ6e8Lo0aNVVVWlBQsWSJKampqUkZEhScrOztb+/fuVlJSkrKwsWZalxMREdXV16ezZs0pISLhsfm63O8CrcHUpQVtSaATzvQwFth/CVThtu14jnpOTo5MnT/r/9vl8sixLkhQdHa2Wlha1trYqPj7e/5wL068U8ZSUgf5PM3h4L802sLffsVAPoF8Fe9u5XK6rPtZrxC9ls319BMbj8SguLk4xMTHyeDzdpsfGxn7TWQPd3NK2OdRD6FfHQz0ADAjf+OyU22+/XQ0NDZKkuro6paenKy0tTfX19fJ6vWpubpbX673iXjgAILC+8Z54UVGRSkpKVFlZqeTkZOXk5Mhutys9PV35+fnyer0qLS3tj7ECAC7Rp4iPGjVKtbW1kqSkpCTV1NRc9hyn0ymn0xnY0QEAesTFPgBgMCIOAAYj4gBgsG/8xaZJOEUNwEDHnjgAGIyIA4DBiDgAGIyIA4DBiDgAGIyIA4DBiDgAGIyIA4DBiDgAGIyIA4DBiDgAGIyIA4DBiDgAGIyIA4DBiDgAGIyIA4DBiDgAGIyIA4DBiDgAGIyIA4DBiDgAGIyIA4DBiDgAGIyIA4DBiDgAGIyIA4DBiDgAGIyIA4DBiDgAGIyIA4DBIgI5M6/Xq7KyMh05ckRRUVFaunSpxowZE8hFAAAuEtA98TfeeEMdHR169dVX9dRTT2nFihWBnD0A4BIBjbjL5dLkyZMlSampqXr33XcDOXsAwCUCejiltbVVMTEx/r/tdrs6OzsVEfH1YlwuVyAX2aNtM24K2rJCIZjvZSiw/czFtguegEY8JiZGHo/H/7fX6+0W8EmTJgVycQAw6AX0cEpaWprq6uokSYcOHdL48eMDOXsAwCUsn8/nC9TMLpyd8sEHH8jn82n58uUaO3ZsoGYPALhEQCMOAAguLvYBLtLR0RHqIeAbamtrG9TbjYhjUHrzzTc1ZcoU3X333frb3/7mnz5nzpwQjgp98dFHH+k3v/mNSktLdeDAAd1777269957tXfv3lAPLSQCenYKYIr169drx44d8vl8mj9/vtrb2/XAAw+Io4vhr7i4WE6nU6dOndK8efP0+uuv6zvf+Y7mzJmjKVOmhHp4QUfEvwWHw6Hz5893m+bz+WRZll555ZUQjQp9ERkZqfj4eEnS2rVr9Ytf/EIjR46UZVkhHhl609nZqYyMDElSQ0ODvvvd70pSt9OZBxO+2PwWDh8+rMWLF+tPf/qT7HZ7t8e+973vhWhU6IsFCxbo+uuv1/z58zV06FCdPn1ajz32mM6dO6f6+vpQDw89KC4ulmVZevbZZ2WzfXVEeMOGDXrvvff0/PPPh3h0wWcvKysrC/UgTHXTTTfpf//7nzo7O5Wamqq4uDj/fwhvU6ZM0aeffqpx48YpMjJSsbGxysnJ0RdffKHs7OxQDw89uHDI5OLTl0+ePKlf/vKXioyMDNWwQoY9cQAwGGenAIDBiDgAGIyIY8BpaGjQXXfdJYfDIYfDoZkzZ6q6uvqKz3U4HDp69GiQRwgEzuA8JwcD3p133qnVq1dL+uoqzGnTpik3N5cvnTHgEHEMeK2trbLZbHr//fe1atUq+Xw+jRgxQqtWrfI/57///a/KysrU3t6uzz//XL/97W81depUrV69WgcPHpTX69V9992n2bNna9OmTdq5c6dsNpvS0tJUVFQUwrXDYEfEMSAdPHhQDodDlmUpMjJSJSUlWrp0qVavXq2xY8dq06ZN3Q6jHDt2TI888ogyMzP19ttvq6qqSlOnTtXOnTtVU1OjESNGaPv27ZKk7du3q6SkRKmpqdq8efNlNz4BgolPHgakiw+nXFBcXOw/t/jhhx/u9tiNN96odevWaevWrbIsS52dnZKkyspKVVZW6pNPPvHferCiokJ/+ctftGrVKqWmpnKpPkKKLzYxaAwfPlzHjx+X9NUVfrt37/Y/9sILLyg3N1fPPfecMjMz5fP51NHRob///e+qrKzUX//6V+3YsUOnTp1SbW2tysvLVVNTI7fbrX/+858hWiOAPXEMIuXl5SouLpbNZtONN96o2bNn66WXXpIkTZs2TcuWLdOLL76okSNH6rPPPlNUVJSGDRum3NxcDRs2TD/84Q+VmJio2267TXl5ebr++us1YsQI3XHHHSFeMwxmXLEJAAbjcAoAGIyIA4DBiDgAGIyIA4DBiDgAGIyIA4DBiDgAGOz/Af4qNSR1Voj0AAAAAElFTkSuQmCC\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.figure()\n", "df_train.pivot_table('PassengerId', 'Pclass', 'Survived', 'count').plot(kind='bar', stacked=True)" ] }, { "cell_type": "code", "execution_count": 300, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 Braund, Mr. Owen Harris\n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th...\n", "2 Heikkinen, Miss. Laina\n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel)\n", "4 Allen, Mr. William Henry\n", "Name: Name, dtype: object" ] }, "execution_count": 300, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train['Name'].head()" ] }, { "cell_type": "code", "execution_count": 301, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',\n", " 'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess',\n", " 'Jonkheer'], dtype=object)" ] }, "execution_count": 301, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "df_train['Title'] = df_train['Name'].apply(lambda x: (re.search(' ([a-zA-Z]+)\\.', x)).group(1))\n", "df_test['Title'] = df_test['Name'].apply(lambda x: (re.search(' ([a-zA-Z]+)\\.', x)).group(1))\n", "df_train['Title'].unique()" ] }, { "cell_type": "code", "execution_count": 302, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Have a look at rough social classes\n", "soclasses = {\n", " 'upper': ('Sir', 'Lady', 'Don', 'Countess', 'Jonkheer'), \n", " 'pro': ('Dr', 'Rev', 'Major', 'Col', 'Capt'), \n", " 'middle': ('Mr', 'Miss', 'Mrs', 'Master', 'Mlle', 'Ms', 'Mme')\n", "}\n", "for soclass in soclasses:\n", " df_train.loc[df_train['Title'].isin(soclasses[soclass]), 'SocialClass'] = soclass\n", " df_test.loc[df_test['Title'].isin(soclasses[soclass]), 'SocialClass'] = soclass\n", " \n", "df_train['SocialClass'].fillna('U', inplace=True)\n", "df_test['SocialClass'].fillna('U', inplace=True)" ] }, { "cell_type": "code", "execution_count": 303, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Fill in any missing ages with the median for the passenger class, sex, and social class\n", "def impute_age(df):\n", " df['Age'] = df.groupby(['Sex', 'Pclass', 'SocialClass'])['Age'].apply(lambda x: x.fillna(x.median()))\n", "\n", "impute_age(df_train)\n", "impute_age(df_test)" ] }, { "cell_type": "code", "execution_count": 304, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Fill in any missing ticket fares with the median for passenger class, sex, and age\n", "def impute_fare(df):\n", " df['Fare'] = df.groupby(['Sex', 'Pclass', 'Age'])['Fare'].apply(lambda x: x.fillna(x.median()))\n", "\n", "impute_fare(df_train)\n", "impute_fare(df_test)" ] }, { "cell_type": "code", "execution_count": 305, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassAgeSibSpParchFare
CabinNo
0442.5035970.3007192.63597127.1456830.5424460.36259019.110155
1377.2222220.7037041.55555627.3148150.4814810.59259362.023144
2560.6111110.8333331.00000042.7777780.4444440.11111150.349089
3466.4000000.5600001.00000037.7968000.6400000.92000096.852500
4396.3571430.7500001.17857136.3035710.3214290.32142959.593307
5563.4285710.6666671.00000039.5714290.3809520.33333365.937895
6410.0000000.7272731.09090932.1363640.3636360.636364156.446964
7549.0000000.6363641.00000033.2727270.7272730.818182104.447718
8437.6000000.7000001.10000036.7000000.5000000.00000088.854160
9345.8888890.3333331.00000039.0000000.4444440.22222287.845833
10483.7500000.7500001.00000033.7916670.6666670.750000112.423617
11414.4000000.8000001.30000040.9500000.2000000.00000073.920840
12363.5000000.0000001.00000033.5000000.0000000.50000028.725000
13488.7272730.6363641.36363637.1363640.3636360.27272757.465909
15890.0000001.0000001.00000026.0000000.0000000.00000030.000000
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass Age SibSp Parch \\\n", "CabinNo \n", "0 442.503597 0.300719 2.635971 27.145683 0.542446 0.362590 \n", "1 377.222222 0.703704 1.555556 27.314815 0.481481 0.592593 \n", "2 560.611111 0.833333 1.000000 42.777778 0.444444 0.111111 \n", "3 466.400000 0.560000 1.000000 37.796800 0.640000 0.920000 \n", "4 396.357143 0.750000 1.178571 36.303571 0.321429 0.321429 \n", "5 563.428571 0.666667 1.000000 39.571429 0.380952 0.333333 \n", "6 410.000000 0.727273 1.090909 32.136364 0.363636 0.636364 \n", "7 549.000000 0.636364 1.000000 33.272727 0.727273 0.818182 \n", "8 437.600000 0.700000 1.100000 36.700000 0.500000 0.000000 \n", "9 345.888889 0.333333 1.000000 39.000000 0.444444 0.222222 \n", "10 483.750000 0.750000 1.000000 33.791667 0.666667 0.750000 \n", "11 414.400000 0.800000 1.300000 40.950000 0.200000 0.000000 \n", "12 363.500000 0.000000 1.000000 33.500000 0.000000 0.500000 \n", "13 488.727273 0.636364 1.363636 37.136364 0.363636 0.272727 \n", "15 890.000000 1.000000 1.000000 26.000000 0.000000 0.000000 \n", "\n", " Fare \n", "CabinNo \n", "0 19.110155 \n", "1 62.023144 \n", "2 50.349089 \n", "3 96.852500 \n", "4 59.593307 \n", "5 65.937895 \n", "6 156.446964 \n", "7 104.447718 \n", "8 88.854160 \n", "9 87.845833 \n", "10 112.423617 \n", "11 73.920840 \n", "12 28.725000 \n", "13 57.465909 \n", "15 30.000000 " ] }, "execution_count": 305, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Fill in missing Embarked info with 'U' (unknown)\n", "df_train['Embarked'].fillna('U', inplace=True)\n", "df_test['Embarked'].fillna('U', inplace=True)\n", "\n", "# Ditto Title...\n", "df_train['Title'].fillna('U', inplace=True)\n", "df_test['Title'].fillna('U', inplace=True)\n", "\n", "# ... and cabin info \n", "df_train['Cabin'].fillna('U0', inplace=True)\n", "df_test['Cabin'].fillna('U0', inplace=True)\n", "# Add a deck feature\n", "df_train['CabinDeck'] = df_train['Cabin'].map(lambda x: x[0])\n", "df_test['CabinDeck'] = df_test['Cabin'].map(lambda x: x[0])\n", "\n", "# Deck layouts look like cabin numbers might be interesting (low numbers are aft)\n", "def cabin_num(x):\n", " toks = x.split()\n", " try:\n", " val = int(toks[0][1:])\n", " except Exception as err:\n", " val = 0\n", " finally:\n", " return val\n", " \n", "# Try grouping in groups of 10\n", "def grpcabin(x):\n", " grp = 0\n", " for i in range(0, 1000, 10):\n", " if x > i:\n", " grp += 1\n", " else:\n", " return grp\n", " \n", "df_train['CabinNo'] = df_train['Cabin'].map(lambda x: cabin_num(x)).apply(grpcabin)\n", "df_test['CabinNo'] = df_test['Cabin'].map(lambda x: cabin_num(x)).apply(grpcabin)\n", "df_train.groupby(['CabinNo']).mean()" ] }, { "cell_type": "code", "execution_count": 306, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassAgeSibSpParchFareCabinNo
PassengerId1.000000-0.005007-0.0351440.040091-0.057527-0.0016520.0126580.029668
Survived-0.0050071.000000-0.338481-0.059662-0.0353220.0816290.2573070.243628
Pclass-0.035144-0.3384811.000000-0.4113960.0830810.018443-0.549500-0.603300
Age0.040091-0.059662-0.4113961.000000-0.249289-0.1749040.1220790.240026
SibSp-0.057527-0.0353220.083081-0.2492891.0000000.4148380.159651-0.035054
Parch-0.0016520.0816290.018443-0.1749040.4148381.0000000.2162250.007560
Fare0.0126580.257307-0.5495000.1220790.1596510.2162251.0000000.427849
CabinNo0.0296680.243628-0.6033000.240026-0.0350540.0075600.4278491.000000
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass Age SibSp Parch \\\n", "PassengerId 1.000000 -0.005007 -0.035144 0.040091 -0.057527 -0.001652 \n", "Survived -0.005007 1.000000 -0.338481 -0.059662 -0.035322 0.081629 \n", "Pclass -0.035144 -0.338481 1.000000 -0.411396 0.083081 0.018443 \n", "Age 0.040091 -0.059662 -0.411396 1.000000 -0.249289 -0.174904 \n", "SibSp -0.057527 -0.035322 0.083081 -0.249289 1.000000 0.414838 \n", "Parch -0.001652 0.081629 0.018443 -0.174904 0.414838 1.000000 \n", "Fare 0.012658 0.257307 -0.549500 0.122079 0.159651 0.216225 \n", "CabinNo 0.029668 0.243628 -0.603300 0.240026 -0.035054 0.007560 \n", "\n", " Fare CabinNo \n", "PassengerId 0.012658 0.029668 \n", "Survived 0.257307 0.243628 \n", "Pclass -0.549500 -0.603300 \n", "Age 0.122079 0.240026 \n", "SibSp 0.159651 -0.035054 \n", "Parch 0.216225 0.007560 \n", "Fare 1.000000 0.427849 \n", "CabinNo 0.427849 1.000000 " ] }, "execution_count": 306, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.corr()" ] }, { "cell_type": "code", "execution_count": 307, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 307, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#print(df_train.columns)\n", "# So far the most interesting attributes for Survival are Fare, CabinNo, and Pclass\n", "features = [\"Fare\", \"CabinNo\", \"Pclass\"]\n", "plt.figure()\n", "sns.heatmap(df_train.corr(), center=0)" ] }, { "cell_type": "code", "execution_count": 308, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "PassengerId int64\n", "Survived int64\n", "Pclass int64\n", "Name object\n", "Sex object\n", "Age float64\n", "SibSp int64\n", "Parch int64\n", "Ticket object\n", "Fare float64\n", "Cabin object\n", "Embarked object\n", "Family object\n", "Title object\n", "SocialClass object\n", "CabinDeck object\n", "CabinNo int64\n", "dtype: object" ] }, "execution_count": 308, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Closer look at the columns\n", "df_train.dtypes" ] }, { "cell_type": "code", "execution_count": 309, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Ticket prefix might be interesting, ticket number maybe not so much\n", "tmp = df_train['Ticket'].str.split()\n", "df_train['TicketPrefix'] = tmp.apply(lambda x: x[0] + x[1] if len(x) == 3 else (x[0] if len(x) == 2 else 'U'))\n", "tmp = df_test['Ticket'].str.split()\n", "df_test['TicketPrefix'] = tmp.apply(lambda x: x[0] + x[1] if len(x) == 3 else (x[0] if len(x) == 2 else 'U'))" ] }, { "cell_type": "code", "execution_count": 310, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.preprocessing import LabelEncoder, LabelBinarizer\n", "\n", "def onehot_column(df, col):\n", " enc = LabelBinarizer()\n", " arr = enc.fit_transform(df[col])\n", " df[col] = arr\n", " \n", "\n", "def encode_column(df, col):\n", " enc = LabelEncoder()\n", " arr = enc.fit_transform(df[col])\n", " df[col] = arr" ] }, { "cell_type": "code", "execution_count": 311, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# One hot encode categorical columns w/o ordinality\n", "for col in ['Sex', 'TicketPrefix', 'Embarked', 'Title']:\n", " onehot_column(df_train, col)\n", " onehot_column(df_test, col)\n", " \n", "# Label encode categorical columns with ordinality\n", "for col in ['CabinDeck', 'CabinNo', 'Family', 'SocialClass']:\n", " encode_column(df_train, col)\n", " encode_column(df_test, col)" ] }, { "cell_type": "code", "execution_count": 312, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassSexAgeSibSpParchFareEmbarkedFamilyTitleSocialClassCabinDeckCabinNoTicketPrefix
PassengerId1.000000-0.005007-0.0351440.0429390.040091-0.057527-0.0016520.012658-0.0012050.0692800.0390970.061248-0.0309390.029036-0.024342
Survived-0.0050071.000000-0.338481-0.543351-0.059662-0.0353220.0816290.2573070.168240-0.000495-0.0264560.002855-0.3011160.243581-0.037436
Pclass-0.035144-0.3384811.0000000.131900-0.4113960.0830810.018443-0.549500-0.2432920.014747-0.052496-0.1884780.746616-0.6038090.039243
Sex0.042939-0.5433510.1319001.0000000.101603-0.114631-0.245489-0.182333-0.0828530.1519120.0247280.0560260.123076-0.0991870.034990
Age0.040091-0.059662-0.4113960.1016031.000000-0.249289-0.1749040.1220790.0426760.1231100.1032080.184765-0.2928540.240570-0.021745
SibSp-0.057527-0.0353220.083081-0.114631-0.2492891.0000000.4148380.159651-0.059528-0.7172400.014507-0.0231710.041540-0.034931-0.022508
Parch-0.0016520.0816290.018443-0.245489-0.1749040.4148381.0000000.216225-0.011069-0.4275660.025731-0.059277-0.0325480.007763-0.022467
Fare0.0126580.257307-0.549500-0.1823330.1220790.1596510.2162251.0000000.269335-0.2089330.0261840.018150-0.5230130.428670-0.023068
Embarked-0.0012050.168240-0.243292-0.0828530.042676-0.059528-0.0110690.2693351.000000-0.002195-0.0161580.052302-0.2430170.159167-0.022864
Family0.069280-0.0004950.0147470.1519120.123110-0.717240-0.427566-0.208933-0.0021951.0000000.0164830.0217720.058118-0.0443990.023323
Title0.039097-0.026456-0.0524960.0247280.1032080.0145070.0257310.026184-0.0161580.0164831.0000000.159064-0.0779070.021410-0.001590
SocialClass0.0612480.002855-0.1884780.0560260.184765-0.023171-0.0592770.0181500.0523020.0217720.1590641.000000-0.1253160.051488-0.007303
CabinDeck-0.030939-0.3011160.7466160.123076-0.2928540.041540-0.032548-0.523013-0.2430170.058118-0.077907-0.1253161.000000-0.7875900.024762
CabinNo0.0290360.243581-0.603809-0.0991870.240570-0.0349310.0077630.4286700.159167-0.0443990.0214100.051488-0.7875901.000000-0.020222
TicketPrefix-0.024342-0.0374360.0392430.034990-0.021745-0.022508-0.022467-0.023068-0.0228640.023323-0.001590-0.0073030.024762-0.0202221.000000
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass Sex Age SibSp \\\n", "PassengerId 1.000000 -0.005007 -0.035144 0.042939 0.040091 -0.057527 \n", "Survived -0.005007 1.000000 -0.338481 -0.543351 -0.059662 -0.035322 \n", "Pclass -0.035144 -0.338481 1.000000 0.131900 -0.411396 0.083081 \n", "Sex 0.042939 -0.543351 0.131900 1.000000 0.101603 -0.114631 \n", "Age 0.040091 -0.059662 -0.411396 0.101603 1.000000 -0.249289 \n", "SibSp -0.057527 -0.035322 0.083081 -0.114631 -0.249289 1.000000 \n", "Parch -0.001652 0.081629 0.018443 -0.245489 -0.174904 0.414838 \n", "Fare 0.012658 0.257307 -0.549500 -0.182333 0.122079 0.159651 \n", "Embarked -0.001205 0.168240 -0.243292 -0.082853 0.042676 -0.059528 \n", "Family 0.069280 -0.000495 0.014747 0.151912 0.123110 -0.717240 \n", "Title 0.039097 -0.026456 -0.052496 0.024728 0.103208 0.014507 \n", "SocialClass 0.061248 0.002855 -0.188478 0.056026 0.184765 -0.023171 \n", "CabinDeck -0.030939 -0.301116 0.746616 0.123076 -0.292854 0.041540 \n", "CabinNo 0.029036 0.243581 -0.603809 -0.099187 0.240570 -0.034931 \n", "TicketPrefix -0.024342 -0.037436 0.039243 0.034990 -0.021745 -0.022508 \n", "\n", " Parch Fare Embarked Family Title SocialClass \\\n", "PassengerId -0.001652 0.012658 -0.001205 0.069280 0.039097 0.061248 \n", "Survived 0.081629 0.257307 0.168240 -0.000495 -0.026456 0.002855 \n", "Pclass 0.018443 -0.549500 -0.243292 0.014747 -0.052496 -0.188478 \n", "Sex -0.245489 -0.182333 -0.082853 0.151912 0.024728 0.056026 \n", "Age -0.174904 0.122079 0.042676 0.123110 0.103208 0.184765 \n", "SibSp 0.414838 0.159651 -0.059528 -0.717240 0.014507 -0.023171 \n", "Parch 1.000000 0.216225 -0.011069 -0.427566 0.025731 -0.059277 \n", "Fare 0.216225 1.000000 0.269335 -0.208933 0.026184 0.018150 \n", "Embarked -0.011069 0.269335 1.000000 -0.002195 -0.016158 0.052302 \n", "Family -0.427566 -0.208933 -0.002195 1.000000 0.016483 0.021772 \n", "Title 0.025731 0.026184 -0.016158 0.016483 1.000000 0.159064 \n", "SocialClass -0.059277 0.018150 0.052302 0.021772 0.159064 1.000000 \n", "CabinDeck -0.032548 -0.523013 -0.243017 0.058118 -0.077907 -0.125316 \n", "CabinNo 0.007763 0.428670 0.159167 -0.044399 0.021410 0.051488 \n", "TicketPrefix -0.022467 -0.023068 -0.022864 0.023323 -0.001590 -0.007303 \n", "\n", " CabinDeck CabinNo TicketPrefix \n", "PassengerId -0.030939 0.029036 -0.024342 \n", "Survived -0.301116 0.243581 -0.037436 \n", "Pclass 0.746616 -0.603809 0.039243 \n", "Sex 0.123076 -0.099187 0.034990 \n", "Age -0.292854 0.240570 -0.021745 \n", "SibSp 0.041540 -0.034931 -0.022508 \n", "Parch -0.032548 0.007763 -0.022467 \n", "Fare -0.523013 0.428670 -0.023068 \n", "Embarked -0.243017 0.159167 -0.022864 \n", "Family 0.058118 -0.044399 0.023323 \n", "Title -0.077907 0.021410 -0.001590 \n", "SocialClass -0.125316 0.051488 -0.007303 \n", "CabinDeck 1.000000 -0.787590 0.024762 \n", "CabinNo -0.787590 1.000000 -0.020222 \n", "TicketPrefix 0.024762 -0.020222 1.000000 " ] }, "execution_count": 312, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# The originally interesting features were Fare, CabinNo, and Pclass - where are we now?\n", "plt.figure()\n", "sns.heatmap(df_train.corr(), center=0)\n", "df_train.corr()" ] }, { "cell_type": "code", "execution_count": 313, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Looks like Sex, Embarked, and CabinDeck can be added; maybe Age, SibSp, Parch, Title, and TicketPrefix\n", "features = ['Fare', 'CabinNo', 'Pclass', 'Sex', 'Embarked', 'CabinDeck', 'Age', 'SibSp', 'Parch', 'Title', 'TicketPrefix']" ] }, { "cell_type": "code", "execution_count": 314, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FareCabinNoPclassSexEmbarkedCabinDeckAgeSibSpParchTitleTicketPrefix
count891.000000891.000000891.000000891.000000891.000000891.000000891.000000891.000000891.000000891.000000891.000000
mean32.2042081.2008982.3086420.6475870.1885526.71604929.0911000.5230080.3815940.0011220.002245
std49.6934292.8182870.8360710.4779900.3913722.46073913.2939421.1027430.8060570.0335010.047351
min0.0000000.0000001.0000000.0000000.0000000.0000000.4200000.0000000.0000000.0000000.000000
25%7.9104000.0000002.0000000.0000000.0000008.00000021.5000000.0000000.0000000.0000000.000000
50%14.4542000.0000003.0000001.0000000.0000008.00000026.0000000.0000000.0000000.0000000.000000
75%31.0000000.0000003.0000001.0000000.0000008.00000036.0000001.0000000.0000000.0000000.000000
max512.32920014.0000003.0000001.0000001.0000008.00000080.0000008.0000006.0000001.0000001.000000
\n", "
" ], "text/plain": [ " Fare CabinNo Pclass Sex Embarked CabinDeck \\\n", "count 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 \n", "mean 32.204208 1.200898 2.308642 0.647587 0.188552 6.716049 \n", "std 49.693429 2.818287 0.836071 0.477990 0.391372 2.460739 \n", "min 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 \n", "25% 7.910400 0.000000 2.000000 0.000000 0.000000 8.000000 \n", "50% 14.454200 0.000000 3.000000 1.000000 0.000000 8.000000 \n", "75% 31.000000 0.000000 3.000000 1.000000 0.000000 8.000000 \n", "max 512.329200 14.000000 3.000000 1.000000 1.000000 8.000000 \n", "\n", " Age SibSp Parch Title TicketPrefix \n", "count 891.000000 891.000000 891.000000 891.000000 891.000000 \n", "mean 29.091100 0.523008 0.381594 0.001122 0.002245 \n", "std 13.293942 1.102743 0.806057 0.033501 0.047351 \n", "min 0.420000 0.000000 0.000000 0.000000 0.000000 \n", "25% 21.500000 0.000000 0.000000 0.000000 0.000000 \n", "50% 26.000000 0.000000 0.000000 0.000000 0.000000 \n", "75% 36.000000 1.000000 0.000000 0.000000 0.000000 \n", "max 80.000000 8.000000 6.000000 1.000000 1.000000 " ] }, "execution_count": 314, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train[features].describe()" ] }, { "cell_type": "code", "execution_count": 315, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 549\n", "1 342\n", "Name: Survived, dtype: int64" ] }, "execution_count": 315, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Let's scale Fare \n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.preprocessing import Imputer\n", "\n", "def scale_col(df, col):\n", " imp = Imputer(missing_values='NaN', strategy='mean', axis=0)\n", " df[col] = imp.fit_transform(df[[col]])\n", " scaler = StandardScaler()\n", " df[col] = scaler.fit_transform(df[[col]])\n", "\n", "scale_col(df_train, 'Fare')\n", "scale_col(df_test, 'Fare')\n", "df_train[features].describe()\n", "# How are we doing for class balance?\n", "df_train['Survived'].value_counts()" ] }, { "cell_type": "code", "execution_count": 316, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{0: 0.38383838383838387, 1: 0.6161616161616161}" ] }, "execution_count": 316, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Not bad but let's try balancing\n", "classes = df_train['Survived'].unique().tolist()\n", "weights = {\n", " k: 1 - len(df_train[df_train['Survived'] == k]) / len(df_train['Survived']) for k in classes\n", "}\n", "weights" ] }, { "cell_type": "code", "execution_count": 317, "metadata": {}, "outputs": [], "source": [ "# Try adding Euclidean distances from centroids \n", "from sklearn.cluster import KMeans\n", "train = df_train[features].copy()\n", "kmeans = KMeans().fit(train)\n", "\n", "def add_kmeans_features(df):\n", " train_dists = kmeans.transform(df[features])\n", " for col in range(train_dists.shape[1]):\n", " colname = 'dist{0}'.format(col)\n", " df.loc[:, colname] = train_dists[:,col]\n", " scale_col(df, colname)\n", " \n", "add_kmeans_features(df_train)\n", "add_kmeans_features(df_test)\n", "features.extend([col for col in train if col.startswith('dist')])" ] }, { "cell_type": "code", "execution_count": 318, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp Parch \\\n", "0 Braund, Mr. Owen Harris 1 22.0 1 0 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 38.0 1 0 \n", "2 Heikkinen, Miss. Laina 0 26.0 0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 35.0 1 0 \n", "4 Allen, Mr. William Henry 1 35.0 0 0 \n", "\n", " Ticket Fare ... CabinNo TicketPrefix dist0 \\\n", "0 A/5 21171 -0.502445 ... 0 0 -0.598335 \n", "1 PC 17599 0.786845 ... 9 0 0.766014 \n", "2 STON/O2. 3101282 -0.488854 ... 0 0 -0.282673 \n", "3 113803 0.420730 ... 13 0 0.642272 \n", "4 373450 -0.486337 ... 0 0 0.415993 \n", "\n", " dist1 dist2 dist3 dist4 dist5 dist6 dist7 \n", "0 -0.925502 0.514910 -0.270365 0.535674 0.113658 -0.891430 0.326868 \n", "1 0.651874 -0.882697 0.118371 -0.687792 -1.746630 0.748394 -0.605323 \n", "2 -0.891656 0.140627 -0.756014 0.218261 -0.313025 -0.536712 -0.136768 \n", "3 0.676027 -0.454536 0.367183 -0.367780 -1.428689 0.677730 -0.109918 \n", "4 -0.004193 -0.691096 -0.969891 -0.494080 -0.827510 0.281679 -1.175283 \n", "\n", "[5 rows x 26 columns]\n" ] } ], "source": [ "print(df_train.head())" ] }, { "cell_type": "code", "execution_count": 319, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 5 folds for each of 15 candidates, totalling 75 fits\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=7)]: Done 36 tasks | elapsed: 16.5s\n", "[Parallel(n_jobs=7)]: Done 75 out of 75 | elapsed: 1.0min finished\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Metrics: Best score = 0.8013468013468014 using params = {'C': 10, 'kernel': 'rbf'}\n", "\n", "Fitting 5 folds for each of 1470 candidates, totalling 7350 fits\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=7)]: Done 36 tasks | elapsed: 8.3s\n", "[Parallel(n_jobs=7)]: Done 186 tasks | elapsed: 20.9s\n", "[Parallel(n_jobs=7)]: Done 436 tasks | elapsed: 42.6s\n", "[Parallel(n_jobs=7)]: Done 786 tasks | elapsed: 1.2min\n", "[Parallel(n_jobs=7)]: Done 1236 tasks | elapsed: 1.9min\n", "[Parallel(n_jobs=7)]: Done 1786 tasks | elapsed: 2.7min\n", "[Parallel(n_jobs=7)]: Done 2436 tasks | elapsed: 3.6min\n", "[Parallel(n_jobs=7)]: Done 3186 tasks | elapsed: 4.9min\n", "[Parallel(n_jobs=7)]: Done 4036 tasks | elapsed: 6.4min\n", "[Parallel(n_jobs=7)]: Done 4986 tasks | elapsed: 8.0min\n", "[Parallel(n_jobs=7)]: Done 6036 tasks | elapsed: 9.5min\n", "[Parallel(n_jobs=7)]: Done 7186 tasks | elapsed: 11.2min\n", "[Parallel(n_jobs=7)]: Done 7350 out of 7350 | elapsed: 11.4min finished\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Metrics: Best score = 0.8103254769921436 using params = {'min_samples_split': 0.1, 'class_weight': None, 'n_jobs': -1, 'verbose': True, 'criterion': 'entropy', 'max_features': None, 'oob_score': True, 'n_estimators': 50}\n", "Model and metadata saved to 'titanic_rf_1522592121.pkl'\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 0.0s\n", "[Parallel(n_jobs=-1)]: Done 50 out of 50 | elapsed: 0.0s finished\n" ] } ], "source": [ "from sklearn.externals import joblib\n", "import sklearn\n", "import multiprocessing as mp\n", "import sys\n", "import time\n", "import platform\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.svm import SVC\n", "from sklearn.ensemble import RandomForestClassifier\n", "\n", "\n", "def persist_model(mdl, basename, **kwargs):\n", " version = sys.version_info\n", " metadata = {\n", " 'versions': {\n", " 'python': '{0}.{1}.{2}'.format(version.major, version.minor, version.micro),\n", " 'pandas': pd.__version__,\n", " 'sklearn': sklearn.__version__\n", " }\n", " }\n", " if kwargs is not None:\n", " metadata.update(**kwargs)\n", " outname = '{0}_{1}.pkl'.format(basename, int(time.time()))\n", " bndl = {\n", " 'model': mdl,\n", " 'metadata': metadata\n", " }\n", " joblib.dump(clf, outname)\n", " return outname\n", "\n", "\n", "def grid_search(clz, X, y, parameters=None):\n", " clf = GridSearchCV(estimator=clz, param_grid=parameters, verbose=1, n_jobs=mp.cpu_count() - 1, cv=5)\n", " clf.fit(X, y)\n", " return clf\n", "\n", "\n", "# We'll look at an SVC first, then use it as a benchmark of sorts for the Random Forest model.\n", "\n", "def find_svc(X, y):\n", " parameters = {'kernel':('linear', 'rbf', 'sigmoid'), 'C': [1, 10, 0.5, 0.75, 0.25]}\n", " return grid_search(SVC(), X, y, parameters)\n", "\n", "train = df_train.copy()\n", "clf = find_svc(train[features], train['Survived'])\n", "print('Metrics: Best score = {0} using params = {1}\\n'.format(clf.best_score_, clf.best_params_))\n", "\n", "\n", "def find_rf(X, y):\n", " parameters = {\n", " 'n_estimators': [10, 20, 50, 64, 100, 128, 256], \n", " 'criterion': ['gini', 'entropy'],\n", " 'max_features': ['sqrt', 'log2', None, 0.1, 0.25, 0.5, 0.75],\n", " 'min_samples_split': [0.1, 0.25, 0.5, 0.75, 1.0],\n", " 'verbose': [True],\n", " 'oob_score': [True],\n", " 'class_weight': [weights, \"balanced_subsample\", None],\n", " 'n_jobs': [-1]\n", " }\n", " return grid_search(RandomForestClassifier(), X, y, parameters)\n", "\n", "train = df_train.copy()\n", "clf2 = find_rf(train[features], train['Survived'])\n", "print('Metrics: Best score = {0} using params = {1}'.format(clf2.best_score_, clf2.best_params_))\n", "\n", " \n", "saved_model = persist_model(clf2, 'titanic_rf', features=features)\n", "print(\"Model and metadata saved to '{0}'\".format(saved_model))" ] }, { "cell_type": "code", "execution_count": 320, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[Parallel(n_jobs=8)]: Done 34 tasks | elapsed: 0.0s\n", "[Parallel(n_jobs=8)]: Done 50 out of 50 | elapsed: 0.0s finished\n" ] } ], "source": [ "# Ok let's do this\n", "submission_df = pd.DataFrame(\n", " {\n", " 'PassengerId': df_test['PassengerId'],\n", " 'Survived': clf2.predict(df_test[features])\n", " }\n", ")" ] }, { "cell_type": "code", "execution_count": 321, "metadata": {}, "outputs": [], "source": [ "submission = submission_df.to_csv(\n", " 'titanic.csv',\n", " index=False\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.4" } }, "nbformat": 4, "nbformat_minor": 2 }