{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"KaggleTitanicKoban.ipynb","provenance":[]},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ohs-Fg2Y0vls","executionInfo":{"status":"ok","timestamp":1613092594716,"user_tz":300,"elapsed":56292,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"eae4ad35-236d-4ea4-8d9f-cdf08070e98b"},"source":["# Mount data drive\n","from google.colab import drive\n","drive.mount('/data/')\n","data_dir = '/data/My Drive/EMSE 6575/LogisticRegressionHomework'"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Mounted at /data/\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"wRJkv9QU1Jo-"},"source":["# Load libraries\n","import pandas as pd\n","import numpy as np\n","from sklearn import preprocessing\n","import matplotlib.pyplot as plt \n","plt.rc(\"font\", size=14)\n","from sklearn.linear_model import LogisticRegression\n","from sklearn.model_selection import train_test_split\n","import seaborn as sns\n","sns.set(style=\"white\")\n","sns.set(style=\"whitegrid\", color_codes=True)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":306},"id":"apheM3cS1LLa","executionInfo":{"status":"ok","timestamp":1613092598910,"user_tz":300,"elapsed":60474,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"08e2635e-29d7-43ae-c79f-ab531f0d9d85"},"source":["# Read the data - \n","# Data is marketing campaign data for a bank where the goal is to predict whether the client will subscribe to a term deposit.\n","train = pd.read_excel(data_dir + '/train.xlsx', header=0)\n","test = pd.read_excel(data_dir + '/test.xlsx', header=0)\n","train['train_test'] = 'train'\n","test['train_test'] = 'test'\n","data = pd.concat([train, test]).reset_index()\n","print(\"Train: \" + str(train.shape))\n","print(\"Test: \" + str(test.shape))\n","print(\"Total: \" + str(data.shape))\n","data.head()"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Train: (891, 13)\n","Test: (418, 12)\n","Total: (1309, 14)\n"],"name":"stdout"},{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>index</th>\n"," <th>PassengerId</th>\n"," <th>Survived</th>\n"," <th>Pclass</th>\n"," <th>Name</th>\n"," <th>Sex</th>\n"," <th>Age</th>\n"," <th>SibSp</th>\n"," <th>Parch</th>\n"," <th>Ticket</th>\n"," <th>Fare</th>\n"," <th>Cabin</th>\n"," <th>Embarked</th>\n"," <th>train_test</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0.0</td>\n"," <td>3</td>\n"," <td>Braund, Mr. Owen Harris</td>\n"," <td>male</td>\n"," <td>22.0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>A/5 21171</td>\n"," <td>7.2500</td>\n"," <td>NaN</td>\n"," <td>S</td>\n"," <td>train</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>1</td>\n"," <td>2</td>\n"," <td>1.0</td>\n"," <td>1</td>\n"," <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n"," <td>female</td>\n"," <td>38.0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>PC 17599</td>\n"," <td>71.2833</td>\n"," <td>C85</td>\n"," <td>C</td>\n"," <td>train</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>2</td>\n"," <td>3</td>\n"," <td>1.0</td>\n"," <td>3</td>\n"," <td>Heikkinen, Miss. Laina</td>\n"," <td>female</td>\n"," <td>26.0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>STON/O2. 3101282</td>\n"," <td>7.9250</td>\n"," <td>NaN</td>\n"," <td>S</td>\n"," <td>train</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>3</td>\n"," <td>4</td>\n"," <td>1.0</td>\n"," <td>1</td>\n"," <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n"," <td>female</td>\n"," <td>35.0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>113803</td>\n"," <td>53.1000</td>\n"," <td>C123</td>\n"," <td>S</td>\n"," <td>train</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>4</td>\n"," <td>5</td>\n"," <td>0.0</td>\n"," <td>3</td>\n"," <td>Allen, Mr. William Henry</td>\n"," <td>male</td>\n"," <td>35.0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>373450</td>\n"," <td>8.0500</td>\n"," <td>NaN</td>\n"," <td>S</td>\n"," <td>train</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" index PassengerId Survived Pclass ... Fare Cabin Embarked train_test\n","0 0 1 0.0 3 ... 7.2500 NaN S train\n","1 1 2 1.0 1 ... 71.2833 C85 C train\n","2 2 3 1.0 3 ... 7.9250 NaN S train\n","3 3 4 1.0 1 ... 53.1000 C123 S train\n","4 4 5 0.0 3 ... 8.0500 NaN S train\n","\n","[5 rows x 14 columns]"]},"metadata":{"tags":[]},"execution_count":3}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"8OtgE_1sVEp7","executionInfo":{"status":"ok","timestamp":1613092598911,"user_tz":300,"elapsed":60468,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"481186db-724d-4f30-cfc8-3389be05c584"},"source":["data.isnull().sum()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["index 0\n","PassengerId 0\n","Survived 418\n","Pclass 0\n","Name 0\n","Sex 0\n","Age 263\n","SibSp 0\n","Parch 0\n","Ticket 0\n","Fare 1\n","Cabin 1014\n","Embarked 2\n","train_test 0\n","dtype: int64"]},"metadata":{"tags":[]},"execution_count":4}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":142},"id":"1wfxoBVtVWvy","executionInfo":{"status":"ok","timestamp":1613092598912,"user_tz":300,"elapsed":60460,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"9691ff9a-de5f-4374-9046-bbe8ae0094d7"},"source":["(data[['Pclass', 'Survived']]\n"," .groupby(['Pclass'], as_index=False)\n"," .mean()\n"," .sort_values(by='Survived', ascending=False))"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>Pclass</th>\n"," <th>Survived</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>1</td>\n"," <td>0.629630</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>2</td>\n"," <td>0.472826</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>3</td>\n"," <td>0.242363</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" Pclass Survived\n","0 1 0.629630\n","1 2 0.472826\n","2 3 0.242363"]},"metadata":{"tags":[]},"execution_count":5}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":555},"id":"0NlFoQ9feWDt","executionInfo":{"status":"ok","timestamp":1613092600517,"user_tz":300,"elapsed":62057,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"45a03816-3195-4cd4-b8c3-717ce32d54eb"},"source":["data['Cabin'][data['Cabin'].isnull() == True] = \" \"\n","\n","def parse_cabin_letter(txt):\n"," cabin_letter = \"unknown\"\n"," if 'a' in txt.lower():\n"," cabin_letter = 'a'\n"," if 'b' in txt.lower():\n"," cabin_letter = 'b'\n"," if 'c' in txt.lower():\n"," cabin_letter = 'c'\n"," if 'd' in txt.lower():\n"," cabin_letter = 'd'\n"," if 'e' in txt.lower():\n"," cabin_letter = 'e'\n"," if 'f' in txt.lower():\n"," cabin_letter = 'f'\n"," if 'g' in txt.lower():\n"," cabin_letter = 'g'\n"," return cabin_letter\n","\n","data['cabin_letter'] = data['Cabin'].apply(lambda x: parse_cabin_letter(x)) \n","data['cabin_letter'].value_counts()\n","sns.factorplot('cabin_letter','Fare', col = 'Pclass', data=data)\n","plt.show()"],"execution_count":null,"outputs":[{"output_type":"stream","text":["/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n","A value is trying to be set on a copy of a slice from a DataFrame\n","\n","See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"," \"\"\"Entry point for launching an IPython kernel.\n","/usr/local/lib/python3.6/dist-packages/seaborn/categorical.py:3714: UserWarning: The `factorplot` function has been renamed to `catplot`. The original name will be removed in a future release. Please update your code. Note that the default `kind` in `factorplot` (`'point'`) has changed `'strip'` in `catplot`.\n"," warnings.warn(msg)\n","/usr/local/lib/python3.6/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.\n"," FutureWarning\n"],"name":"stderr"},{"output_type":"display_data","data":{"image/png":"\n","text/plain":["<Figure size 1080x360 with 3 Axes>"]},"metadata":{"tags":[]}}]},{"cell_type":"markdown","metadata":{"id":"i8mLg6-V26ac"},"source":["## Parse out additional information"]},{"cell_type":"markdown","metadata":{"id":"dwAXlONO-qOk"},"source":["Marital status"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"3PPoVQBh35iz","executionInfo":{"status":"ok","timestamp":1613092600517,"user_tz":300,"elapsed":62047,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"f5b0582d-58aa-42ea-c6fd-0c27ef3d5230"},"source":["import re\n","def parse_surname(txt):\n"," marital_status = \"unknown\"\n"," if 'mrs.' in txt.lower():\n"," marital_status = 'mrs'\n"," if 'mr.' in txt.lower():\n"," marital_status = 'mr' \n"," if 'miss.' in txt.lower():\n"," marital_status = 'miss' \n"," if 'master.' in txt.lower():\n"," marital_status = 'master' \n"," return marital_status\n","\n","data['surname'] = data['Name'].apply(lambda x: parse_surname(x)) \n","\n","died_mrs = len(data[(data['surname'] == 'mrs') & (data['Survived'] == 0) & (data['train_test'] == 'train')])\n","mrs = len(data[(data['surname'] == 'mrs') & (data['train_test'] == 'train')])\n","death_rate_mrs = died_mrs/mrs\n","print(\"Total Mrs. Surname: \" + str(mrs) + \"\\n\" + \"Mrs. Death Rate: \" + str(death_rate_mrs) + \"\\n\")\n","\n","died_mr = len(data[(data['surname'] == 'mr') & (data['Survived'] == 0) & (data['train_test'] == 'train')])\n","mr = len(data[(data['surname'] == 'mr') & (data['train_test'] == 'train')])\n","death_rate_mr = died_mr/mr\n","print(\"Total Mr. Surname: \" + str(mr) + \"\\n\" + \"Mr. Death Rate: \" + str(death_rate_mr) + \"\\n\")\n","\n","died_miss = len(data[(data['surname'] == 'miss') & (data['Survived'] == 0) & (data['train_test'] == 'train')])\n","miss = len(data[(data['surname'] == 'miss') & (data['train_test'] == 'train')])\n","death_rate_miss = died_miss/miss\n","print(\"Total Miss. Surname: \" + str(miss) + \"\\n\" + \"Miss Death Rate: \" + str(death_rate_miss) + \"\\n\")\n","\n","died_master = len(data[(data['surname'] == 'master') & (data['Survived'] == 0) & (data['train_test'] == 'train')])\n","master = len(data[(data['surname'] == 'master') & (data['train_test'] == 'train')])\n","death_rate_master = died_master/master\n","print(\"Total Master Surname: \" + str(master) + \"\\n\" + \"Master Death Rate: \" + str(death_rate_master))"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Total Mrs. Surname: 125\n","Mrs. Death Rate: 0.208\n","\n","Total Mr. Surname: 517\n","Mr. Death Rate: 0.8433268858800773\n","\n","Total Miss. Surname: 182\n","Miss Death Rate: 0.3021978021978022\n","\n","Total Master Surname: 40\n","Master Death Rate: 0.425\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"WWh0VekxQXiu","executionInfo":{"status":"ok","timestamp":1613092600518,"user_tz":300,"elapsed":62041,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"846378e0-a4f3-473c-8a83-f8873765afe8"},"source":["died_male = len(data[(data['Sex'] == 'male') & (data['Survived'] == 0) & (data['train_test'] == 'train')])\n","male = len(data[(data['Sex'] == 'male') & (data['train_test'] == 'train')])\n","death_rate_male = died_male/male\n","print(\"Total Males: \" + str(male) + \"\\n\" + \"Male Death Rate: \" + str(death_rate_male) + \"\\n\")\n","\n","died_female = len(data[(data['Sex'] == 'female') & (data['Survived'] == 0) & (data['train_test'] == 'train')])\n","female = len(data[(data['Sex'] == 'female') & (data['train_test'] == 'train')])\n","death_rate_female = died_female/female\n","print(\"Total Females: \" + str(female) + \"\\n\" + \"Female Death Rate: \" + str(death_rate_female))"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Total Males: 577\n","Male Death Rate: 0.8110918544194108\n","\n","Total Females: 314\n","Female Death Rate: 0.25796178343949044\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"v0MDfY3vL9G1"},"source":["Family size"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":390},"id":"5Conos0sL-gd","executionInfo":{"status":"ok","timestamp":1613092600519,"user_tz":300,"elapsed":62035,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"b5aa56e1-cbb4-4afd-f3c0-3c27a084b74e"},"source":["data['family_size'] = data['SibSp'].astype(int) + data['Parch'].astype(int) + 1\n","# The Sage family apparently has 11 people on the cruise\n","# The father is Mr. John George Sage since he lists 9 children\n","# The mother is Msr. Annie Bullen Sage since she lists 9 children\n","# lots of the kids use a surname of Mr. and only one person has an age listed\n","data[['Name','family_size', 'SibSp', 'Parch', 'Age']][data['family_size'] == 11]"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>Name</th>\n"," <th>family_size</th>\n"," <th>SibSp</th>\n"," <th>Parch</th>\n"," <th>Age</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>159</th>\n"," <td>Sage, Master. Thomas Henry</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>180</th>\n"," <td>Sage, Miss. Constance Gladys</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>201</th>\n"," <td>Sage, Mr. Frederick</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>324</th>\n"," <td>Sage, Mr. George John Jr</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>792</th>\n"," <td>Sage, Miss. Stella Anna</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>846</th>\n"," <td>Sage, Mr. Douglas Bullen</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>863</th>\n"," <td>Sage, Miss. Dorothy Edith \"Dolly\"</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>1079</th>\n"," <td>Sage, Miss. Ada</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>1233</th>\n"," <td>Sage, Mr. John George</td>\n"," <td>11</td>\n"," <td>1</td>\n"," <td>9</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>1251</th>\n"," <td>Sage, Master. William Henry</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>14.5</td>\n"," </tr>\n"," <tr>\n"," <th>1256</th>\n"," <td>Sage, Mrs. John (Annie Bullen)</td>\n"," <td>11</td>\n"," <td>1</td>\n"," <td>9</td>\n"," <td>NaN</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" Name family_size SibSp Parch Age\n","159 Sage, Master. Thomas Henry 11 8 2 NaN\n","180 Sage, Miss. Constance Gladys 11 8 2 NaN\n","201 Sage, Mr. Frederick 11 8 2 NaN\n","324 Sage, Mr. George John Jr 11 8 2 NaN\n","792 Sage, Miss. Stella Anna 11 8 2 NaN\n","846 Sage, Mr. Douglas Bullen 11 8 2 NaN\n","863 Sage, Miss. Dorothy Edith \"Dolly\" 11 8 2 NaN\n","1079 Sage, Miss. Ada 11 8 2 NaN\n","1233 Sage, Mr. John George 11 1 9 NaN\n","1251 Sage, Master. William Henry 11 8 2 14.5\n","1256 Sage, Mrs. John (Annie Bullen) 11 1 9 NaN"]},"metadata":{"tags":[]},"execution_count":9}]},{"cell_type":"markdown","metadata":{"id":"DqJv5L3U-s1f"},"source":["## Impute missing values"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"jGQ0fDjNXxAK","executionInfo":{"status":"ok","timestamp":1613092600519,"user_tz":300,"elapsed":62028,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"5de7d709-8c4f-44b9-dd60-960769d6ae3e"},"source":["data.groupby('surname')['Age'].mean()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["surname\n","master 5.482642\n","miss 21.774238\n","mr 32.252151\n","mrs 36.994118\n","unknown 42.656250\n","Name: Age, dtype: float64"]},"metadata":{"tags":[]},"execution_count":10}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"05--52Qn3Bim","executionInfo":{"status":"ok","timestamp":1613092600520,"user_tz":300,"elapsed":62021,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"4e2fda79-d805-482c-8997-120bab95a5a2"},"source":["print(\"Missing ages: \" + str(sum(data['Age'].isnull())))\n","data['Age'] = data['Age'].astype(float)\n","data.loc[(data.Age.isnull())&(data.surname=='mr'),'Age']=33\n","data.loc[(data.Age.isnull())&(data.surname=='mrs'),'Age']=36\n","data.loc[(data.Age.isnull())&(data.surname=='master'),'Age']=5\n","data.loc[(data.Age.isnull())&(data.surname=='miss'),'Age']=22\n","data.loc[(data.Age.isnull())&(data.surname=='unknown'),'Age']=46\n","print(\"Missing ages: \" + str(sum(data['Age'].isnull())))\n","#sns.catplot(x='Survived', y=\"Age\", kind=\"box\", dodge=False, height = 5, aspect = 1.5,data=data);"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Missing ages: 263\n","Missing ages: 0\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":390},"id":"zI6bMe7MOGdu","executionInfo":{"status":"ok","timestamp":1613092600521,"user_tz":300,"elapsed":62015,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"83f3926b-c153-4868-d90e-88c58569cfdb"},"source":["data[['Name','family_size', 'SibSp', 'Parch', 'Age']][data['family_size'] == 11]"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>Name</th>\n"," <th>family_size</th>\n"," <th>SibSp</th>\n"," <th>Parch</th>\n"," <th>Age</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>159</th>\n"," <td>Sage, Master. Thomas Henry</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>5.0</td>\n"," </tr>\n"," <tr>\n"," <th>180</th>\n"," <td>Sage, Miss. Constance Gladys</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>22.0</td>\n"," </tr>\n"," <tr>\n"," <th>201</th>\n"," <td>Sage, Mr. Frederick</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>33.0</td>\n"," </tr>\n"," <tr>\n"," <th>324</th>\n"," <td>Sage, Mr. George John Jr</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>33.0</td>\n"," </tr>\n"," <tr>\n"," <th>792</th>\n"," <td>Sage, Miss. Stella Anna</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>22.0</td>\n"," </tr>\n"," <tr>\n"," <th>846</th>\n"," <td>Sage, Mr. Douglas Bullen</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>33.0</td>\n"," </tr>\n"," <tr>\n"," <th>863</th>\n"," <td>Sage, Miss. Dorothy Edith \"Dolly\"</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>22.0</td>\n"," </tr>\n"," <tr>\n"," <th>1079</th>\n"," <td>Sage, Miss. Ada</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>22.0</td>\n"," </tr>\n"," <tr>\n"," <th>1233</th>\n"," <td>Sage, Mr. John George</td>\n"," <td>11</td>\n"," <td>1</td>\n"," <td>9</td>\n"," <td>33.0</td>\n"," </tr>\n"," <tr>\n"," <th>1251</th>\n"," <td>Sage, Master. William Henry</td>\n"," <td>11</td>\n"," <td>8</td>\n"," <td>2</td>\n"," <td>14.5</td>\n"," </tr>\n"," <tr>\n"," <th>1256</th>\n"," <td>Sage, Mrs. John (Annie Bullen)</td>\n"," <td>11</td>\n"," <td>1</td>\n"," <td>9</td>\n"," <td>36.0</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" Name family_size SibSp Parch Age\n","159 Sage, Master. Thomas Henry 11 8 2 5.0\n","180 Sage, Miss. Constance Gladys 11 8 2 22.0\n","201 Sage, Mr. Frederick 11 8 2 33.0\n","324 Sage, Mr. George John Jr 11 8 2 33.0\n","792 Sage, Miss. Stella Anna 11 8 2 22.0\n","846 Sage, Mr. Douglas Bullen 11 8 2 33.0\n","863 Sage, Miss. Dorothy Edith \"Dolly\" 11 8 2 22.0\n","1079 Sage, Miss. Ada 11 8 2 22.0\n","1233 Sage, Mr. John George 11 1 9 33.0\n","1251 Sage, Master. William Henry 11 8 2 14.5\n","1256 Sage, Mrs. John (Annie Bullen) 11 1 9 36.0"]},"metadata":{"tags":[]},"execution_count":12}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"jWuSOix4HFtT","executionInfo":{"status":"ok","timestamp":1613092600521,"user_tz":300,"elapsed":62008,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"c04c1b74-6aa9-45f6-ee9b-f3abb78b9794"},"source":["print(\"Missing Embarked Values: \" + str(sum(data['Embarked'].isnull())))\n","# replace with most common value since there are only 2 values missing\n","data['Embarked'].value_counts()\n","data['Embarked'].fillna(\"S\", inplace=True)\n","print(\"Missing Embarked Values: \" + str(sum(data['Embarked'].isnull())))"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Missing Embarked Values: 2\n","Missing Embarked Values: 0\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"h2jNwCxDLQ-j","executionInfo":{"status":"ok","timestamp":1613092600522,"user_tz":300,"elapsed":62003,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"5e606ef5-f3c4-4117-b5de-540045bd1d7f"},"source":["print(\"Missing Fares: \" + str(sum(data['Fare'].isnull())))\n","data['Fare'] = data['Fare'].astype(float)\n","data['Fare_median'] = data.groupby(['Pclass'])['Fare'].transform('median')\n","data['Fare'] = np.where(data['Fare'].isnull(), data['Fare_median'], data['Fare'])\n","print(\"Missing Fares: \" + str(sum(data['Fare'].isnull())))"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Missing Fares: 1\n","Missing Fares: 0\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"qX72fexvLmm_"},"source":["## Bin continuous variables into categories"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"pCRQTVTjLql5","executionInfo":{"status":"ok","timestamp":1613092600523,"user_tz":300,"elapsed":61997,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"de477675-0d5e-43e8-eb66-cfed0bc7df3a"},"source":["bins = [0, 15, 25, 35, 60, 81] # binning\n","bin_names = ['child','young','adult','middle','senior'] # label tagging\n","data['Age_bin'] = pd.cut(data['Age'],\n"," bins = bins,\n"," labels=bin_names,\n"," include_lowest = True)\n","data['Age_bin'].value_counts()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["adult 457\n","young 378\n","middle 318\n","child 123\n","senior 33\n","Name: Age_bin, dtype: int64"]},"metadata":{"tags":[]},"execution_count":15}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"MB2B8lS0ZGXa","executionInfo":{"status":"ok","timestamp":1613092600886,"user_tz":300,"elapsed":62354,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"116d867d-0773-4d67-9d9b-9e6aeced31f0"},"source":["data['alone'] = False\n","data['alone'][data['family_size'] == 1] = True\n","data['alone'].value_counts()"],"execution_count":null,"outputs":[{"output_type":"stream","text":["/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n","A value is trying to be set on a copy of a slice from a DataFrame\n","\n","See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"," \n"],"name":"stderr"},{"output_type":"execute_result","data":{"text/plain":["True 790\n","False 519\n","Name: alone, dtype: int64"]},"metadata":{"tags":[]},"execution_count":16}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"3rn74OFaZs8W","executionInfo":{"status":"ok","timestamp":1613092600887,"user_tz":300,"elapsed":62351,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"7935a36c-bcc0-4c7f-b18a-cd3619d12135"},"source":["bins = [0, 8, 30, 100] # binning\n","bin_names = ['economy','business','first_class'] # label tagging\n","data['fare_bin'] = pd.cut(data['Fare'],\n"," bins = bins,\n"," labels=bin_names,\n"," include_lowest = True)\n","data['fare_bin'].value_counts()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["business 606\n","economy 360\n","first_class 259\n","Name: fare_bin, dtype: int64"]},"metadata":{"tags":[]},"execution_count":17}]},{"cell_type":"markdown","metadata":{"id":"8uGStVSmL61d"},"source":["## One-hot encode categorical variables"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"WXIjEWOsRfMe","executionInfo":{"status":"ok","timestamp":1613092600888,"user_tz":300,"elapsed":62348,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"069af06e-6bd7-4ef7-dae5-63ac721e55b3"},"source":["# One-hot encode the categorical variables \n","cat_vars=['Age_bin', 'surname', 'Embarked', 'Sex', 'Pclass', 'alone', 'fare_bin']\n","for var in cat_vars:\n"," cat_list='var'+'_'+var\n"," cat_list = pd.get_dummies(data[var], prefix=var)\n"," data1=data.join(cat_list)\n"," data=data1\n","\n","data_vars=data.columns.values.tolist()\n","to_keep=[i for i in data_vars if i not in cat_vars]\n","\n","data_final=data[to_keep]\n","data_final.columns.values"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["array(['index', 'PassengerId', 'Survived', 'Name', 'Age', 'SibSp',\n"," 'Parch', 'Ticket', 'Fare', 'Cabin', 'train_test', 'cabin_letter',\n"," 'family_size', 'Fare_median', 'Age_bin_child', 'Age_bin_young',\n"," 'Age_bin_adult', 'Age_bin_middle', 'Age_bin_senior',\n"," 'surname_master', 'surname_miss', 'surname_mr', 'surname_mrs',\n"," 'surname_unknown', 'Embarked_C', 'Embarked_Q', 'Embarked_S',\n"," 'Sex_female', 'Sex_male', 'Pclass_1', 'Pclass_2', 'Pclass_3',\n"," 'alone_False', 'alone_True', 'fare_bin_economy',\n"," 'fare_bin_business', 'fare_bin_first_class'], dtype=object)"]},"metadata":{"tags":[]},"execution_count":18}]},{"cell_type":"markdown","metadata":{"id":"B5NaeogAVFyF"},"source":["## Scale numeric values"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"XEwxI7KuVE5L","executionInfo":{"status":"ok","timestamp":1613092600888,"user_tz":300,"elapsed":62343,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"58ce28ef-edf9-477a-b8e6-e759aca8c555"},"source":["import sklearn.preprocessing\n","from sklearn.preprocessing import StandardScaler\n","from sklearn.preprocessing import MinMaxScaler\n","sc = StandardScaler()\n","\n","numeric_vars = ['family_size', 'Age', 'Fare']\n","\n","# #only standardize numerical features\n","features=data_final[numeric_vars]\n","features_standard=StandardScaler().fit_transform(features)# Gaussian Standardisation\n","temp=pd.DataFrame(features_standard,columns=numeric_vars)\n","#temp=pd.DataFrame(features,columns=numeric_vars)\n","temp.head()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>family_size</th>\n"," <th>Age</th>\n"," <th>Fare</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>0.073352</td>\n"," <td>-0.605528</td>\n"," <td>-0.503176</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>0.073352</td>\n"," <td>0.605797</td>\n"," <td>0.734809</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>-0.558346</td>\n"," <td>-0.302696</td>\n"," <td>-0.490126</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>0.073352</td>\n"," <td>0.378674</td>\n"," <td>0.383263</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>-0.558346</td>\n"," <td>0.378674</td>\n"," <td>-0.487709</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" family_size Age Fare\n","0 0.073352 -0.605528 -0.503176\n","1 0.073352 0.605797 0.734809\n","2 -0.558346 -0.302696 -0.490126\n","3 0.073352 0.378674 0.383263\n","4 -0.558346 0.378674 -0.487709"]},"metadata":{"tags":[]},"execution_count":19}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"iE2NhbpmV1YI","executionInfo":{"status":"ok","timestamp":1613092600889,"user_tz":300,"elapsed":62339,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"2f463e58-fd2e-426a-bbcf-d3101c231e82"},"source":["cat_data = data_final[['Survived', 'train_test', 'Age_bin_child', 'Pclass_1', 'Pclass_2', 'Pclass_3',\n"," 'Age_bin_young', 'Age_bin_adult', 'Age_bin_middle', 'Age_bin_senior',\n"," 'surname_master', 'surname_miss', 'surname_mr', 'surname_mrs', \n"," 'surname_unknown', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Sex_female', 'Sex_male',\n"," 'alone_False', 'alone_True', 'fare_bin_economy', 'fare_bin_business', 'fare_bin_first_class']]\n","data_final_after_standardizing = pd.concat([cat_data.reset_index(drop=True), temp], axis=1)\n","data_final = data_final_after_standardizing\n","data_final.head()"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>Survived</th>\n"," <th>train_test</th>\n"," <th>Age_bin_child</th>\n"," <th>Pclass_1</th>\n"," <th>Pclass_2</th>\n"," <th>Pclass_3</th>\n"," <th>Age_bin_young</th>\n"," <th>Age_bin_adult</th>\n"," <th>Age_bin_middle</th>\n"," <th>Age_bin_senior</th>\n"," <th>surname_master</th>\n"," <th>surname_miss</th>\n"," <th>surname_mr</th>\n"," <th>surname_mrs</th>\n"," <th>surname_unknown</th>\n"," <th>Embarked_C</th>\n"," <th>Embarked_Q</th>\n"," <th>Embarked_S</th>\n"," <th>Sex_female</th>\n"," <th>Sex_male</th>\n"," <th>alone_False</th>\n"," <th>alone_True</th>\n"," <th>fare_bin_economy</th>\n"," <th>fare_bin_business</th>\n"," <th>fare_bin_first_class</th>\n"," <th>family_size</th>\n"," <th>Age</th>\n"," <th>Fare</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>0.0</td>\n"," <td>train</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0.073352</td>\n"," <td>-0.605528</td>\n"," <td>-0.503176</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>1.0</td>\n"," <td>train</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0.073352</td>\n"," <td>0.605797</td>\n"," <td>0.734809</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>1.0</td>\n"," <td>train</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>-0.558346</td>\n"," <td>-0.302696</td>\n"," <td>-0.490126</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>1.0</td>\n"," <td>train</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0.073352</td>\n"," <td>0.378674</td>\n"," <td>0.383263</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>0.0</td>\n"," <td>train</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>1</td>\n"," <td>0</td>\n"," <td>-0.558346</td>\n"," <td>0.378674</td>\n"," <td>-0.487709</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" Survived train_test Age_bin_child ... family_size Age Fare\n","0 0.0 train 0 ... 0.073352 -0.605528 -0.503176\n","1 1.0 train 0 ... 0.073352 0.605797 0.734809\n","2 1.0 train 0 ... -0.558346 -0.302696 -0.490126\n","3 1.0 train 0 ... 0.073352 0.378674 0.383263\n","4 0.0 train 0 ... -0.558346 0.378674 -0.487709\n","\n","[5 rows x 28 columns]"]},"metadata":{"tags":[]},"execution_count":20}]},{"cell_type":"markdown","metadata":{"id":"xWOmvxLrXDOw"},"source":["## Recursive Feature Elimination"]},{"cell_type":"code","metadata":{"id":"gYwpgvZ_XRSO"},"source":["X_train = data_final[data_final['train_test'] == \"train\"]\n","X_train = X_train.loc[:, X_train.columns != 'Survived']\n","X_train = X_train.loc[:, X_train.columns != 'train_test']\n","\n","y_train = data_final[data_final['train_test'] == \"train\"]\n","y_train = y_train.loc[:, y_train.columns == 'Survived']"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Q056M40IXCOy","executionInfo":{"status":"ok","timestamp":1613092602160,"user_tz":300,"elapsed":63603,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"3ebd8ab0-11c5-40a6-f959-4d1633585f1d"},"source":["import warnings\n","from sklearn.feature_selection import RFE\n","from sklearn.linear_model import LogisticRegression\n","warnings.filterwarnings('ignore')\n","\n","data_final_vars=data_final.columns.values.tolist()\n","logreg = LogisticRegression()\n","rfe = RFE(logreg, 16)\n","rfe = rfe.fit(X_train, y_train.values.ravel())\n","print(rfe.support_)\n","print(rfe.ranking_)\n","\n","from itertools import compress\n","cols = list(compress(X_train.columns, rfe.support_))\n","X=X_train[cols]\n","y=y_train['Survived']\n","\n","# Implement the model\n","import statsmodels.api as sm\n","logit_model=sm.Logit(y,X)\n","result=logit_model.fit()\n","print(result.summary2())"],"execution_count":null,"outputs":[{"output_type":"stream","text":["[False True False True True False False False True True True True\n"," True False False True True True False True False False True True\n"," True True]\n","[ 6 1 10 1 1 7 11 4 1 1 1 1 1 8 9 1 1 1 3 1 2 5 1 1\n"," 1 1]\n","Optimization terminated successfully.\n"," Current function value: 0.402754\n"," Iterations 19\n"," Results: Logit\n","=======================================================================================\n","Model: Logit Pseudo R-squared: 0.395 \n","Dependent Variable: Survived AIC: 747.7080 \n","Date: 2021-02-12 01:16 BIC: 819.5932 \n","No. Observations: 891 Log-Likelihood: -358.85 \n","Df Model: 14 LL-Null: -593.33 \n","Df Residuals: 876 LLR p-value: 3.4984e-91\n","Converged: 1.0000 Scale: 1.0000 \n","No. Iterations: 19.0000 \n","---------------------------------------------------------------------------------------\n"," Coef. Std.Err. z P>|z| [0.025 0.975] \n","---------------------------------------------------------------------------------------\n","Pclass_1 0.8267 0.3461 2.3883 0.0169 0.1483 1.5051\n","Pclass_3 -1.0152 0.2563 -3.9610 0.0001 -1.5175 -0.5128\n","Age_bin_young -0.3577 0.2468 -1.4495 0.1472 -0.8414 0.1260\n","surname_master 33.2789 18908497.6617 0.0000 1.0000 -37059941.1399 37060007.6976\n","surname_miss -41.2830 nan nan nan nan nan\n","surname_mr 30.1523 18908497.6617 0.0000 1.0000 -37059944.2665 37060004.5710\n","surname_mrs -40.4684 nan nan nan nan nan\n","surname_unknown 30.1393 18908497.6617 0.0000 1.0000 -37059944.2794 37060004.5580\n","Embarked_S -0.3726 0.2173 -1.7150 0.0863 -0.7985 0.0532\n","Sex_female 43.0897 nan nan nan nan nan\n","Sex_male -31.2708 18908497.6617 -0.0000 1.0000 -37060005.6895 37059943.1480\n","alone_True -0.3482 0.3071 -1.1338 0.2569 -0.9501 0.2537\n","fare_bin_first_class 0.4847 0.2806 1.7275 0.0841 -0.0652 1.0346\n","family_size -0.9611 0.1801 -5.3373 0.0000 -1.3141 -0.6082\n","Age -0.4611 0.1407 -3.2772 0.0010 -0.7369 -0.1853\n","Fare 0.2208 0.1388 1.5906 0.1117 -0.0513 0.4928\n","=======================================================================================\n","\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"oFeduKzxaaNN"},"source":["# remove_cols = ['Sex_male', 'surname_mr', 'surname_master', 'surname_unknown']\n","# new_cols = [x for x in cols if x not in remove_cols]\n","# X=X_train[new_cols]\n","# y=y_train['Survived']\n","# logit_model=sm.Logit(y,X)\n","# result=logit_model.fit()\n","# print(result.summary2())"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Yptr-0gUawb6","executionInfo":{"status":"ok","timestamp":1613092602162,"user_tz":300,"elapsed":63597,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"27ab30fb-59da-49df-f692-1f6574e8c695"},"source":["from sklearn.linear_model import LogisticRegression\n","from sklearn import metrics\n","logreg = LogisticRegression()\n","logreg.fit(X, y)\n","X.columns"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["Index(['Pclass_1', 'Pclass_3', 'Age_bin_young', 'surname_master',\n"," 'surname_miss', 'surname_mr', 'surname_mrs', 'surname_unknown',\n"," 'Embarked_S', 'Sex_female', 'Sex_male', 'alone_True',\n"," 'fare_bin_first_class', 'family_size', 'Age', 'Fare'],\n"," dtype='object')"]},"metadata":{"tags":[]},"execution_count":24}]},{"cell_type":"code","metadata":{"id":"JBz7mo3fbAzx"},"source":["X_test = data_final[data_final['train_test'] == \"test\"]\n","X_test = X_test[cols].reset_index(drop = True)\n","#y_test = data_final[data_final['train_test'] == \"test\"]\n","#y_test = y_test.loc[:, y_test.columns == 'Survived']"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"VagUFjzTa7sQ"},"source":["y_pred = logreg.predict(X_test)\n","submission1 = pd.DataFrame({\"PassengerId\": test['PassengerId'].tolist(),\n"," \"Survived\": y_pred})\n","submission1.to_csv(data_dir + \"/koban_submission1.csv\")"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"s335sKrYxjrP"},"source":["# First submission was a hot mess inside a dumpster fire. I ranked in the bottom 20% of submissions. This time we will do model selection using the training data for test and train sets."]},{"cell_type":"code","metadata":{"id":"kTEoCmQZxe3Q"},"source":["X_train = data_final[data_final['train_test'] == \"train\"]\n","X_train = X_train.loc[:, X_train.columns != 'Survived']\n","X_train = X_train.loc[:, X_train.columns != 'train_test']\n","\n","y_train = data_final[data_final['train_test'] == \"train\"]\n","y_train = y_train.loc[:, y_train.columns == 'Survived']\n","\n","X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train, y_train, test_size=0.2, random_state=0)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"CQsCHNIj2bau","executionInfo":{"status":"ok","timestamp":1613092603205,"user_tz":300,"elapsed":64614,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"d557c57b-0743-4cfb-a3ac-3bdcb7ad56c6"},"source":["logreg = LogisticRegression()\n","rfe = RFE(logreg, 16)\n","rfe = rfe.fit(X_train2, y_train2.values.ravel())\n","print(rfe.support_)\n","print(rfe.ranking_)\n","\n","cols = list(compress(X_train.columns, rfe.support_))\n","X=X_train[cols]\n","y=y_train['Survived']\n","\n","# Implement the model\n","import statsmodels.api as sm\n","logit_model=sm.Logit(y,X)\n","result=logit_model.fit()\n","print(result.summary2())"],"execution_count":null,"outputs":[{"output_type":"stream","text":["[False True False True True False False False True True True True\n"," True False False True True True True False True False True True\n"," True False]\n","[10 1 8 1 1 9 4 11 1 1 1 1 1 7 6 1 1 1 1 5 1 3 1 1\n"," 1 2]\n","Optimization terminated successfully.\n"," Current function value: 0.403995\n"," Iterations 20\n"," Results: Logit\n","=======================================================================================\n","Model: Logit Pseudo R-squared: 0.393 \n","Dependent Variable: Survived AIC: 749.9195 \n","Date: 2021-02-12 01:16 BIC: 821.8046 \n","No. Observations: 891 Log-Likelihood: -359.96 \n","Df Model: 14 LL-Null: -593.33 \n","Df Residuals: 876 LLR p-value: 1.0276e-90\n","Converged: 1.0000 Scale: 1.0000 \n","No. Iterations: 20.0000 \n","---------------------------------------------------------------------------------------\n"," Coef. Std.Err. z P>|z| [0.025 0.975] \n","---------------------------------------------------------------------------------------\n","Pclass_1 1.0840 0.3108 3.4872 0.0005 0.4747 1.6932\n","Pclass_3 -1.1732 0.2786 -4.2112 0.0000 -1.7193 -0.6272\n","Age_bin_young -0.3474 0.2472 -1.4056 0.1598 -0.8319 0.1370\n","surname_master 56.4272 8074028.1653 0.0000 1.0000 -15824747.9869 15824860.8413\n","surname_miss -72.4301 12498303.6419 -0.0000 1.0000 -24496297.4361 24496152.5759\n","surname_mr 53.3320 8074028.1653 0.0000 1.0000 -15824751.0821 15824857.7461\n","surname_mrs -71.6076 12498303.6419 -0.0000 1.0000 -24496296.6136 24496153.3984\n","surname_unknown 53.2466 8074028.1653 0.0000 1.0000 -15824751.1676 15824857.6607\n","Embarked_S -0.4045 0.2152 -1.8796 0.0602 -0.8263 0.0173\n","Sex_female 73.8304 12498303.6419 0.0000 1.0000 -24496151.1757 24496298.8364\n","Sex_male -54.8623 8074028.1653 -0.0000 1.0000 -15824859.2765 15824749.5518\n","alone_False 0.4389 0.3094 1.4184 0.1561 -0.1676 1.0454\n","fare_bin_economy 0.2492 0.2903 0.8585 0.3906 -0.3197 0.8181\n","fare_bin_first_class 0.4265 0.2791 1.5281 0.1265 -0.1205 0.9736\n","family_size -0.8862 0.1740 -5.0932 0.0000 -1.2272 -0.5451\n","Age -0.4702 0.1406 -3.3437 0.0008 -0.7458 -0.1946\n","=======================================================================================\n","\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"CrDttQMWyh5i","executionInfo":{"status":"ok","timestamp":1613092603425,"user_tz":300,"elapsed":64828,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"8dae646a-c4db-4bc0-dcfa-d3d0b2cdee7b"},"source":["from sklearn.linear_model import LogisticRegression\n","from sklearn import metrics\n","logreg = LogisticRegression()\n","logreg.fit(X, y)\n","y_pred = logreg.predict(X_test2[cols])\n","\n","print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test2[cols], y_test2)))\n","\n","cm = metrics.confusion_matrix(y_test2, y_pred)\n","true_pos = cm[1,1]\n","true_neg = cm[0,0]\n","false_pos = cm[0,1]\n","false_neg = cm[1,0]\n","precision = true_pos/(true_pos + false_pos)\n","recall = true_pos/(true_pos + false_neg)\n","\n","print(\"When we check precision of our model against the test data set, \" + str(len(y_test2)) + \" passengers\")\n","print(\"\")\n","print(\"Precision - We predicted survival \" + str(cm[1,1] + cm[0,1]) + \" times and were correct \" + str(cm[1,1]) + \" times: \" + str(cm[1,1]/(cm[1,1] + cm[0,1]))[0:5])\n","print(\"Recall - We predicted \" + str(cm[1,1]) + \" out of the \" + str(cm[1,1] + cm[1,0]) + \" surviving passengers: \" + str(recall)[0:5])\n","print(\"\")\n","print(\"Our total accuracy was \" + str((cm[0,0] + cm[1,1])/len(y_test2))[0:5])\n","print(\"Our F1 score was \" + str(2*(precision*recall)/(precision+recall))[0:5])"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Accuracy of logistic regression classifier on test set: 0.83\n","When we check precision of our model against the test data set, 179 passengers\n","\n","Precision - We predicted survival 70 times and were correct 54 times: 0.771\n","Recall - We predicted 54 out of the 69 surviving passengers: 0.782\n","\n","Our total accuracy was 0.826\n","Our F1 score was 0.776\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"ScqylR9I4hl_"},"source":["## Logistic Regression classifier still not performing, so we will try a Random Forest"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"VKz1-_nZ4g23","executionInfo":{"status":"ok","timestamp":1613092691438,"user_tz":300,"elapsed":14720,"user":{"displayName":"Donald Koban","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgIl_q-klTdMSVMcpQ2RqU9YBN_aDPqg2-7Pd4=s64","userId":"12205738029019728376"}},"outputId":"a8e4903c-8f7b-40ad-944e-ee5d3fe86c8e"},"source":["from sklearn.model_selection import cross_val_score, cross_val_predict, KFold\n","from sklearn.ensemble import RandomForestClassifier\n","from statistics import mean\n","from numpy import std\n","\n","cv = KFold(n_splits=10, random_state = 1, shuffle = False)\n","rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion=\"gini\")\n","features = vars\n","accuracy = cross_val_score(rf, X_train, y_train, scoring = 'accuracy', cv = cv)\n","accuracy = list(accuracy)\n","f1_scores = cross_val_score(rf, X_train, y_train, scoring = 'f1', cv = cv)\n","f1_scores = list(f1_scores)\n","precision_scores = cross_val_score(rf, X_train, y_train, scoring = 'precision', cv = cv)\n","precision_scores = list(precision_scores)\n","recall_scores = cross_val_score(rf, X_train, y_train, scoring = 'recall', cv = cv)\n","recall_scores = list(recall_scores)\n","\n","print('accuracy score: ' + str(mean(accuracy))[0:5] + \" +/- \" + str(std(accuracy))[0:5])\n","print('f1 score: ' + str(mean(f1_scores))[0:5] + \" +/- \" + str(std(f1_scores))[0:5])\n","print('precision: ' + str(mean(precision_scores))[0:5] + \" +/- \" + str(std(precision_scores))[0:5])\n","print('recall: '+ str(mean(recall_scores))[0:5] + \" +/- \" + str(std(recall_scores))[0:5])"],"execution_count":null,"outputs":[{"output_type":"stream","text":["accuracy score: 0.799 +/- 0.041\n","f1 score: 0.737 +/- 0.053\n","precision: 0.756 +/- 0.067\n","recall: 0.715 +/- 0.084\n"],"name":"stdout"}]}]}