{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Supervised Machine Learning basics: Titanic example"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Florent Leclercq,
\n",
"Imperial Centre for Inference and Cosmology, Imperial College London,
\n",
"florent.leclercq@polytechnique.org"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.ensemble import RandomForestClassifier"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load training data set"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# data set available at this address: https://www.kaggle.com/c/titanic/data\n",
"# (version slightly modified to be conveniently loaded with numpy)\n",
"dtype = {'names':('PassengerId','Survived','Pclass','Name','Sex',\n",
" 'Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked'),\n",
" 'formats': ('i4','i4','i4','S20','S6','S20','i4','i4','S20','f8','S20','S20')}\n",
"data = np.loadtxt(\"data/titanic.csv\", dtype=dtype, delimiter=\";\", comments=\"#\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Data dictionary\n",
"\n",
"| Variable | Definition | Key |\n",
"|----------|--------------------------------------------|------------------------------------------------|\n",
"| Survived | Survival | 0 = No, 1 = Yes |\n",
"| Pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |\n",
"| Sex | Sex | |\n",
"| Age | Age in years | |\n",
"| SibSp | # of siblings / spouses aboard the Titanic | |\n",
"| Parch | # of parents / children aboard the Titanic | |\n",
"| Ticket | Ticket number | |\n",
"| Fare | Passenger fare | |\n",
"| Cabin | Cabin number | |\n",
"| Embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"(501, 0, 3, b'\"Calic, Mr. Petar\"', b'male', b'17', 0, 0, b'315086', 8.6625, b'', b'S')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[500]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(601, 1, 2, b'\"Jacobsohn, Mrs. Sid', b'female', b'24', 2, 1, b'243847', 27., b'', b'S')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[600]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def perform_splitting(condition):\n",
" selected_passengers = data[np.where(condition)]\n",
" Nsplit = selected_passengers.size\n",
" Nsurvived_split = np.sum(selected_passengers['Survived']==1)\n",
" Ndied_split = np.sum(selected_passengers['Survived']==0)\n",
" return Nsurvived_split, Ndied_split, Nsplit\n",
" \n",
"def entropy(Nsurvived, Ndied, Ntot):\n",
" assert(Nsurvived + Ndied == Ntot)\n",
" return -Nsurvived/Ntot*np.log2(Nsurvived/Ntot) -Ndied/Ntot*np.log2(Ndied/Ntot)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Parent Entropy"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ndied=549, Nsurvived=342, Ntot=891\n",
"H_parent=0.9607079018756469\n"
]
}
],
"source": [
"Ntot = data['Survived'].size\n",
"Nsurvived = np.sum(data['Survived']==1)\n",
"Ndied = np.sum(data['Survived']==0)\n",
"H_parent = entropy(Nsurvived, Ndied, Ntot)\n",
"print(\"Ndied={}, Nsurvived={}, Ntot={}\".format(Ndied,Nsurvived,Ntot))\n",
"print(\"H_parent={}\".format(H_parent))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Information gain: ticket class"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ndied_first=80, Nsurvived_first=136, Nfirst=216\n",
"H_first=0.9509560484549725\n"
]
}
],
"source": [
"Nsurvived_first, Ndied_first, Nfirst = perform_splitting(data['Pclass']==1)\n",
"H_first = entropy(Nsurvived_first, Ndied_first, Nfirst)\n",
"print(\"Ndied_first={}, Nsurvived_first={}, Nfirst={}\".format(Ndied_first,Nsurvived_first,Nfirst))\n",
"print(\"H_first={}\".format(H_first))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ndied_second=97, Nsurvived_second=87, Nsecond=184\n",
"H_second=0.9978683156711936\n"
]
}
],
"source": [
"Nsurvived_second, Ndied_second, Nsecond = perform_splitting(data['Pclass']==2)\n",
"H_second = entropy(Nsurvived_second, Ndied_second, Nsecond)\n",
"print(\"Ndied_second={}, Nsurvived_second={}, Nsecond={}\".format(Ndied_second,Nsurvived_second,Nsecond))\n",
"print(\"H_second={}\".format(H_second))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ndied_third=372, Nsurvived_third=119, Nthird=491\n",
"H_third=0.7989470522661535\n"
]
}
],
"source": [
"Nsurvived_third, Ndied_third, Nthird = perform_splitting(data['Pclass']==3)\n",
"H_third = entropy(Nsurvived_third, Ndied_third, Nthird)\n",
"print(\"Ndied_third={}, Nsurvived_third={}, Nthird={}\".format(Ndied_third,Nsurvived_third,Nthird))\n",
"print(\"H_third={}\".format(H_third))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"IG_class=0.0838310452960116\n"
]
}
],
"source": [
"H_class = Nfirst/Ntot*H_first + Nsecond/Ntot*H_second + Nthird/Ntot*H_third\n",
"IG_class = H_parent - H_class\n",
"print(\"IG_class={}\".format(IG_class))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Information gain: sex"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ndied_male=468, Nsurvived_male=109, Nmale=577\n",
"H_male=0.6991817891208407\n"
]
}
],
"source": [
"Nsurvived_male, Ndied_male, Nmale = perform_splitting(data['Sex']==b'male')\n",
"H_male = entropy(Nsurvived_male, Ndied_male, Nmale)\n",
"print(\"Ndied_male={}, Nsurvived_male={}, Nmale={}\".format(Ndied_male,Nsurvived_male,Nmale))\n",
"print(\"H_male={}\".format(H_male))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ndied_female=81, Nsurvived_female=233, Nfemale=314\n",
"H_female=0.8236550739295191\n"
]
}
],
"source": [
"Nsurvived_female, Ndied_female, Nfemale = perform_splitting(data['Sex']==b'female')\n",
"H_female = entropy(Nsurvived_female, Ndied_female, Nfemale)\n",
"print(\"Ndied_female={}, Nsurvived_female={}, Nfemale={}\".format(Ndied_female,Nsurvived_female,Nfemale))\n",
"print(\"H_female={}\".format(H_female))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"IG_sex=0.2176601066606142\n"
]
}
],
"source": [
"H_sex = Nmale/Ntot*H_male + Nfemale/Ntot*H_female\n",
"IG_sex = H_parent - H_sex\n",
"print(\"IG_sex={}\".format(IG_sex))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Information gain: port of embarkation"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ndied_cherbourg=75, Nsurvived_cherbourg=93, Ncherbourg=168\n",
"H_cherbourg=0.9917033083725818\n"
]
}
],
"source": [
"Nsurvived_cherbourg, Ndied_cherbourg, Ncherbourg = perform_splitting(data['Embarked']==b'C')\n",
"H_cherbourg = entropy(Nsurvived_cherbourg, Ndied_cherbourg, Ncherbourg)\n",
"print(\"Ndied_cherbourg={}, Nsurvived_cherbourg={}, Ncherbourg={}\"\n",
" .format(Ndied_cherbourg,Nsurvived_cherbourg,Ncherbourg))\n",
"print(\"H_cherbourg={}\".format(H_cherbourg))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ndied_queenstown=47, Nsurvived_queenstown=30, Nqueenstown=77\n",
"H_queenstown=0.9645476589143234\n"
]
}
],
"source": [
"Nsurvived_queenstown, Ndied_queenstown, Nqueenstown = perform_splitting(data['Embarked']==b'Q')\n",
"H_queenstown = entropy(Nsurvived_queenstown, Ndied_queenstown, Nqueenstown)\n",
"print(\"Ndied_queenstown={}, Nsurvived_queenstown={}, Nqueenstown={}\"\n",
" .format(Ndied_queenstown,Nsurvived_queenstown,Nqueenstown))\n",
"print(\"H_queenstown={}\".format(H_queenstown))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ndied_southampton=427, Nsurvived_southampton=217, Nsouthampton=644\n",
"H_southampton=0.921876486346913\n"
]
}
],
"source": [
"Nsurvived_southampton, Ndied_southampton, Nsouthampton = perform_splitting(data['Embarked']==b'S')\n",
"H_southampton = entropy(Nsurvived_southampton, Ndied_southampton, Nsouthampton)\n",
"print(\"Ndied_southampton={}, Nsurvived_southampton={}, Nsouthampton={}\"\n",
" .format(Ndied_southampton,Nsurvived_southampton,Nsouthampton))\n",
"print(\"H_southampton={}\".format(H_southampton))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"IG_embarked=0.024047090707960517\n"
]
}
],
"source": [
"H_embarked = Ncherbourg/Ntot*H_cherbourg + Nqueenstown/Ntot*H_queenstown + Nsouthampton/Ntot*H_southampton\n",
"IG_embarked = H_parent - H_embarked\n",
"print(\"IG_embarked={}\".format(IG_embarked))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Off-the-shelf machine learning algorithm"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"data_Sex=np.zeros(len(data),dtype=np.int)\n",
"data_Sex[np.where(data['Sex']==b'male')]=1\n",
"data_Sex[np.where(data['Sex']==b'female')]=2\n",
"data_Embarked=3*np.ones(len(data),dtype=np.int)\n",
"data_Embarked[np.where(data['Embarked']==b'C')]=0\n",
"data_Embarked[np.where(data['Embarked']==b'Q')]=1\n",
"data_Embarked[np.where(data['Embarked']==b'S')]=2\n",
"data_Embarked[np.where((data_Embarked!=0)*(data_Embarked!=1)*(data_Embarked!=2))]=3"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"features = np.array((data['Pclass'],data_Sex,data_Embarked),dtype=np.int).T\n",
"label = data['Survived']"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,\n",
" oob_score=False, random_state=None, verbose=0,\n",
" warm_start=False)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model = RandomForestClassifier(n_estimators=10)\n",
"model.fit(features, label)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"survived\n"
]
}
],
"source": [
"# prediction for a woman in first class, embarked in Southampton\n",
"ans = model.predict([[1, 2, 0]])\n",
"survival = \"survived\" if ans==1 else \"died\"\n",
"print(survival)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"died\n"
]
}
],
"source": [
"# prediction for a man in third class, embarked in Cherbourg\n",
"ans = model.predict([[3, 1, 0]])\n",
"survival = \"survived\" if ans==1 else \"died\"\n",
"print(survival)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 1
}