{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Supervised Machine Learning basics: Titanic example" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Florent Leclercq,
\n", "Imperial Centre for Inference and Cosmology, Imperial College London,
\n", "florent.leclercq@polytechnique.org" ] },
{ "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "from sklearn.ensemble import RandomForestClassifier" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Load training data set" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
 "# Titanic training set, available at https://www.kaggle.com/c/titanic/data\n",
 "# (slightly modified version of the file, so that numpy can load it directly)\n",
 "column_names = ('PassengerId','Survived','Pclass','Name','Sex','Age',\n",
 "                'SibSp','Parch','Ticket','Fare','Cabin','Embarked')\n",
 "column_formats = ('i4','i4','i4','S20','S6','S20',\n",
 "                  'i4','i4','S20','f8','S20','S20')\n",
 "# structured dtype: one named, typed field per column of the file\n",
 "dtype = dict(names=column_names, formats=column_formats)\n",
 "data = np.loadtxt(\"data/titanic.csv\", delimiter=\";\", comments=\"#\", dtype=dtype)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Data dictionary\n", "\n", "| Variable | Definition | Key |\n", "|----------|--------------------------------------------|------------------------------------------------|\n", "| Survived | Survival | 0 = No, 1 = Yes |\n", "| Pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |\n", "| Sex | Sex | |\n", "| Age | Age in years | |\n", "| SibSp | # of siblings / spouses aboard the Titanic | |\n", "| Parch | # of parents / children aboard the Titanic | |\n", "| Ticket | Ticket number | |\n", "| Fare | Passenger fare | |\n", "| Cabin | Cabin number | |\n", "| Embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton |" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "(501, 0, 3, b'\"Calic, Mr. 
Petar\"', b'male', b'17', 0, 0, b'315086', 8.6625, b'', b'S')" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[500]" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(601, 1, 2, b'\"Jacobsohn, Mrs. Sid', b'female', b'24', 2, 1, b'243847', 27., b'', b'S')" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[600]" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
 "def perform_splitting(condition):\n",
 "    \"\"\"Count survivors and deaths among the passengers selected by a mask.\n",
 "\n",
 "    Reads the global structured array `data`.\n",
 "\n",
 "    condition: mask over the rows of `data` (the notebook passes boolean\n",
 "        arrays such as data['Pclass']==1).\n",
 "\n",
 "    Returns the tuple (Nsurvived_split, Ndied_split, Nsplit): the number of\n",
 "    selected passengers with Survived==1, with Survived==0, and the size of\n",
 "    the selection.\n",
 "    \"\"\"\n",
 "    selected_passengers = data[np.where(condition)]\n",
 "    Nsplit = selected_passengers.size\n",
 "    Nsurvived_split = np.sum(selected_passengers['Survived']==1)\n",
 "    Ndied_split = np.sum(selected_passengers['Survived']==0)\n",
 "    return Nsurvived_split, Ndied_split, Nsplit\n",
 "\n",
 "def entropy(Nsurvived, Ndied, Ntot):\n",
 "    \"\"\"Shannon entropy, in bits, of a group split into survived / died.\n",
 "\n",
 "    Uses the convention 0*log2(0) = 0, so a pure group (one of the counts is\n",
 "    zero) has entropy 0 instead of nan, and an empty group (Ntot == 0)\n",
 "    returns 0 instead of dividing by zero.\n",
 "\n",
 "    Raises AssertionError if Nsurvived + Ndied != Ntot.\n",
 "    \"\"\"\n",
 "    assert Nsurvived + Ndied == Ntot, \"Nsurvived + Ndied must equal Ntot\"\n",
 "    H = 0.0\n",
 "    for count in (Nsurvived, Ndied):\n",
 "        if count > 0:  # an empty outcome contributes nothing: p*log2(p) -> 0 as p -> 0\n",
 "            p = count/Ntot\n",
 "            H -= p*np.log2(p)\n",
 "    return H" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Parent Entropy" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ndied=549, Nsurvived=342, Ntot=891\n", "H_parent=0.9607079018756469\n" ] } ], "source": [ "Ntot = data['Survived'].size\n", "Nsurvived = np.sum(data['Survived']==1)\n", "Ndied = np.sum(data['Survived']==0)\n", "H_parent = entropy(Nsurvived, Ndied, Ntot)\n", "print(\"Ndied={}, Nsurvived={}, Ntot={}\".format(Ndied,Nsurvived,Ntot))\n", "print(\"H_parent={}\".format(H_parent))" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Information gain: ticket class" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ndied_first=80, Nsurvived_first=136, Nfirst=216\n", "H_first=0.9509560484549725\n" ] } ], "source": [ "Nsurvived_first, 
Ndied_first, Nfirst = perform_splitting(data['Pclass']==1)\n",
 "# entropy of the first-class subgroup\n",
 "H_first = entropy(Nsurvived_first, Ndied_first, Nfirst)\n",
 "print(f\"Ndied_first={Ndied_first}, Nsurvived_first={Nsurvived_first}, Nfirst={Nfirst}\")\n",
 "print(f\"H_first={H_first}\")" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ndied_second=97, Nsurvived_second=87, Nsecond=184\n", "H_second=0.9978683156711936\n" ] } ], "source": [
 "# same counts and entropy for the second-class subgroup\n",
 "Nsurvived_second, Ndied_second, Nsecond = perform_splitting(data['Pclass']==2)\n",
 "H_second = entropy(Nsurvived_second, Ndied_second, Nsecond)\n",
 "print(f\"Ndied_second={Ndied_second}, Nsurvived_second={Nsurvived_second}, Nsecond={Nsecond}\")\n",
 "print(f\"H_second={H_second}\")" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ndied_third=372, Nsurvived_third=119, Nthird=491\n", "H_third=0.7989470522661535\n" ] } ], "source": [
 "# same counts and entropy for the third-class subgroup\n",
 "Nsurvived_third, Ndied_third, Nthird = perform_splitting(data['Pclass']==3)\n",
 "H_third = entropy(Nsurvived_third, Ndied_third, Nthird)\n",
 "print(f\"Ndied_third={Ndied_third}, Nsurvived_third={Nsurvived_third}, Nthird={Nthird}\")\n",
 "print(f\"H_third={H_third}\")" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "IG_class=0.0838310452960116\n" ] } ], "source": [
 "# entropy after the split = size-weighted mean of the three class entropies\n",
 "H_class = Nfirst/Ntot*H_first + Nsecond/Ntot*H_second + Nthird/Ntot*H_third\n",
 "# information gain = entropy removed by splitting on ticket class\n",
 "IG_class = H_parent - H_class\n",
 "print(f\"IG_class={IG_class}\")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Information gain: sex" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ndied_male=468, Nsurvived_male=109, Nmale=577\n", "H_male=0.6991817891208407\n" ] } ], "source": 
[ "Nsurvived_male, Ndied_male, Nmale = perform_splitting(data['Sex']==b'male')\n",
 "H_male = entropy(Nsurvived_male, Ndied_male, Nmale)\n",
 "print(f\"Ndied_male={Ndied_male}, Nsurvived_male={Nsurvived_male}, Nmale={Nmale}\")\n",
 "print(f\"H_male={H_male}\")" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ndied_female=81, Nsurvived_female=233, Nfemale=314\n", "H_female=0.8236550739295191\n" ] } ], "source": [
 "Nsurvived_female, Ndied_female, Nfemale = perform_splitting(data['Sex']==b'female')\n",
 "H_female = entropy(Nsurvived_female, Ndied_female, Nfemale)\n",
 "print(f\"Ndied_female={Ndied_female}, Nsurvived_female={Nsurvived_female}, Nfemale={Nfemale}\")\n",
 "print(f\"H_female={H_female}\")" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "IG_sex=0.2176601066606142\n" ] } ], "source": [
 "# entropy after the split = size-weighted mean of the two subgroup entropies\n",
 "H_sex = Nmale/Ntot*H_male + Nfemale/Ntot*H_female\n",
 "IG_sex = H_parent - H_sex\n",
 "print(f\"IG_sex={IG_sex}\")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Information gain: port of embarkation" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ndied_cherbourg=75, Nsurvived_cherbourg=93, Ncherbourg=168\n", "H_cherbourg=0.9917033083725818\n" ] } ], "source": [
 "Nsurvived_cherbourg, Ndied_cherbourg, Ncherbourg = perform_splitting(data['Embarked']==b'C')\n",
 "H_cherbourg = entropy(Nsurvived_cherbourg, Ndied_cherbourg, Ncherbourg)\n",
 "print(f\"Ndied_cherbourg={Ndied_cherbourg}, Nsurvived_cherbourg={Nsurvived_cherbourg}, Ncherbourg={Ncherbourg}\")\n",
 "print(f\"H_cherbourg={H_cherbourg}\")" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ndied_queenstown=47, Nsurvived_queenstown=30, Nqueenstown=77\n", "H_queenstown=0.9645476589143234\n" ] } ], "source": [
 "Nsurvived_queenstown, Ndied_queenstown, Nqueenstown = perform_splitting(data['Embarked']==b'Q')\n",
 "H_queenstown = entropy(Nsurvived_queenstown, Ndied_queenstown, Nqueenstown)\n",
 "print(f\"Ndied_queenstown={Ndied_queenstown}, Nsurvived_queenstown={Nsurvived_queenstown}, Nqueenstown={Nqueenstown}\")\n",
 "print(f\"H_queenstown={H_queenstown}\")" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ndied_southampton=427, Nsurvived_southampton=217, Nsouthampton=644\n", "H_southampton=0.921876486346913\n" ] } ], "source": [
 "Nsurvived_southampton, Ndied_southampton, Nsouthampton = perform_splitting(data['Embarked']==b'S')\n",
 "H_southampton = entropy(Nsurvived_southampton, Ndied_southampton, Nsouthampton)\n",
 "print(f\"Ndied_southampton={Ndied_southampton}, Nsurvived_southampton={Nsurvived_southampton}, Nsouthampton={Nsouthampton}\")\n",
 "print(f\"H_southampton={H_southampton}\")" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "IG_embarked=0.024047090707960517\n" ] } ], "source": [
 "# NOTE: the three weights sum to 889/891, because two passengers have a port\n",
 "# other than C/Q/S. Per the counts printed in this notebook both of them\n",
 "# survived, i.e. they form a pure group with zero entropy, so leaving that\n",
 "# group out of the sum does not change H_embarked for this data set.\n",
 "H_embarked = Ncherbourg/Ntot*H_cherbourg + Nqueenstown/Ntot*H_queenstown + Nsouthampton/Ntot*H_southampton\n",
 "IG_embarked = H_parent - H_embarked\n",
 "print(f\"IG_embarked={IG_embarked}\")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Off-the-shelf machine learning algorithm" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [
 "# Integer codes for the categorical features.\n",
 "# (builtin int is used as dtype: the np.int alias was removed in NumPy 1.24)\n",
 "# Sex: 1 = male, 2 = female, 0 = anything else\n",
 "data_Sex = np.zeros(len(data), dtype=int)\n",
 "data_Sex[data['Sex']==b'male'] = 1\n",
 "data_Sex[data['Sex']==b'female'] = 2\n",
 "# Embarked: 0 = C (Cherbourg), 1 = Q (Queenstown), 2 = S (Southampton),\n",
 "# 3 = anything else (the initial value, left untouched for those rows)\n",
 "data_Embarked = np.full(len(data), 3, dtype=int)\n",
 "data_Embarked[data['Embarked']==b'C'] = 0\n",
 "data_Embarked[data['Embarked']==b'Q'] = 1\n",
 "data_Embarked[data['Embarked']==b'S'] = 2" ] },
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [
 "# one row per passenger, columns = (Pclass, Sex code, Embarked code)\n",
 "features = np.array((data['Pclass'], data_Sex, data_Embarked), dtype=int).T\n",
 "label = data['Survived']" ] },
{ "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", " min_impurity_decrease=0.0, min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,\n", " oob_score=False, random_state=0, verbose=0, warm_start=False)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# fixed seed so that Restart & Run All rebuilds the same forest\n",
 "model = RandomForestClassifier(n_estimators=10, random_state=0)\n",
 "model.fit(features, label)" ] },
{ "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "survived\n" ] } ], "source": [
 "# feature order: [Pclass, Sex (1=male, 2=female), Embarked (0=C, 1=Q, 2=S)]\n",
 "# prediction for a woman in first class, embarked in Southampton\n",
 "ans = model.predict([[1, 2, 2]])\n",
 "survival = \"survived\" if ans[0]==1 else \"died\"\n",
 "print(survival)" ] },
{ "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "died\n" ] } ], "source": [
 "# feature order: [Pclass, Sex (1=male, 2=female), Embarked (0=C, 1=Q, 2=S)]\n",
 "# prediction for a man in third class, embarked in Cherbourg\n",
 "ans = model.predict([[3, 1, 0]])\n",
 "survival = \"survived\" if ans[0]==1 else \"died\"\n",
 "print(survival)" ] } ],
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": 
"python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 1 }