{ "cells": [
 { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## Random forest" ] },
 { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "The forest uses 128 trees, based on [this publication](https://www.researchgate.net/publication/230766603_How_Many_Trees_in_a_Random_Forest)." ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [
  "import os\n",
  "\n",
  "import numpy as np\n",
  "import pandas as pd\n",
  "from sklearn import tree\n",
  "\n",
  "# Directory that holds the Titanic CSV files. Set the TITANIC_DATA_DIR environment\n",
  "# variable to run the notebook on another machine; the default is the original location.\n",
  "DATA_DIR = os.environ.get(\"TITANIC_DATA_DIR\", \"/Users/fede/development/data-science/titanic/titanic\")\n",
  "\n",
  "# Labelled passenger data; `df` is the frame every later cell works on.\n",
  "input_file = os.path.join(DATA_DIR, \"train.csv\")\n",
  "df = pd.read_csv(input_file, header = 0)" ] },
 { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] },
 { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# Mapping values" ] },
 { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [
  "# Encode the two categorical columns as integers; values missing from a mapping become NaN.\n",
  "# NOTE: this cell is not idempotent - running it a second time on the already-encoded\n",
  "# frame maps every value to NaN, so re-run the loading cell first.\n",
  "d = {'male': 1, 'female': 0}\n",
  "df['Sex'] = df['Sex'].map(d)\n",
  "# Port of embarkation (C=Cherbourg, Q=Queenstown, S=Southampton).\n",
  "# 'Embarked' is encoded here but is not among the columns kept for the model further down.\n",
  "c = {'C': 0,'Q':1,'S':2}\n",
  "df['Embarked'] = df['Embarked'].map(c)" ] },
 { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# Remove unnecessary columns" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [
  "# Drop the identifier / free-text columns in a single call.\n",
  "# `columns=` is used because passing the axis positionally (`df.drop('Name', 1)`)\n",
  "# raises a TypeError on pandas >= 2.0.\n",
  "df = df.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])" ] },
 { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PclassSexAgeSibSpParchFareSurvived
03122.0107.25000
11038.01071.28331
23026.0007.92501
31035.01053.10001
43135.0008.05000
531NaN008.45830
61154.00051.86250
7312.03121.07500
83027.00211.13331
92014.01030.07081
\n", "
" ], "text/plain": [ " Pclass Sex Age SibSp Parch Fare Survived\n", "0 3 1 22.0 1 0 7.2500 0\n", "1 1 0 38.0 1 0 71.2833 1\n", "2 3 0 26.0 0 0 7.9250 1\n", "3 1 0 35.0 1 0 53.1000 1\n", "4 3 1 35.0 0 0 8.0500 0\n", "5 3 1 NaN 0 0 8.4583 0\n", "6 1 1 54.0 0 0 51.8625 0\n", "7 3 1 2.0 3 1 21.0750 0\n", "8 3 0 27.0 0 2 11.1333 1\n", "9 2 0 14.0 1 0 30.0708 1" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = df[['Pclass', 'Sex', 'Age','SibSp','Parch','Fare', 'Survived']]\n", "\n", "df.head(10)" ] },
 { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# Build random forest trees" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [
  "from sklearn.ensemble import RandomForestClassifier\n",
  "from sklearn.model_selection import train_test_split\n",
  "\n",
  "SEED = 42  # fixed so that the split and the forest are reproducible across runs\n",
  "\n",
  "# Use a numeric -1 sentinel for missing values (e.g. Age) so every column keeps a numeric\n",
  "# dtype, instead of the string '-1' inside an object-typed frame.\n",
  "df1 = df.fillna(-1)\n",
  "\n",
  "# The first six columns are the predictors; the last one ('Survived') is the target.\n",
  "features = list(df.columns[:6])\n",
  "\n",
  "# Hold out a stratified quarter of the labelled rows so that the error rates computed\n",
  "# below are measured on passengers the forest was not trained on.\n",
  "df_train, df_test = train_test_split(df1, test_size=0.25, stratify=df1['Survived'], random_state=SEED)\n",
  "\n",
  "y = df_train['Survived']\n",
  "X = df_train[features]\n",
  "\n",
  "clf = RandomForestClassifier(n_estimators = 128, random_state = SEED)\n",
  "clf = clf.fit(X, y)" ] },
 { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [
  "# Compare with test cases\n",
  "\n",
  "The forest is scored on the held-out rows of the labelled data, separately for passengers who survived and for those who did not." ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [
  "# NOTE(review): this cell used to re-read train.csv (`input_file`) and never used its\n",
  "# test.csv path, so the model was scored on its own training rows. test.csv presumably\n",
  "# carries no 'Survived' label (confirm), so the held-out split of the labelled data is\n",
  "# scored instead.\n",
  "\n",
  "# Split the held-out rows by their true label and keep only the predictor columns.\n",
  "df_test_survived = df_test.loc[df_test['Survived'] == 1, features]\n",
  "df_test_victims = df_test.loc[df_test['Survived'] == 0, features]" ] },
 { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# Survivors" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [
  "# Predictions for passengers whose true label is 1 (survived).\n",
  "survivors = clf.predict(df_test_survived)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [
  "from collections import Counter\n",
  "\n",
  "total = Counter(survivors)\n",
  "# Percentage of true survivors that were predicted as victims (label 0). The denominator is\n",
  "# the number of true survivors, not the number of correct predictions.\n",
  "percetange_of_error_survivors = total[0] * 100 / len(survivors)\n",
  "percetange_of_error_survivors" ] },
 { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# Victims" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [
  "# Predictions for passengers whose true label is 0 (did not survive).\n",
  "victims = clf.predict(df_test_victims)\n",
  "\n",
  "total = Counter(victims)\n",
  "# Percentage of true victims that were predicted as survivors (label 1).\n",
  "percetange_of_error_victims = total[1] * 100 / len(victims)\n",
  "percetange_of_error_victims" ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [] } ],
 "metadata": { "kernelspec": { "display_name": 
"Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }