{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## Random forest"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"I use 128 trees regarding [this publication](https://www.researchgate.net/publication/230766603_How_Many_Trees_in_a_Random_Forest)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn import tree\n",
"\n",
"input_file = \"/Users/fede/development/data-science/titanic/titanic/train.csv\"\n",
"df = pd.read_csv(input_file, header = 0)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PassengerId | \n",
" Survived | \n",
" Pclass | \n",
" Name | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Ticket | \n",
" Fare | \n",
" Cabin | \n",
" Embarked | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 0 | \n",
" 3 | \n",
" Braund, Mr. Owen Harris | \n",
" male | \n",
" 22.0 | \n",
" 1 | \n",
" 0 | \n",
" A/5 21171 | \n",
" 7.2500 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
" female | \n",
" 38.0 | \n",
" 1 | \n",
" 0 | \n",
" PC 17599 | \n",
" 71.2833 | \n",
" C85 | \n",
" C | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" 1 | \n",
" 3 | \n",
" Heikkinen, Miss. Laina | \n",
" female | \n",
" 26.0 | \n",
" 0 | \n",
" 0 | \n",
" STON/O2. 3101282 | \n",
" 7.9250 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
" | 3 | \n",
" 4 | \n",
" 1 | \n",
" 1 | \n",
" Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
" female | \n",
" 35.0 | \n",
" 1 | \n",
" 0 | \n",
" 113803 | \n",
" 53.1000 | \n",
" C123 | \n",
" S | \n",
"
\n",
" \n",
" | 4 | \n",
" 5 | \n",
" 0 | \n",
" 3 | \n",
" Allen, Mr. William Henry | \n",
" male | \n",
" 35.0 | \n",
" 0 | \n",
" 0 | \n",
" 373450 | \n",
" 8.0500 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PassengerId Survived Pclass \\\n",
"0 1 0 3 \n",
"1 2 1 1 \n",
"2 3 1 3 \n",
"3 4 1 1 \n",
"4 5 0 3 \n",
"\n",
" Name Sex Age SibSp \\\n",
"0 Braund, Mr. Owen Harris male 22.0 1 \n",
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
"2 Heikkinen, Miss. Laina female 26.0 0 \n",
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
"4 Allen, Mr. William Henry male 35.0 0 \n",
"\n",
" Parch Ticket Fare Cabin Embarked \n",
"0 0 A/5 21171 7.2500 NaN S \n",
"1 0 PC 17599 71.2833 C85 C \n",
"2 0 STON/O2. 3101282 7.9250 NaN S \n",
"3 0 113803 53.1000 C123 S \n",
"4 0 373450 8.0500 NaN S "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Mapping values"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"d = {'male': 1, 'female': 0}\n",
"df['Sex'] = df['Sex'].map(d)\n",
"#(C=Cherbourg, Q=Queenstown, S=Southampton)\n",
"c = {'C': 0,'Q':1,'S':2}\n",
"df['Embarked'] = df['Embarked'].map(c)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Remove unnecessary values"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"df = df.drop('Name', 1)\n",
"df = df.drop('Ticket', 1)\n",
"df = df.drop('Cabin', 1)\n",
"df = df.drop('PassengerId', 1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Pclass | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Fare | \n",
" Survived | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 3 | \n",
" 1 | \n",
" 22.0 | \n",
" 1 | \n",
" 0 | \n",
" 7.2500 | \n",
" 0 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 0 | \n",
" 38.0 | \n",
" 1 | \n",
" 0 | \n",
" 71.2833 | \n",
" 1 | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" 0 | \n",
" 26.0 | \n",
" 0 | \n",
" 0 | \n",
" 7.9250 | \n",
" 1 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1 | \n",
" 0 | \n",
" 35.0 | \n",
" 1 | \n",
" 0 | \n",
" 53.1000 | \n",
" 1 | \n",
"
\n",
" \n",
" | 4 | \n",
" 3 | \n",
" 1 | \n",
" 35.0 | \n",
" 0 | \n",
" 0 | \n",
" 8.0500 | \n",
" 0 | \n",
"
\n",
" \n",
" | 5 | \n",
" 3 | \n",
" 1 | \n",
" NaN | \n",
" 0 | \n",
" 0 | \n",
" 8.4583 | \n",
" 0 | \n",
"
\n",
" \n",
" | 6 | \n",
" 1 | \n",
" 1 | \n",
" 54.0 | \n",
" 0 | \n",
" 0 | \n",
" 51.8625 | \n",
" 0 | \n",
"
\n",
" \n",
" | 7 | \n",
" 3 | \n",
" 1 | \n",
" 2.0 | \n",
" 3 | \n",
" 1 | \n",
" 21.0750 | \n",
" 0 | \n",
"
\n",
" \n",
" | 8 | \n",
" 3 | \n",
" 0 | \n",
" 27.0 | \n",
" 0 | \n",
" 2 | \n",
" 11.1333 | \n",
" 1 | \n",
"
\n",
" \n",
" | 9 | \n",
" 2 | \n",
" 0 | \n",
" 14.0 | \n",
" 1 | \n",
" 0 | \n",
" 30.0708 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Pclass Sex Age SibSp Parch Fare Survived\n",
"0 3 1 22.0 1 0 7.2500 0\n",
"1 1 0 38.0 1 0 71.2833 1\n",
"2 3 0 26.0 0 0 7.9250 1\n",
"3 1 0 35.0 1 0 53.1000 1\n",
"4 3 1 35.0 0 0 8.0500 0\n",
"5 3 1 NaN 0 0 8.4583 0\n",
"6 1 1 54.0 0 0 51.8625 0\n",
"7 3 1 2.0 3 1 21.0750 0\n",
"8 3 0 27.0 0 2 11.1333 1\n",
"9 2 0 14.0 1 0 30.0708 1"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df[['Pclass', 'Sex', 'Age','SibSp','Parch','Fare', 'Survived']]\n",
"\n",
"df.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Build random forest trees"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"df1 = df.astype(object).replace(np.nan, '-1')\n",
"\n",
"features = list(df.columns[:6])\n",
"features\n",
"\n",
"y = df1[\"Survived\"]\n",
"X = df1[features]\n",
"\n",
"clf = RandomForestClassifier(n_estimators = 128)\n",
"clf = clf.fit(X, y)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Compare with test cases"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"#Read train file\n",
"\n",
"input_file_test = \"/Users/fede/development/data-science/titanic/titanic/test.csv\"\n",
"df_test = pd.read_csv(input_file, header = 0)\n",
"\n",
"# map values\n",
"df_test['Sex'] = df_test['Sex'].map(d)\n",
"df_test['Embarked'] = df_test['Embarked'].map(c)\n",
"\n",
"#remove unnecessary values\n",
"df_test = df_test.drop('Name', 1)\n",
"df_test = df_test.drop('Ticket', 1)\n",
"df_test = df_test.drop('Cabin', 1)\n",
"df_test = df_test.drop('PassengerId', 1)\n",
"\n",
"\n",
"df_test = df_test[['Pclass', 'Sex', 'Age','SibSp','Parch','Fare', 'Survived']]\n",
"\n",
"df_test_survived = df_test.loc[df_test['Survived'] == 1]\n",
"df_test_victims = df_test.loc[df_test['Survived'] == 0]\n",
"\n",
"\n",
"df_test_survived = df_test_survived[['Pclass', 'Sex', 'Age','SibSp','Parch','Fare']]\n",
"df_test_victims = df_test_victims[['Pclass', 'Sex', 'Age','SibSp','Parch','Fare']]"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Survivors\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"df_test_survived = df_test_survived.astype(object).replace(np.nan, '-1')\n",
"\n",
"survivors = clf.predict(df_test_survived)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"3.323262839879154"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from collections import Counter\n",
"\n",
"total = Counter(survivors)\n",
"percetange_of_error_survivors = total[0] * 100 / total[1]\n",
"percetange_of_error_survivors"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Victims"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"0.9191176470588235"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_test_victims = df_test_victims.astype(object).replace(np.nan, '-1')\n",
"\n",
"victims = clf.predict(df_test_victims)\n",
"\n",
"total = Counter(victims)\n",
"percetange_of_error_victims = total[1] * 100 / total[0]\n",
"percetange_of_error_victims"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}