{ "cells": [ { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.20.3\n" ] } ], "source": [ "# Imports libraries\n", "import sklearn\n", "from sklearn.cross_validation import train_test_split\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.naive_bayes import GaussianNB\n", "import pandas as pd\n", "import os\n", "\n", "print pd.__version__" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Survived</th>\n", " <th>Pclass</th>\n", " <th>Name</th>\n", " <th>Sex</th>\n", " <th>Age</th>\n", " <th>SibSp</th>\n", " <th>Parch</th>\n", " <th>Ticket</th>\n", " <th>Fare</th>\n", " <th>Cabin</th>\n", " <th>Embarked</th>\n", " </tr>\n", " <tr>\n", " <th>PassengerId</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>0</td>\n", " <td>3</td>\n", " <td>Braund, Mr. Owen Harris</td>\n", " <td>male</td>\n", " <td>22.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>A/5 21171</td>\n", " <td>7.2500</td>\n", " <td>NaN</td>\n", " <td>S</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n", " <td>female</td>\n", " <td>38.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>PC 17599</td>\n", " <td>71.2833</td>\n", " <td>C85</td>\n", " <td>C</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>Heikkinen, Miss. Laina</td>\n", " <td>female</td>\n", " <td>26.0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>STON/O2. 3101282</td>\n", " <td>7.9250</td>\n", " <td>NaN</td>\n", " <td>S</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n", " <td>female</td>\n", " <td>35.0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>113803</td>\n", " <td>53.1000</td>\n", " <td>C123</td>\n", " <td>S</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>0</td>\n", " <td>3</td>\n", " <td>Allen, Mr. William Henry</td>\n", " <td>male</td>\n", " <td>35.0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>373450</td>\n", " <td>8.0500</td>\n", " <td>NaN</td>\n", " <td>S</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Survived Pclass \\\n", "PassengerId \n", "1 0 3 \n", "2 1 1 \n", "3 1 3 \n", "4 1 1 \n", "5 0 3 \n", "\n", " Name Sex Age \\\n", "PassengerId \n", "1 Braund, Mr. Owen Harris male 22.0 \n", "2 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 \n", "3 Heikkinen, Miss. Laina female 26.0 \n", "4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 \n", "5 Allen, Mr. William Henry male 35.0 \n", "\n", " SibSp Parch Ticket Fare Cabin Embarked \n", "PassengerId \n", "1 1 0 A/5 21171 7.2500 NaN S \n", "2 1 0 PC 17599 71.2833 C85 C \n", "3 0 0 STON/O2. 3101282 7.9250 NaN S \n", "4 1 0 113803 53.1000 C123 S \n", "5 0 0 373450 8.0500 NaN S " ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# reading table as dataframe\n", "DATA_DIR = '../data'\n", "\n", "df = pd.read_table(\n", " os.path.abspath(os.path.join(DATA_DIR, 'day8/titanic.csv')),\n", " sep=',', \n", " index_col='PassengerId'\n", " )\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(891, 11)" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Survived 0\n", "Pclass 0\n", "Name 0\n", "Sex 0\n", "Age 177\n", "SibSp 0\n", "Parch 0\n", "Ticket 0\n", "Fare 0\n", "Cabin 687\n", "Embarked 2\n", "dtype: int64" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# identifiying the missing values across all the colums\n", "df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# dropping name, cabin, Fare, Embarked ticket columns\n", "# Name does not give any information if a person will live or not\n", "# cabin, Ticket, Fare are correlated to eachother and to PClass; so removing\n", "df.drop(['Cabin', 'Ticket', 'Name', 'Fare', 'Embarked'], axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# fill age mean to NaN value\n", "df[df['Age'].isnull()] = df['Age'].mean()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Survived 0\n", "Pclass 0\n", "Sex 0\n", "Age 0\n", "SibSp 0\n", "Parch 0\n", "dtype: int64" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check for any other NaN value\n", "df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Survived</th>\n", " <th>Pclass</th>\n", " <th>Sex</th>\n", " <th>Age</th>\n", " <th>SibSp</th>\n", " <th>Parch</th>\n", " </tr>\n", " <tr>\n", " <th>PassengerId</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>0.0</td>\n", " <td>3.0</td>\n", " <td>male</td>\n", " <td>22.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>female</td>\n", " <td>38.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1.0</td>\n", " <td>3.0</td>\n", " <td>female</td>\n", " <td>26.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>female</td>\n", " <td>35.0</td>\n", " <td>1.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>0.0</td>\n", " <td>3.0</td>\n", " <td>male</td>\n", " <td>35.0</td>\n", " <td>0.0</td>\n", " <td>0.0</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Survived Pclass Sex Age SibSp Parch\n", "PassengerId \n", "1 0.0 3.0 male 22.0 1.0 0.0\n", "2 1.0 1.0 female 38.0 1.0 0.0\n", "3 1.0 3.0 female 26.0 0.0 0.0\n", "4 1.0 1.0 female 35.0 1.0 0.0\n", "5 0.0 3.0 male 35.0 0.0 0.0" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(5)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# converting Sex to numbers\n", "replacements_sex = {'male': 0, 'female': 1}\n", "df['Sex'].replace(replacements_sex, inplace=True)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Survived</th>\n", " <th>Pclass</th>\n", " <th>Sex</th>\n", " <th>Age</th>\n", " <th>SibSp</th>\n", " <th>Parch</th>\n", " </tr>\n", " <tr>\n", " <th>PassengerId</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>0.000000</td>\n", " <td>3.000000</td>\n", " <td>0.000000</td>\n", " <td>22.000000</td>\n", " <td>1.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1.000000</td>\n", " <td>1.000000</td>\n", " <td>1.000000</td>\n", " <td>38.000000</td>\n", " <td>1.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1.000000</td>\n", " <td>3.000000</td>\n", " <td>1.000000</td>\n", " <td>26.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>1.000000</td>\n", " <td>1.000000</td>\n", " <td>1.000000</td>\n", " <td>35.000000</td>\n", " <td>1.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>0.000000</td>\n", " <td>3.000000</td>\n", " <td>0.000000</td>\n", " <td>35.000000</td>\n", " <td>0.000000</td>\n", " <td>0.000000</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>29.699118</td>\n", " <td>29.699118</td>\n", " <td>29.699118</td>\n", " <td>29.699118</td>\n", " <td>29.699118</td>\n", " <td>29.699118</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Survived Pclass Sex Age SibSp Parch\n", "PassengerId \n", "1 0.000000 3.000000 0.000000 22.000000 1.000000 0.000000\n", "2 1.000000 1.000000 1.000000 38.000000 1.000000 0.000000\n", "3 1.000000 3.000000 1.000000 26.000000 0.000000 0.000000\n", "4 1.000000 1.000000 1.000000 35.000000 1.000000 0.000000\n", "5 0.000000 3.000000 0.000000 35.000000 0.000000 0.000000\n", "6 29.699118 29.699118 29.699118 29.699118 29.699118 29.699118" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(6)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# removing duplicates\n", "# redundancy does not help our model to generalize better; introduces biasness\n", "df.drop_duplicates(keep=False, inplace=True)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.0 202\n", "1.0 197\n", "Name: Survived, dtype: int64" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check if there is class imbalance ?\n", "df['Survived'].value_counts()\n", "\n", "# this would work; otherwise you can try stratified split instead of random data split" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [], "source": [ "X = df.iloc[:, 1:].values\n", "Y = df.iloc[:, 0].values" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "((399, 5), (399,))" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.shape, Y.shape" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# ideal practice is to use test as 20% - 30% of training data\n", "# defined by test_size in train_test_split()\n", "# random_state is required to avoid sequential biasness in the data distribution\n", "def data_split(X, Y):\n", " X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.3, random_state = 10)\n", " return X_train, X_test, Y_train, Y_test\n", "\n", "X_train, X_test, Y_train, Y_test = data_split(X, Y)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(279, 5) (120, 5)\n" ] } ], "source": [ "print X_train.shape, X_test.shape" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# this class takes care for scaling the features to the scale of 0-1\n", "# we are doing the scaling with this cap because we use sigmoid activation fxn in logistic which \n", "# also has the range from 0-1\n", "class Normalizer:\n", "\n", " def __init__(self):\n", " self.sc = StandardScaler()\n", " \n", " def scale(self, X, dtype):\n", " if dtype=='train':\n", " XX = self.sc.fit_transform(X)\n", " elif dtype=='test':\n", " XX = self.sc.transform(X)\n", " else:\n", " return None\n", " return XX" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [], "source": [ "norm = Normalizer()\n", "X_train = norm.scale(X_train, 'train')\n", "X_test = norm.scale(X_test, 'test')" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [], "source": [ "class NaiveBayes:\n", " \n", " def __init__(self):\n", " self.classifier = GaussianNB()\n", "\n", " def train(self, X_train, Y_train):\n", " model = self.classifier.fit(X_train, Y_train)\n", " return model\n", " \n", " def predict(self, model, X_test):\n", " return model.predict(X_test)\n", " \n", " def evaluate(self, Y_test, Y_pred, measure):\n", " if measure=='matrix':\n", " cm = sklearn.metrics.confusion_matrix(Y_test, Y_pred, labels=[0, 1])\n", " return cm\n", " elif measure=='accuracy':\n", " return sklearn.metrics.accuracy_score(Y_test, Y_pred)*100\n", " else: return None" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "71.66666666666667\n" ] } ], "source": [ "nb = NaiveBayes()\n", "model = nb.train(X_train, Y_train)\n", "predictions = nb.predict(model, X_test)\n", "print nb.evaluate(Y_test, predictions, 'accuracy')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }