{ "cells": [ { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.20.3\n" ] } ], "source": [ "# Imports libraries\n", "import sklearn\n", "from sklearn.cross_validation import train_test_split\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.naive_bayes import GaussianNB\n", "import pandas as pd\n", "import os\n", "\n", "print pd.__version__" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
PassengerId
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", "
" ], "text/plain": [ " Survived Pclass \\\n", "PassengerId \n", "1 0 3 \n", "2 1 1 \n", "3 1 3 \n", "4 1 1 \n", "5 0 3 \n", "\n", " Name Sex Age \\\n", "PassengerId \n", "1 Braund, Mr. Owen Harris male 22.0 \n", "2 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 \n", "3 Heikkinen, Miss. Laina female 26.0 \n", "4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 \n", "5 Allen, Mr. William Henry male 35.0 \n", "\n", " SibSp Parch Ticket Fare Cabin Embarked \n", "PassengerId \n", "1 1 0 A/5 21171 7.2500 NaN S \n", "2 1 0 PC 17599 71.2833 C85 C \n", "3 0 0 STON/O2. 3101282 7.9250 NaN S \n", "4 1 0 113803 53.1000 C123 S \n", "5 0 0 373450 8.0500 NaN S " ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# reading table as dataframe\n", "DATA_DIR = '../data'\n", "\n", "df = pd.read_table(\n", " os.path.abspath(os.path.join(DATA_DIR, 'day8/titanic.csv')),\n", " sep=',', \n", " index_col='PassengerId'\n", " )\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(891, 11)" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Survived 0\n", "Pclass 0\n", "Name 0\n", "Sex 0\n", "Age 177\n", "SibSp 0\n", "Parch 0\n", "Ticket 0\n", "Fare 0\n", "Cabin 687\n", "Embarked 2\n", "dtype: int64" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# identifiying the missing values across all the colums\n", "df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# dropping name, cabin, Fare, Embarked ticket columns\n", "# Name does not give any information if a person will live or not\n", "# cabin, Ticket, Fare are correlated to eachother and to PClass; so removing\n", "df.drop(['Cabin', 'Ticket', 'Name', 'Fare', 'Embarked'], axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# fill age mean to NaN value\n", "df[df['Age'].isnull()] = df['Age'].mean()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Survived 0\n", "Pclass 0\n", "Sex 0\n", "Age 0\n", "SibSp 0\n", "Parch 0\n", "dtype: int64" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check for any other NaN value\n", "df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassSexAgeSibSpParch
PassengerId
10.03.0male22.01.00.0
21.01.0female38.01.00.0
31.03.0female26.00.00.0
41.01.0female35.01.00.0
50.03.0male35.00.00.0
\n", "
" ], "text/plain": [ " Survived Pclass Sex Age SibSp Parch\n", "PassengerId \n", "1 0.0 3.0 male 22.0 1.0 0.0\n", "2 1.0 1.0 female 38.0 1.0 0.0\n", "3 1.0 3.0 female 26.0 0.0 0.0\n", "4 1.0 1.0 female 35.0 1.0 0.0\n", "5 0.0 3.0 male 35.0 0.0 0.0" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(5)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# converting Sex to numbers\n", "replacements_sex = {'male': 0, 'female': 1}\n", "df['Sex'].replace(replacements_sex, inplace=True)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassSexAgeSibSpParch
PassengerId
10.0000003.0000000.00000022.0000001.0000000.000000
21.0000001.0000001.00000038.0000001.0000000.000000
31.0000003.0000001.00000026.0000000.0000000.000000
41.0000001.0000001.00000035.0000001.0000000.000000
50.0000003.0000000.00000035.0000000.0000000.000000
629.69911829.69911829.69911829.69911829.69911829.699118
\n", "
" ], "text/plain": [ " Survived Pclass Sex Age SibSp Parch\n", "PassengerId \n", "1 0.000000 3.000000 0.000000 22.000000 1.000000 0.000000\n", "2 1.000000 1.000000 1.000000 38.000000 1.000000 0.000000\n", "3 1.000000 3.000000 1.000000 26.000000 0.000000 0.000000\n", "4 1.000000 1.000000 1.000000 35.000000 1.000000 0.000000\n", "5 0.000000 3.000000 0.000000 35.000000 0.000000 0.000000\n", "6 29.699118 29.699118 29.699118 29.699118 29.699118 29.699118" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(6)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# removing duplicates\n", "# redundancy does not help our model to generalize better; introduces biasness\n", "df.drop_duplicates(keep=False, inplace=True)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.0 202\n", "1.0 197\n", "Name: Survived, dtype: int64" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check if there is class imbalance ?\n", "df['Survived'].value_counts()\n", "\n", "# this would work; otherwise you can try stratified split instead of random data split" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [], "source": [ "X = df.iloc[:, 1:].values\n", "Y = df.iloc[:, 0].values" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "((399, 5), (399,))" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X.shape, Y.shape" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# ideal practice is to use test as 20% - 30% of training data\n", "# defined by test_size in train_test_split()\n", "# random_state is required to avoid sequential biasness in the data distribution\n", "def data_split(X, Y):\n", " X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.3, random_state = 10)\n", " return X_train, X_test, Y_train, Y_test\n", "\n", "X_train, X_test, Y_train, Y_test = data_split(X, Y)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(279, 5) (120, 5)\n" ] } ], "source": [ "print X_train.shape, X_test.shape" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# this class takes care for scaling the features to the scale of 0-1\n", "# we are doing the scaling with this cap because we use sigmoid activation fxn in logistic which \n", "# also has the range from 0-1\n", "class Normalizer:\n", "\n", " def __init__(self):\n", " self.sc = StandardScaler()\n", " \n", " def scale(self, X, dtype):\n", " if dtype=='train':\n", " XX = self.sc.fit_transform(X)\n", " elif dtype=='test':\n", " XX = self.sc.transform(X)\n", " else:\n", " return None\n", " return XX" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [], "source": [ "norm = Normalizer()\n", "X_train = norm.scale(X_train, 'train')\n", "X_test = norm.scale(X_test, 'test')" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [], "source": [ "class NaiveBayes:\n", " \n", " def __init__(self):\n", " self.classifier = GaussianNB()\n", "\n", " def train(self, X_train, Y_train):\n", " model = self.classifier.fit(X_train, Y_train)\n", " return model\n", " \n", " def predict(self, model, X_test):\n", " return model.predict(X_test)\n", " \n", " def evaluate(self, Y_test, Y_pred, measure):\n", " if measure=='matrix':\n", " cm = sklearn.metrics.confusion_matrix(Y_test, Y_pred, labels=[0, 1])\n", " return cm\n", " elif measure=='accuracy':\n", " return sklearn.metrics.accuracy_score(Y_test, Y_pred)*100\n", " else: return None" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "71.66666666666667\n" ] } ], "source": [ "nb = NaiveBayes()\n", "model = nb.train(X_train, Y_train)\n", "predictions = nb.predict(model, X_test)\n", "print nb.evaluate(Y_test, predictions, 'accuracy')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }