{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.14.2\n", "0.20.3\n" ] } ], "source": [ "import pandas as pd\n", "import os\n", "import numpy as np\n", "\n", "print np.__version__\n", "print pd.__version__" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>0</th>\n", " <th>1</th>\n", " <th>2</th>\n", " <th>3</th>\n", " <th>4</th>\n", " <th>5</th>\n", " <th>6</th>\n", " <th>7</th>\n", " <th>8</th>\n", " <th>9</th>\n", " <th>10</th>\n", " <th>11</th>\n", " <th>12</th>\n", " <th>13</th>\n", " <th>14</th>\n", " <th>15</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>b</td>\n", " <td>30.83</td>\n", " <td>0.000</td>\n", " <td>u</td>\n", " <td>g</td>\n", " <td>w</td>\n", " <td>v</td>\n", " <td>1.25</td>\n", " <td>t</td>\n", " <td>t</td>\n", " <td>1</td>\n", " <td>f</td>\n", " <td>g</td>\n", " <td>202.0</td>\n", " <td>0</td>\n", " <td>+</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>a</td>\n", " <td>58.67</td>\n", " <td>4.460</td>\n", " <td>u</td>\n", " <td>g</td>\n", " <td>q</td>\n", " <td>h</td>\n", " <td>3.04</td>\n", " <td>t</td>\n", " <td>t</td>\n", " <td>6</td>\n", " <td>f</td>\n", " <td>g</td>\n", " <td>43.0</td>\n", " <td>560</td>\n", " <td>+</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>a</td>\n", " <td>24.50</td>\n", " <td>0.500</td>\n", " <td>u</td>\n", " <td>g</td>\n", " <td>q</td>\n", " <td>h</td>\n", " <td>1.50</td>\n", " <td>t</td>\n", " <td>f</td>\n", " <td>0</td>\n", " <td>f</td>\n", " <td>g</td>\n", " <td>280.0</td>\n", " <td>824</td>\n", " <td>+</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>b</td>\n", " <td>27.83</td>\n", " <td>1.540</td>\n", " <td>u</td>\n", " <td>g</td>\n", " <td>w</td>\n", " <td>v</td>\n", " <td>3.75</td>\n", " <td>t</td>\n", " <td>t</td>\n", " <td>5</td>\n", " <td>t</td>\n", " <td>g</td>\n", " <td>100.0</td>\n", " <td>3</td>\n", " <td>+</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>b</td>\n", " <td>20.17</td>\n", " <td>5.625</td>\n", " <td>u</td>\n", " <td>g</td>\n", " <td>w</td>\n", " <td>v</td>\n", " <td>1.71</td>\n", " <td>t</td>\n", " <td>f</td>\n", " <td>0</td>\n", " <td>f</td>\n", " <td>s</td>\n", " <td>120.0</td>\n", " <td>0</td>\n", " <td>+</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15\n", "0 b 30.83 0.000 u g w v 1.25 t t 1 f g 202.0 0 +\n", "1 a 58.67 4.460 u g q h 3.04 t t 6 f g 43.0 560 +\n", "2 a 24.50 0.500 u g q h 1.50 t f 0 f g 280.0 824 +\n", "3 b 27.83 1.540 u g w v 3.75 t t 5 t g 100.0 3 +\n", "4 b 20.17 5.625 u g w v 1.71 t f 0 f s 120.0 0 +" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# reading data into pandas dataframe\n", "# also we know that the missing values in this dataset is denoted by '?', so we are telling pandas beforehand \n", "# to treat '?' as NaN values\n", "DATA_DIR = '../data'\n", "\n", "df = pd.read_table(\n", " os.path.abspath(os.path.join(DATA_DIR, 'day11/credit.csv')),\n", " sep = ',',\n", " header=None,\n", " na_values = '?'\n", " )\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 12\n", "1 12\n", "2 0\n", "3 6\n", "4 6\n", "5 9\n", "6 9\n", "7 0\n", "8 0\n", "9 0\n", "10 0\n", "11 0\n", "12 0\n", "13 13\n", "14 0\n", "15 0\n", "dtype: int64" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# list out number of missing value in each of the series of the df\n", "df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 object\n", "1 float64\n", "2 float64\n", "3 object\n", "4 object\n", "5 object\n", "6 object\n", "7 float64\n", "8 object\n", "9 object\n", "10 int64\n", "11 object\n", "12 object\n", "13 float64\n", "14 int64\n", "15 object\n", "dtype: object" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# just have look at the data types to get an idea of what imputation to make for any particular series\n", "df.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Deleting Rows that have missing values" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 0\n", "1 0\n", "2 0\n", "3 0\n", "4 0\n", "5 0\n", "6 0\n", "7 0\n", "8 0\n", "9 0\n", "10 0\n", "11 0\n", "12 0\n", "13 0\n", "14 0\n", "15 0\n", "dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_ = df.dropna(inplace=False)\n", "df_.isnull().sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imputing with Mean value - (For Continuous Data)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 12\n", "1 0\n", "2 0\n", "3 6\n", "4 6\n", "5 9\n", "6 9\n", "7 0\n", "8 0\n", "9 0\n", "10 0\n", "11 0\n", "12 0\n", "13 13\n", "14 0\n", "15 0\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we will just show example of doing it with column number 1; this can be scaled to 'n' number of columns\n", "df[1].fillna(df[1].mean(), inplace=True)\n", "df.isnull().sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imputing with Mode value - (For Categorical Data)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 0\n", "1 0\n", "2 0\n", "3 6\n", "4 6\n", "5 9\n", "6 9\n", "7 0\n", "8 0\n", "9 0\n", "10 0\n", "11 0\n", "12 0\n", "13 13\n", "14 0\n", "15 0\n", "dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[0].fillna(df[0].mode()[0], inplace=True)\n", "df.isnull().sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imputing by adding one more category - (For Categorical Data)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array(['u', 'y', nan, 'l'], dtype=object)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 3rd column can have these number of possibilities\n", "df[3].unique()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 0\n", "1 0\n", "2 0\n", "3 0\n", "4 6\n", "5 9\n", "6 9\n", "7 0\n", "8 0\n", "9 0\n", "10 0\n", "11 0\n", "12 0\n", "13 13\n", "14 0\n", "15 0\n", "dtype: int64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# treating it as new category all together tells model to learn it's dependency with other features for \n", "# making prediction; adds to one more column; if one hot representation; else another integer value if label encoding\n", "df[3].fillna('UNK', inplace=True)\n", "df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>0</th>\n", " <th>1</th>\n", " <th>2</th>\n", " <th>3</th>\n", " <th>4</th>\n", " <th>5</th>\n", " <th>6</th>\n", " <th>7</th>\n", " <th>8</th>\n", " <th>9</th>\n", " <th>10</th>\n", " <th>11</th>\n", " <th>12</th>\n", " <th>13</th>\n", " <th>14</th>\n", " <th>15</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>206</th>\n", " <td>a</td>\n", " <td>71.58</td>\n", " <td>0.0</td>\n", " <td>UNK</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>0.0</td>\n", " <td>f</td>\n", " <td>f</td>\n", " <td>0</td>\n", " <td>f</td>\n", " <td>p</td>\n", " <td>NaN</td>\n", " <td>0</td>\n", " <td>+</td>\n", " </tr>\n", " <tr>\n", " <th>270</th>\n", " <td>b</td>\n", " <td>37.58</td>\n", " <td>0.0</td>\n", " <td>UNK</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>0.0</td>\n", " <td>f</td>\n", " <td>f</td>\n", " <td>0</td>\n", " <td>f</td>\n", " <td>p</td>\n", " <td>NaN</td>\n", " <td>0</td>\n", " <td>+</td>\n", " </tr>\n", " <tr>\n", " <th>330</th>\n", " <td>b</td>\n", " <td>20.42</td>\n", " <td>0.0</td>\n", " <td>UNK</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>0.0</td>\n", " <td>f</td>\n", " <td>f</td>\n", " <td>0</td>\n", " <td>f</td>\n", " <td>p</td>\n", " <td>NaN</td>\n", " <td>0</td>\n", " <td>-</td>\n", " </tr>\n", " <tr>\n", " <th>456</th>\n", " <td>b</td>\n", " <td>34.58</td>\n", " <td>0.0</td>\n", " <td>UNK</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>0.0</td>\n", " <td>f</td>\n", " <td>f</td>\n", " <td>0</td>\n", " <td>f</td>\n", " <td>p</td>\n", " <td>NaN</td>\n", " <td>0</td>\n", " <td>-</td>\n", " </tr>\n", " <tr>\n", " <th>592</th>\n", " <td>b</td>\n", " <td>23.17</td>\n", " <td>0.0</td>\n", " <td>UNK</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>0.0</td>\n", " <td>f</td>\n", " <td>f</td>\n", " <td>0</td>\n", " <td>f</td>\n", " <td>p</td>\n", " <td>NaN</td>\n", " <td>0</td>\n", " <td>+</td>\n", " </tr>\n", " <tr>\n", " <th>622</th>\n", " <td>a</td>\n", " <td>25.58</td>\n", " <td>0.0</td>\n", " <td>UNK</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>0.0</td>\n", " <td>f</td>\n", " <td>f</td>\n", " <td>0</td>\n", " <td>f</td>\n", " <td>p</td>\n", " <td>NaN</td>\n", " <td>0</td>\n", " <td>+</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15\n", "206 a 71.58 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 +\n", "270 b 37.58 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 +\n", "330 b 20.42 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 -\n", "456 b 34.58 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 -\n", "592 b 23.17 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 +\n", "622 a 25.58 0.0 UNK NaN NaN NaN 0.0 f f 0 f p NaN 0 +" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df[3] == 'UNK']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imputing by back filling " ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 0\n", "1 0\n", "2 0\n", "3 0\n", "4 6\n", "5 9\n", "6 9\n", "7 0\n", "8 0\n", "9 0\n", "10 0\n", "11 0\n", "12 0\n", "13 0\n", "14 0\n", "15 0\n", "dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we can specify a back-fill to propagate the next values backward\n", "df[13].fillna(method='bfill', inplace=True)\n", "df.isnull().sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imputing by forward filling " ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 0\n", "1 0\n", "2 0\n", "3 0\n", "4 0\n", "5 9\n", "6 9\n", "7 0\n", "8 0\n", "9 0\n", "10 0\n", "11 0\n", "12 0\n", "13 0\n", "14 0\n", "15 0\n", "dtype: int64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# We can specify a forward-fill to propagate the previous value forward\n", "df[4].fillna(method='ffill', inplace=True)\n", "df.isnull().sum()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }