{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.14.2\n",
      "0.20.3\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "import numpy as np\n",
    "\n",
    "print np.__version__\n",
    "print pd.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>10</th>\n",
       "      <th>11</th>\n",
       "      <th>12</th>\n",
       "      <th>13</th>\n",
       "      <th>14</th>\n",
       "      <th>15</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>b</td>\n",
       "      <td>30.83</td>\n",
       "      <td>0.000</td>\n",
       "      <td>u</td>\n",
       "      <td>g</td>\n",
       "      <td>w</td>\n",
       "      <td>v</td>\n",
       "      <td>1.25</td>\n",
       "      <td>t</td>\n",
       "      <td>t</td>\n",
       "      <td>1</td>\n",
       "      <td>f</td>\n",
       "      <td>g</td>\n",
       "      <td>202.0</td>\n",
       "      <td>0</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>a</td>\n",
       "      <td>58.67</td>\n",
       "      <td>4.460</td>\n",
       "      <td>u</td>\n",
       "      <td>g</td>\n",
       "      <td>q</td>\n",
       "      <td>h</td>\n",
       "      <td>3.04</td>\n",
       "      <td>t</td>\n",
       "      <td>t</td>\n",
       "      <td>6</td>\n",
       "      <td>f</td>\n",
       "      <td>g</td>\n",
       "      <td>43.0</td>\n",
       "      <td>560</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>a</td>\n",
       "      <td>24.50</td>\n",
       "      <td>0.500</td>\n",
       "      <td>u</td>\n",
       "      <td>g</td>\n",
       "      <td>q</td>\n",
       "      <td>h</td>\n",
       "      <td>1.50</td>\n",
       "      <td>t</td>\n",
       "      <td>f</td>\n",
       "      <td>0</td>\n",
       "      <td>f</td>\n",
       "      <td>g</td>\n",
       "      <td>280.0</td>\n",
       "      <td>824</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>b</td>\n",
       "      <td>27.83</td>\n",
       "      <td>1.540</td>\n",
       "      <td>u</td>\n",
       "      <td>g</td>\n",
       "      <td>w</td>\n",
       "      <td>v</td>\n",
       "      <td>3.75</td>\n",
       "      <td>t</td>\n",
       "      <td>t</td>\n",
       "      <td>5</td>\n",
       "      <td>t</td>\n",
       "      <td>g</td>\n",
       "      <td>100.0</td>\n",
       "      <td>3</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>b</td>\n",
       "      <td>20.17</td>\n",
       "      <td>5.625</td>\n",
       "      <td>u</td>\n",
       "      <td>g</td>\n",
       "      <td>w</td>\n",
       "      <td>v</td>\n",
       "      <td>1.71</td>\n",
       "      <td>t</td>\n",
       "      <td>f</td>\n",
       "      <td>0</td>\n",
       "      <td>f</td>\n",
       "      <td>s</td>\n",
       "      <td>120.0</td>\n",
       "      <td>0</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  0      1      2  3  4  5  6     7  8  9   10 11 12     13   14 15\n",
       "0  b  30.83  0.000  u  g  w  v  1.25  t  t   1  f  g  202.0    0  +\n",
       "1  a  58.67  4.460  u  g  q  h  3.04  t  t   6  f  g   43.0  560  +\n",
       "2  a  24.50  0.500  u  g  q  h  1.50  t  f   0  f  g  280.0  824  +\n",
       "3  b  27.83  1.540  u  g  w  v  3.75  t  t   5  t  g  100.0    3  +\n",
       "4  b  20.17  5.625  u  g  w  v  1.71  t  f   0  f  s  120.0    0  +"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# reading data into pandas dataframe\n",
    "# also we know that the missing values in this dataset is denoted by '?', so we are telling pandas beforehand \n",
    "# to treat '?' as NaN values\n",
    "DATA_DIR = '../data'\n",
    "\n",
    "df = pd.read_table(\n",
    "                    os.path.abspath(os.path.join(DATA_DIR, 'day11/credit.csv')),\n",
    "                    sep = ',',\n",
    "                    header=None,\n",
    "                    na_values = '?'\n",
    "                   )\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     12\n",
       "1     12\n",
       "2      0\n",
       "3      6\n",
       "4      6\n",
       "5      9\n",
       "6      9\n",
       "7      0\n",
       "8      0\n",
       "9      0\n",
       "10     0\n",
       "11     0\n",
       "12     0\n",
       "13    13\n",
       "14     0\n",
       "15     0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# list out number of missing value in each of the series of the df\n",
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      object\n",
       "1     float64\n",
       "2     float64\n",
       "3      object\n",
       "4      object\n",
       "5      object\n",
       "6      object\n",
       "7     float64\n",
       "8      object\n",
       "9      object\n",
       "10      int64\n",
       "11     object\n",
       "12     object\n",
       "13    float64\n",
       "14      int64\n",
       "15     object\n",
       "dtype: object"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# just have look at the data types to get an idea of what imputation to make for any particular series\n",
    "df.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Deleting Rows that have missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     0\n",
       "1     0\n",
       "2     0\n",
       "3     0\n",
       "4     0\n",
       "5     0\n",
       "6     0\n",
       "7     0\n",
       "8     0\n",
       "9     0\n",
       "10    0\n",
       "11    0\n",
       "12    0\n",
       "13    0\n",
       "14    0\n",
       "15    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_ = df.dropna(inplace=False)\n",
    "df_.isnull().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imputing with Mean value - (For Continuous Data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     12\n",
       "1      0\n",
       "2      0\n",
       "3      6\n",
       "4      6\n",
       "5      9\n",
       "6      9\n",
       "7      0\n",
       "8      0\n",
       "9      0\n",
       "10     0\n",
       "11     0\n",
       "12     0\n",
       "13    13\n",
       "14     0\n",
       "15     0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we will just show example of doing it with column number 1; this can be scaled to 'n' number of columns\n",
    "df[1].fillna(df[1].mean(), inplace=True)\n",
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imputing with Mode value - (For Categorical Data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      0\n",
       "1      0\n",
       "2      0\n",
       "3      6\n",
       "4      6\n",
       "5      9\n",
       "6      9\n",
       "7      0\n",
       "8      0\n",
       "9      0\n",
       "10     0\n",
       "11     0\n",
       "12     0\n",
       "13    13\n",
       "14     0\n",
       "15     0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[0].fillna(df[0].mode()[0], inplace=True)\n",
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imputing by adding one more category - (For Categorical Data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['u', 'y', nan, 'l'], dtype=object)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 3rd column can have these number of possibilities\n",
    "df[3].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      0\n",
       "1      0\n",
       "2      0\n",
       "3      0\n",
       "4      6\n",
       "5      9\n",
       "6      9\n",
       "7      0\n",
       "8      0\n",
       "9      0\n",
       "10     0\n",
       "11     0\n",
       "12     0\n",
       "13    13\n",
       "14     0\n",
       "15     0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# treating it as new category all together tells model to learn it's dependency with other features for \n",
    "# making prediction; adds to one more column; if one hot representation; else another integer value if label encoding\n",
    "df[3].fillna('UNK', inplace=True)\n",
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>10</th>\n",
       "      <th>11</th>\n",
       "      <th>12</th>\n",
       "      <th>13</th>\n",
       "      <th>14</th>\n",
       "      <th>15</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>206</th>\n",
       "      <td>a</td>\n",
       "      <td>71.58</td>\n",
       "      <td>0.0</td>\n",
       "      <td>UNK</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>f</td>\n",
       "      <td>f</td>\n",
       "      <td>0</td>\n",
       "      <td>f</td>\n",
       "      <td>p</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>270</th>\n",
       "      <td>b</td>\n",
       "      <td>37.58</td>\n",
       "      <td>0.0</td>\n",
       "      <td>UNK</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>f</td>\n",
       "      <td>f</td>\n",
       "      <td>0</td>\n",
       "      <td>f</td>\n",
       "      <td>p</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>330</th>\n",
       "      <td>b</td>\n",
       "      <td>20.42</td>\n",
       "      <td>0.0</td>\n",
       "      <td>UNK</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>f</td>\n",
       "      <td>f</td>\n",
       "      <td>0</td>\n",
       "      <td>f</td>\n",
       "      <td>p</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>456</th>\n",
       "      <td>b</td>\n",
       "      <td>34.58</td>\n",
       "      <td>0.0</td>\n",
       "      <td>UNK</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>f</td>\n",
       "      <td>f</td>\n",
       "      <td>0</td>\n",
       "      <td>f</td>\n",
       "      <td>p</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>-</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>592</th>\n",
       "      <td>b</td>\n",
       "      <td>23.17</td>\n",
       "      <td>0.0</td>\n",
       "      <td>UNK</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>f</td>\n",
       "      <td>f</td>\n",
       "      <td>0</td>\n",
       "      <td>f</td>\n",
       "      <td>p</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>622</th>\n",
       "      <td>a</td>\n",
       "      <td>25.58</td>\n",
       "      <td>0.0</td>\n",
       "      <td>UNK</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>f</td>\n",
       "      <td>f</td>\n",
       "      <td>0</td>\n",
       "      <td>f</td>\n",
       "      <td>p</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>+</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0      1    2    3    4    5    6    7  8  9   10 11 12  13  14 15\n",
       "206  a  71.58  0.0  UNK  NaN  NaN  NaN  0.0  f  f   0  f  p NaN   0  +\n",
       "270  b  37.58  0.0  UNK  NaN  NaN  NaN  0.0  f  f   0  f  p NaN   0  +\n",
       "330  b  20.42  0.0  UNK  NaN  NaN  NaN  0.0  f  f   0  f  p NaN   0  -\n",
       "456  b  34.58  0.0  UNK  NaN  NaN  NaN  0.0  f  f   0  f  p NaN   0  -\n",
       "592  b  23.17  0.0  UNK  NaN  NaN  NaN  0.0  f  f   0  f  p NaN   0  +\n",
       "622  a  25.58  0.0  UNK  NaN  NaN  NaN  0.0  f  f   0  f  p NaN   0  +"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df[3] == 'UNK']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imputing by back filling "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     0\n",
       "1     0\n",
       "2     0\n",
       "3     0\n",
       "4     6\n",
       "5     9\n",
       "6     9\n",
       "7     0\n",
       "8     0\n",
       "9     0\n",
       "10    0\n",
       "11    0\n",
       "12    0\n",
       "13    0\n",
       "14    0\n",
       "15    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we can specify a back-fill to propagate the next values backward\n",
    "df[13].fillna(method='bfill', inplace=True)\n",
    "df.isnull().sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imputing by forward filling "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     0\n",
       "1     0\n",
       "2     0\n",
       "3     0\n",
       "4     0\n",
       "5     9\n",
       "6     9\n",
       "7     0\n",
       "8     0\n",
       "9     0\n",
       "10    0\n",
       "11    0\n",
       "12    0\n",
       "13    0\n",
       "14    0\n",
       "15    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# We can specify a forward-fill to propagate the previous value forward\n",
    "df[4].fillna(method='ffill', inplace=True)\n",
    "df.isnull().sum()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}