{
 "metadata": {
  "name": "",
  "signature": "sha256:85afef5cf90a62abad47b2010737fca1c44e468a30d78f409c1270a5d8ca0ac6"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Example of cleaning up data in Pandas\n",
      "This example is actually the exercises provided by the following link:\n",
      "<a href=\"http://nbviewer.ipython.org/urls/raw.github.com/nborwankar/LearnDataScience/master/notebooks/A2.%20Linear%20Regression%20-%20Data%20Exploration%20-%20Lending%20Club.ipynb\" target=\"_blank\">A2. Linear Regression - Data Exploration - Lending Club</a>, which is part of the site for <a href=\"https://github.com/nborwankar/LearnDataScience\" target=\"_blank\">Learn Data Science</a>.\n"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "#### First import ipython settings"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import pandas as pd\n",
      "import matplotlib.pyplot as plt\n",
      "%matplotlib inline"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 43
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "#### Import the data to be cleaned and inspect them"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "df = pd.read_csv('https://cdn.rawgit.com/benedict-chan/pytest/master/datasets/loansData.csv')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 44
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "df.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>Amount.Requested</th>\n",
        "      <th>Amount.Funded.By.Investors</th>\n",
        "      <th>Interest.Rate</th>\n",
        "      <th>Loan.Length</th>\n",
        "      <th>Loan.Purpose</th>\n",
        "      <th>Debt.To.Income.Ratio</th>\n",
        "      <th>State</th>\n",
        "      <th>Home.Ownership</th>\n",
        "      <th>Monthly.Income</th>\n",
        "      <th>FICO.Range</th>\n",
        "      <th>Open.CREDIT.Lines</th>\n",
        "      <th>Revolving.CREDIT.Balance</th>\n",
        "      <th>Inquiries.in.the.Last.6.Months</th>\n",
        "      <th>Employment.Length</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>81174</th>\n",
        "      <td> 20000</td>\n",
        "      <td> 20000</td>\n",
        "      <td>  8.90%</td>\n",
        "      <td> 36 months</td>\n",
        "      <td> debt_consolidation</td>\n",
        "      <td> 14.90%</td>\n",
        "      <td> SC</td>\n",
        "      <td> MORTGAGE</td>\n",
        "      <td>  6541.67</td>\n",
        "      <td> 735-739</td>\n",
        "      <td> 14</td>\n",
        "      <td> 14272</td>\n",
        "      <td> 2</td>\n",
        "      <td> &lt; 1 year</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>99592</th>\n",
        "      <td> 19200</td>\n",
        "      <td> 19200</td>\n",
        "      <td> 12.12%</td>\n",
        "      <td> 36 months</td>\n",
        "      <td> debt_consolidation</td>\n",
        "      <td> 28.36%</td>\n",
        "      <td> TX</td>\n",
        "      <td> MORTGAGE</td>\n",
        "      <td>  4583.33</td>\n",
        "      <td> 715-719</td>\n",
        "      <td> 12</td>\n",
        "      <td> 11140</td>\n",
        "      <td> 1</td>\n",
        "      <td>  2 years</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>80059</th>\n",
        "      <td> 35000</td>\n",
        "      <td> 35000</td>\n",
        "      <td> 21.98%</td>\n",
        "      <td> 60 months</td>\n",
        "      <td> debt_consolidation</td>\n",
        "      <td> 23.81%</td>\n",
        "      <td> CA</td>\n",
        "      <td> MORTGAGE</td>\n",
        "      <td> 11500.00</td>\n",
        "      <td> 690-694</td>\n",
        "      <td> 14</td>\n",
        "      <td> 21977</td>\n",
        "      <td> 1</td>\n",
        "      <td>  2 years</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>15825</th>\n",
        "      <td> 10000</td>\n",
        "      <td>  9975</td>\n",
        "      <td>  9.99%</td>\n",
        "      <td> 36 months</td>\n",
        "      <td> debt_consolidation</td>\n",
        "      <td> 14.30%</td>\n",
        "      <td> KS</td>\n",
        "      <td> MORTGAGE</td>\n",
        "      <td>  3833.33</td>\n",
        "      <td> 695-699</td>\n",
        "      <td> 10</td>\n",
        "      <td>  9346</td>\n",
        "      <td> 0</td>\n",
        "      <td>  5 years</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>33182</th>\n",
        "      <td> 12000</td>\n",
        "      <td> 12000</td>\n",
        "      <td> 11.71%</td>\n",
        "      <td> 36 months</td>\n",
        "      <td>        credit_card</td>\n",
        "      <td> 18.78%</td>\n",
        "      <td> NJ</td>\n",
        "      <td>     RENT</td>\n",
        "      <td>  3195.00</td>\n",
        "      <td> 695-699</td>\n",
        "      <td> 11</td>\n",
        "      <td> 14469</td>\n",
        "      <td> 0</td>\n",
        "      <td>  9 years</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 45,
       "text": [
        "       Amount.Requested  Amount.Funded.By.Investors Interest.Rate Loan.Length  \\\n",
        "81174             20000                       20000         8.90%   36 months   \n",
        "99592             19200                       19200        12.12%   36 months   \n",
        "80059             35000                       35000        21.98%   60 months   \n",
        "15825             10000                        9975         9.99%   36 months   \n",
        "33182             12000                       12000        11.71%   36 months   \n",
        "\n",
        "             Loan.Purpose Debt.To.Income.Ratio State Home.Ownership  \\\n",
        "81174  debt_consolidation               14.90%    SC       MORTGAGE   \n",
        "99592  debt_consolidation               28.36%    TX       MORTGAGE   \n",
        "80059  debt_consolidation               23.81%    CA       MORTGAGE   \n",
        "15825  debt_consolidation               14.30%    KS       MORTGAGE   \n",
        "33182         credit_card               18.78%    NJ           RENT   \n",
        "\n",
        "       Monthly.Income FICO.Range  Open.CREDIT.Lines  Revolving.CREDIT.Balance  \\\n",
        "81174         6541.67    735-739                 14                     14272   \n",
        "99592         4583.33    715-719                 12                     11140   \n",
        "80059        11500.00    690-694                 14                     21977   \n",
        "15825         3833.33    695-699                 10                      9346   \n",
        "33182         3195.00    695-699                 11                     14469   \n",
        "\n",
        "       Inquiries.in.the.Last.6.Months Employment.Length  \n",
        "81174                               2          < 1 year  \n",
        "99592                               1           2 years  \n",
        "80059                               1           2 years  \n",
        "15825                               0           5 years  \n",
        "33182                               0           9 years  "
       ]
      }
     ],
     "prompt_number": 45
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "df.dtypes"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 46,
       "text": [
        "Amount.Requested                    int64\n",
        "Amount.Funded.By.Investors        float64\n",
        "Interest.Rate                      object\n",
        "Loan.Length                        object\n",
        "Loan.Purpose                       object\n",
        "Debt.To.Income.Ratio               object\n",
        "State                              object\n",
        "Home.Ownership                     object\n",
        "Monthly.Income                    float64\n",
        "FICO.Range                         object\n",
        "Open.CREDIT.Lines                 float64\n",
        "Revolving.CREDIT.Balance          float64\n",
        "Inquiries.in.the.Last.6.Months    float64\n",
        "Employment.Length                  object\n",
        "dtype: object"
       ]
      }
     ],
     "prompt_number": 46
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "**We will find out that several columns are not regonized as numeric.**"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Fix the Interest.Rate and Debt.To.Income.Ratio\n",
      "First check whether any of the rate doesn't contains %"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "len(df[ ( df['Interest.Rate'].str.contains('%') == False) ] ) , len(df[ ( df['Debt.To.Income.Ratio'].str.contains('%') == False) ] )"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 47,
       "text": [
        "(0, 0)"
       ]
      }
     ],
     "prompt_number": 47
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "remove_percent_converter = lambda x: float(x.replace('%', ''))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 48
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Fix Loan.Length\n",
      "First check whether any of the loan doesn't contains \"months\"."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "len(df[df['Loan.Length'].str.contains('months') == False])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 49,
       "text": [
        "0"
       ]
      }
     ],
     "prompt_number": 49
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "remove_month_converter = lambda x: float(x.replace('months', ''))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 50
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Apply the converters"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "clean_df = pd.read_csv('https://cdn.rawgit.com/benedict-chan/pytest/master/datasets/loansData.csv'\n",
      "\t, converters={'Interest.Rate': remove_percent_converter\n",
      "\t,'Debt.To.Income.Ratio': remove_percent_converter\n",
      "\t, 'Loan.Length': remove_month_converter})"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 51
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Fix FICO.Score\n",
      "For FICO, we are trying to get the lower part of the Range as our preferred value.\n",
      "We will do it by:\n",
      "1. Creating a function to get the lower part of the Range\n",
      "2. Create a new column to store this value"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "fico_function = lambda x: float(x.split('-')[0])\n",
      "clean_df['FICO.Score'] = clean_df['FICO.Range'].map(fico_function)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 52
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "### Finalize the result"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "#### Rename column"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "clean_df = clean_df.rename(columns={'Amount.Requested':'Loan.Amount'})"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 53
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "#### Create a new index column"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "clean_df['NewIndex'] = pd.Series(range(1, clean_df['Amount.Funded.By.Investors'].count()+1), index=clean_df.index)\n",
      "clean_df = clean_df.set_index('NewIndex')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 54
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "#### Filter only the columns we need"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "final_columns = [\"Interest.Rate\",\"FICO.Score\",\"Loan.Length\",\"Monthly.Income\",\"Loan.Amount\"]\n",
      "final_df = clean_df[final_columns]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 55
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "#### Remove extremes (strange monthly income)"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "final_df = final_df[final_df['Monthly.Income'] < 100000]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 56
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Comparing to the expected results\n",
      "Getting the expected results and format it"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "checking_df = pd.read_csv('https://cdn.rawgit.com/benedict-chan/pytest/master/datasets/loanf.csv')\n",
      "checking_df = checking_df.sort_index()\n",
      "checking_df.index.name = 'NewIndex'"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 57
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Some magic\n",
      "To see the difference of our cleaned data to the expected one."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "ne_stacked = pd.concat([final_df, checking_df])\n",
      "ne_stacked = ne_stacked.reset_index(drop=True)\n",
      "df_gpby = ne_stacked.groupby(list(ne_stacked.columns))\n",
      "\n",
      "idx = [x[0] for x in df_gpby.groups.values() if len(x) == 1]\n",
      "\n",
      "ne_stacked.reindex(idx)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>Interest.Rate</th>\n",
        "      <th>FICO.Score</th>\n",
        "      <th>Loan.Length</th>\n",
        "      <th>Monthly.Income</th>\n",
        "      <th>Loan.Amount</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>4349</th>\n",
        "      <td> 7.62</td>\n",
        "      <td> 805</td>\n",
        "      <td> 36</td>\n",
        "      <td> 102750</td>\n",
        "      <td> 33000</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2864</th>\n",
        "      <td> 7.43</td>\n",
        "      <td> 800</td>\n",
        "      <td> 36</td>\n",
        "      <td>    NaN</td>\n",
        "      <td>  5000</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 62,
       "text": [
        "      Interest.Rate  FICO.Score  Loan.Length  Monthly.Income  Loan.Amount\n",
        "4349           7.62         805           36          102750        33000\n",
        "2864           7.43         800           36             NaN         5000"
       ]
      }
     ],
     "prompt_number": 62
    }
   ],
   "metadata": {}
  }
 ]
}