{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "import lightgbm as lgb\n",
    "import scipy.stats\n",
    "\n",
    "datapath='Data/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import data\n",
    "train_df=pd.read_csv(datapath+'train_u6lujuX_CVtuZ9i.csv')\n",
    "test_df=pd.read_csv(datapath+'test_Y3wMUE5_7gLdaTN.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Loan_ID</th>\n",
       "      <th>Gender</th>\n",
       "      <th>Married</th>\n",
       "      <th>Dependents</th>\n",
       "      <th>Education</th>\n",
       "      <th>Self_Employed</th>\n",
       "      <th>ApplicantIncome</th>\n",
       "      <th>CoapplicantIncome</th>\n",
       "      <th>LoanAmount</th>\n",
       "      <th>Loan_Amount_Term</th>\n",
       "      <th>Credit_History</th>\n",
       "      <th>Property_Area</th>\n",
       "      <th>Loan_Status</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>LP001002</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>5849</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "      <td>Y</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>LP001003</td>\n",
       "      <td>Male</td>\n",
       "      <td>Yes</td>\n",
       "      <td>1</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>4583</td>\n",
       "      <td>1508.0</td>\n",
       "      <td>128.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Rural</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>LP001005</td>\n",
       "      <td>Male</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>Yes</td>\n",
       "      <td>3000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>66.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "      <td>Y</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>LP001006</td>\n",
       "      <td>Male</td>\n",
       "      <td>Yes</td>\n",
       "      <td>0</td>\n",
       "      <td>Not Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>2583</td>\n",
       "      <td>2358.0</td>\n",
       "      <td>120.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "      <td>Y</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>LP001008</td>\n",
       "      <td>Male</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>Graduate</td>\n",
       "      <td>No</td>\n",
       "      <td>6000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>141.0</td>\n",
       "      <td>360.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Urban</td>\n",
       "      <td>Y</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Loan_ID Gender Married Dependents     Education Self_Employed  \\\n",
       "0  LP001002   Male      No          0      Graduate            No   \n",
       "1  LP001003   Male     Yes          1      Graduate            No   \n",
       "2  LP001005   Male     Yes          0      Graduate           Yes   \n",
       "3  LP001006   Male     Yes          0  Not Graduate            No   \n",
       "4  LP001008   Male      No          0      Graduate            No   \n",
       "\n",
       "   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \\\n",
       "0             5849                0.0         NaN             360.0   \n",
       "1             4583             1508.0       128.0             360.0   \n",
       "2             3000                0.0        66.0             360.0   \n",
       "3             2583             2358.0       120.0             360.0   \n",
       "4             6000                0.0       141.0             360.0   \n",
       "\n",
       "   Credit_History Property_Area Loan_Status  \n",
       "0             1.0         Urban           Y  \n",
       "1             1.0         Rural           N  \n",
       "2             1.0         Urban           Y  \n",
       "3             1.0         Urban           Y  \n",
       "4             1.0         Urban           Y  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Y    0.687296\n",
       "N    0.312704\n",
       "Name: Loan_Status, dtype: float64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# % of target composition\n",
    "train_df.Loan_Status.value_counts()/len(train_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Loan_ID              0.000000\n",
       "Gender               2.117264\n",
       "Married              0.488599\n",
       "Dependents           2.442997\n",
       "Education            0.000000\n",
       "Self_Employed        5.211726\n",
       "ApplicantIncome      0.000000\n",
       "CoapplicantIncome    0.000000\n",
       "LoanAmount           3.583062\n",
       "Loan_Amount_Term     2.280130\n",
       "Credit_History       8.143322\n",
       "Property_Area        0.000000\n",
       "Loan_Status          0.000000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Null values in columns\n",
    "train_df.isnull().sum() * 100/len(train_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df['Credit_History'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Add a feature EMI. Formula here https://javatutoring.com/wp-content/uploads/2016/12/emi-calculation-formula.jpg\n",
    "# On account of time paucity, rate of interest was avaergaed for gender. Definitely a better way worth investigating later.\n",
    "\n",
    "dataset = [train_df,test_df]\n",
    "for i in dataset:\n",
    "    l = []\n",
    "    for j in i.Gender.index:\n",
    "        if i.Gender[j] == \"Male\":\n",
    "            r = 8.65/(12*100)\n",
    "        else:\n",
    "            r = 8.6/(12*100)\n",
    "        P = i.LoanAmount[j]*1000\n",
    "        n = i.Loan_Amount_Term[j]\n",
    "        E = P*r*(1 + r)**n/((1 + r)**n - 1)\n",
    "        l.append(E)\n",
    "    i[\"EMI\"] = l"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Add a ratio feature EMI. This we be capacitating loan amount with compined income per record\n",
    "\n",
    "for i in dataset:\n",
    "    i[\"income\"] = i[\"ApplicantIncome\"] + i[\"CoapplicantIncome\"]\n",
    "#     i[\"app_income/loan\"] = [x[0]/x[1] for x in zip(i[\"ApplicantIncome\"],i[\"LoanAmount\"])]\n",
    "    i[\"income_loan_ratio\"] = [x[0]/x[1] for x in zip(i[\"income\"],i[\"LoanAmount\"])]\n",
    "    i.drop([\"CoapplicantIncome\"],axis = 1,inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((614, 15), (367, 14))"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "train_df.shape,test_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 614 entries, 0 to 613\n",
      "Data columns (total 15 columns):\n",
      "Loan_ID              614 non-null object\n",
      "Gender               601 non-null object\n",
      "Married              611 non-null object\n",
      "Dependents           599 non-null object\n",
      "Education            614 non-null object\n",
      "Self_Employed        582 non-null object\n",
      "ApplicantIncome      614 non-null int64\n",
      "LoanAmount           592 non-null float64\n",
      "Loan_Amount_Term     600 non-null float64\n",
      "Credit_History       564 non-null float64\n",
      "Property_Area        614 non-null object\n",
      "Loan_Status          614 non-null object\n",
      "EMI                  578 non-null float64\n",
      "income               614 non-null float64\n",
      "income_loan_ratio    592 non-null float64\n",
      "dtypes: float64(6), int64(1), object(8)\n",
      "memory usage: 72.0+ KB\n"
     ]
    }
   ],
   "source": [
    "train_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Nothing fancy. Simply treat missing values based on data type\n",
    "\n",
    "for i in train_df.columns[train_df.dtypes == \"object\"]:\n",
    "    train_df[i].fillna(train_df[i].mode()[0],inplace = True)\n",
    "for i in train_df.columns[train_df.dtypes != \"object\"]:\n",
    "    train_df[i].fillna(train_df[i].mean(),inplace = True)\n",
    "for i in test_df.columns[test_df.dtypes == \"object\"]:\n",
    "    test_df[i].fillna(train_df[i].mode()[0],inplace = True)\n",
    "for i in train_df.columns[train_df.dtypes != \"object\"]:\n",
    "    test_df[i].fillna(test_df[i].mean(),inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label encode categoricals\n",
    "\n",
    "for i in list(set(train_df.columns[train_df.dtypes == \"object\"]) -set([\"Loan_ID\",'Loan_Status'])):\n",
    "    le = LabelEncoder()\n",
    "    train_df[i] = le.fit_transform(train_df[i].astype(\"str\"))\n",
    "    test_df[i] = le.transform(test_df[i].astype(\"str\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['ApplicantIncome', 'Credit_History', 'Dependents', 'EMI', 'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed', 'income', 'income_loan_ratio'] ['ApplicantIncome', 'Credit_History', 'Dependents', 'EMI', 'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed', 'income', 'income_loan_ratio']\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
       "       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n",
       "       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,\n",
       "       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,\n",
       "       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
       "       silent=True, subsample=1)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create and apply model\n",
    "\n",
    "tra_col = sorted(list(set (train_df.columns)- set([\"Loan_ID\",\"Loan_Status\"])))\n",
    "test_df_col = sorted(list(set (test_df.columns)- set([\"Loan_ID\"])))\n",
    "print(tra_col,test_df_col)\n",
    "\n",
    "\n",
    "import xgboost as xgb\n",
    "from xgboost import XGBClassifier\n",
    "model = XGBClassifier()\n",
    "\n",
    "model.fit(train_df[tra_col],train_df[\"Loan_Status\"])\n",
    "# from scipy import sparse\n",
    "#model.predict(test_df[test_df_col])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Predict and create submission\n",
    "\n",
    "result = pd.DataFrame({\"Loan_ID\":(test_df.Loan_ID),\"Loan_Status\":model.predict(test_df[test_df_col])}).reset_index(drop = True)\n",
    "\n",
    "result[[\"Loan_ID\",\"Loan_Status\"]].to_csv(\"loan_prediction_analyticsvidhya.csv\",index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}