{ "cells": [ { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Import libraries\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.ensemble import RandomForestClassifier\n", "import lightgbm as lgb\n", "import scipy.stats\n", "\n", "datapath='Data/'" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Read the train and test CSV files found under datapath\n", "train_df, test_df = (\n", "    pd.read_csv(datapath + csv_name)\n", "    for csv_name in ('train_u6lujuX_CVtuZ9i.csv', 'test_Y3wMUE5_7gLdaTN.csv')\n", ")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Loan_IDGenderMarriedDependentsEducationSelf_EmployedApplicantIncomeCoapplicantIncomeLoanAmountLoan_Amount_TermCredit_HistoryProperty_AreaLoan_Status
0LP001002MaleNo0GraduateNo58490.0NaN360.01.0UrbanY
1LP001003MaleYes1GraduateNo45831508.0128.0360.01.0RuralN
2LP001005MaleYes0GraduateYes30000.066.0360.01.0UrbanY
3LP001006MaleYes0Not GraduateNo25832358.0120.0360.01.0UrbanY
4LP001008MaleNo0GraduateNo60000.0141.0360.01.0UrbanY
\n", "
" ], "text/plain": [ " Loan_ID Gender Married Dependents Education Self_Employed \\\n", "0 LP001002 Male No 0 Graduate No \n", "1 LP001003 Male Yes 1 Graduate No \n", "2 LP001005 Male Yes 0 Graduate Yes \n", "3 LP001006 Male Yes 0 Not Graduate No \n", "4 LP001008 Male No 0 Graduate No \n", "\n", " ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term \\\n", "0 5849 0.0 NaN 360.0 \n", "1 4583 1508.0 128.0 360.0 \n", "2 3000 0.0 66.0 360.0 \n", "3 2583 2358.0 120.0 360.0 \n", "4 6000 0.0 141.0 360.0 \n", "\n", " Credit_History Property_Area Loan_Status \n", "0 1.0 Urban Y \n", "1 1.0 Rural N \n", "2 1.0 Urban Y \n", "3 1.0 Urban Y \n", "4 1.0 Urban Y " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df.head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Y 0.687296\n", "N 0.312704\n", "Name: Loan_Status, dtype: float64" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Share of each target class (Loan_Status) in the training data\n", "train_df.Loan_Status.value_counts()/len(train_df)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Loan_ID 0.000000\n", "Gender 2.117264\n", "Married 0.488599\n", "Dependents 2.442997\n", "Education 0.000000\n", "Self_Employed 5.211726\n", "ApplicantIncome 0.000000\n", "CoapplicantIncome 0.000000\n", "LoanAmount 3.583062\n", "Loan_Amount_Term 2.280130\n", "Credit_History 8.143322\n", "Property_Area 0.000000\n", "Loan_Status 0.000000\n", "dtype: float64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Percentage of missing values per column\n", "train_df.isnull().sum() * 100/len(train_df)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"train_df['Credit_History'].nunique()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "\n", "# Add an EMI feature: EMI = P*r*(1 + r)**n / ((1 + r)**n - 1).\n", "# Formula here https://javatutoring.com/wp-content/uploads/2016/12/emi-calculation-formula.jpg\n", "# P is LoanAmount*1000, n is Loan_Amount_Term and r is a yearly percentage rate divided by 12*100.\n", "# For lack of time the rate is a flat average per gender (8.65 for \"Male\", 8.6 for anything else,\n", "# including a missing Gender). Definitely a better way worth investigating later.\n", "\n", "dataset = [train_df, test_df]\n", "for frame in dataset:\n", "    # Vectorised over all rows; a missing LoanAmount or Loan_Amount_Term gives a NaN EMI.\n", "    rate = frame[\"Gender\"].eq(\"Male\").map({True: 8.65, False: 8.6}) / (12 * 100)\n", "    principal = frame[\"LoanAmount\"] * 1000\n", "    growth = (1 + rate) ** frame[\"Loan_Amount_Term\"]\n", "    frame[\"EMI\"] = principal * rate * growth / (growth - 1)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "\n", "# Add the combined income (applicant + coapplicant) and an income-to-loan ratio feature:\n", "# combined income per unit of LoanAmount for each record.\n", "\n", "for frame in dataset:\n", "    # CoapplicantIncome is dropped at the end of this step, so skip frames that were\n", "    # already processed; this keeps the cell safe to re-run instead of raising KeyError.\n", "    if \"CoapplicantIncome\" not in frame.columns:\n", "        continue\n", "    frame[\"income\"] = frame[\"ApplicantIncome\"] + frame[\"CoapplicantIncome\"]\n", "    frame[\"income_loan_ratio\"] = frame[\"income\"] / frame[\"LoanAmount\"]\n", "    frame.drop([\"CoapplicantIncome\"], axis = 1, inplace = True)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((614, 15), (367, 14))" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "train_df.shape,test_df.shape" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 614 entries, 0 to 613\n", "Data columns (total 15 columns):\n", "Loan_ID 614 non-null object\n", "Gender 601 non-null object\n", "Married 611 non-null object\n", "Dependents 599 non-null object\n", "Education 614 non-null object\n", "Self_Employed 582 non-null object\n", "ApplicantIncome 614 non-null 
int64\n", "LoanAmount 592 non-null float64\n", "Loan_Amount_Term 600 non-null float64\n", "Credit_History 564 non-null float64\n", "Property_Area 614 non-null object\n", "Loan_Status 614 non-null object\n", "EMI 578 non-null float64\n", "income 614 non-null float64\n", "income_loan_ratio 592 non-null float64\n", "dtypes: float64(6), int64(1), object(8)\n", "memory usage: 72.0+ KB\n" ] } ], "source": [ "train_df.info()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# Nothing fancy: impute missing values by dtype (mean for numeric columns, most frequent\n", "# value for the others). Fill values are learned from train_df only and reused for test_df,\n", "# so both frames are treated the same way.\n", "# NOTE: Credit_History has only two distinct values but is mean-imputed like any numeric column.\n", "\n", "fill_values = {\n", "    col: train_df[col].mean() if pd.api.types.is_numeric_dtype(train_df[col]) else train_df[col].mode()[0]\n", "    for col in train_df.columns\n", "}\n", "for col, fill_value in fill_values.items():\n", "    # Assign back instead of calling fillna(inplace=True) on train_df[col]: in-place edits of\n", "    # a column selection are not guaranteed to reach the parent frame (pandas copy-on-write).\n", "    train_df[col] = train_df[col].fillna(fill_value)\n", "    if col in test_df.columns:  # train_df has one column more than test_df\n", "        test_df[col] = test_df[col].fillna(fill_value)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# Label-encode the categorical feature columns. Loan_ID and Loan_Status are skipped here.\n", "# Each encoder is fitted on train_df and reused for test_df so the integer codes agree;\n", "# transform raises ValueError if test_df holds a category never seen in train_df.\n", "\n", "categorical_cols = [\n", "    col for col in train_df.columns\n", "    if col not in (\"Loan_ID\", \"Loan_Status\") and not pd.api.types.is_numeric_dtype(train_df[col])\n", "]\n", "for col in categorical_cols:\n", "    encoder = LabelEncoder()\n", "    train_df[col] = encoder.fit_transform(train_df[col].astype(\"str\"))\n", "    test_df[col] = encoder.transform(test_df[col].astype(\"str\"))" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['ApplicantIncome', 'Credit_History', 'Dependents', 'EMI', 'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed', 'income', 'income_loan_ratio'] ['ApplicantIncome', 'Credit_History', 'Dependents', 'EMI', 'Education', 'Gender', 'LoanAmount', 'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed', 'income', 'income_loan_ratio']\n" ] }, 
{ "data": { "text/plain": [ "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n", " colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n", " max_depth=3, min_child_weight=1, missing=None, n_estimators=100,\n", " n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,\n", " reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n", " silent=True, subsample=1)" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create and fit the model\n", "\n", "import xgboost as xgb\n", "from xgboost import XGBClassifier\n", "\n", "# One shared, sorted feature list keeps train and test columns in the same order.\n", "tra_col = sorted(set(train_df.columns) - {\"Loan_ID\", \"Loan_Status\"})\n", "test_df_col = list(tra_col)\n", "absent = sorted(set(test_df_col) - set(test_df.columns))\n", "assert not absent, \"test_df is missing feature columns: %s\" % absent\n", "print(tra_col,test_df_col)\n", "\n", "# Encode the Y/N target as integers: current xgboost releases reject string class labels.\n", "# target_encoder is kept so predictions can be mapped back to the original labels.\n", "target_encoder = LabelEncoder()\n", "target = target_encoder.fit_transform(train_df[\"Loan_Status\"])\n", "\n", "model = XGBClassifier()\n", "model.fit(train_df[tra_col], target)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# Predict on test_df and write the submission file\n", "\n", "# The model outputs integer class codes; map them back to the original Loan_Status labels.\n", "predicted_status = target_encoder.inverse_transform(model.predict(test_df[test_df_col]))\n", "result = pd.DataFrame({\"Loan_ID\": test_df.Loan_ID, \"Loan_Status\": predicted_status}).reset_index(drop = True)\n", "\n", "result[[\"Loan_ID\",\"Loan_Status\"]].to_csv(\"loan_prediction_analyticsvidhya.csv\",index = False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.6" } }, "nbformat": 4, "nbformat_minor": 2 }