{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing of data set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports and tunable constants live in this one cell.\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "TRAIN_URL = 'https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/train.csv'\n",
    "TEST_URL = 'https://s3-ap-southeast-1.amazonaws.com/av-datahack-datacamp/test.csv'\n",
    "SEED = 42  # fixed seed so the tree-based models give the same result on every run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv(TRAIN_URL)\n",
    "test = pd.read_csv(TEST_URL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Combine train and test so both receive identical preprocessing.\n",
    "# assign() adds the 'Type' flag without mutating the raw frames, and\n",
    "# ignore_index=True gives the combined frame a unique row index.\n",
    "fullData = pd.concat(\n",
    "    [train.assign(Type='Train'), test.assign(Type='Test')],\n",
    "    axis=0, ignore_index=True)\n",
    "\n",
    "# Look at the available missing values in the dataset\n",
    "fullData.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identify categorical and continuous variables\n",
    "ID_col = ['Loan_ID']\n",
    "target_col = ['Loan_Status']\n",
    "cat_cols = ['Credit_History', 'Dependents', 'Gender', 'Married', 'Education', 'Property_Area', 'Self_Employed']\n",
    "other_col = ['Type']  # Test and Train data set identifier\n",
    "\n",
    "# Every remaining column is treated as continuous. A list comprehension\n",
    "# (instead of set arithmetic) keeps the column order deterministic.\n",
    "excluded_cols = set(cat_cols + ID_col + target_col + other_col)\n",
    "num_cols = [col for col in fullData.columns if col not in excluded_cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Impute missing values with the mean for continuous variables.\n",
    "# NOTE: fillna(..., inplace=True) returns None, so its result must never be\n",
    "# assigned back; the non-inplace form returns the filled frame.\n",
    "fullData[num_cols] = fullData[num_cols].fillna(fullData[num_cols].mean())\n",
    "\n",
    "# Impute missing values with the mode for categorical variables.\n",
    "# mode() may return several rows when there are ties; the first row is used.\n",
    "cat_imput = fullData[cat_cols].mode().iloc[0]\n",
    "fullData[cat_cols] = fullData[cat_cols].fillna(cat_imput)\n",
    "\n",
    "assert not fullData[num_cols + cat_cols].isnull().values.any(), 'imputation left missing values'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a new column as Total Income\n",
    "fullData['TotalIncome'] = fullData['ApplicantIncome'] + fullData['CoapplicantIncome']\n",
    "\n",
    "# Take log(TotalIncome + 1); the +1 maps a zero income to 0 instead of -inf\n",
    "fullData['Log_TotalIncome'] = np.log1p(fullData['TotalIncome'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label-encode every categorical feature, each with its own throwaway encoder\n",
    "for var in cat_cols:\n",
    "    fullData[var] = LabelEncoder().fit_transform(fullData[var].astype('str'))\n",
    "\n",
    "# .copy() turns the filtered slices into independent frames, so assigning a\n",
    "# column to them below does not raise SettingWithCopyWarning.\n",
    "train_modified = fullData[fullData['Type'] == 'Train'].copy()\n",
    "test_modified = fullData[fullData['Type'] == 'Test'].copy()\n",
    "\n",
    "# Dedicated encoder for the target. It is kept under its own name because it\n",
    "# is needed later to map predictions back to the original labels.\n",
    "target_encoder = LabelEncoder()\n",
    "train_modified['Loan_Status'] = target_encoder.fit_transform(train_modified['Loan_Status'].astype('str'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Shared modelling helper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fit_and_submit(model, predictors, submission_path):\n",
    "    \"\"\"Fit `model` on the training rows and write test-set predictions to a CSV.\n",
    "\n",
    "    Reads the notebook-level frames `train_modified` / `test_modified` and the\n",
    "    notebook-level `target_encoder`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    model : estimator exposing fit(X, y) and predict(X)\n",
    "    predictors : list of column names used as features\n",
    "    submission_path : path of the CSV file to write\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    The fitted `model`.\n",
    "    \"\"\"\n",
    "    x_train = train_modified[list(predictors)].values\n",
    "    y_train = train_modified['Loan_Status'].values\n",
    "    x_test = test_modified[list(predictors)].values\n",
    "\n",
    "    # Train the model using the training set\n",
    "    model.fit(x_train, y_train)\n",
    "\n",
    "    # Predict, then reverse the target encoding to recover the original labels\n",
    "    predicted = target_encoder.inverse_transform(model.predict(x_test))\n",
    "\n",
    "    # Build a separate submission frame rather than overwriting test_modified.\n",
    "    # index=False keeps the DataFrame index out of the file, so the CSV holds\n",
    "    # exactly the two submission columns.\n",
    "    submission = pd.DataFrame({'Loan_ID': test_modified['Loan_ID'].values,\n",
    "                               'Loan_Status': predicted})\n",
    "    submission.to_csv(submission_path, columns=['Loan_ID', 'Loan_Status'], index=False)\n",
    "    return model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Building Logistic Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictors = ['Credit_History', 'Education', 'Gender']\n",
    "\n",
    "# Create the logistic regression object, fit it and write the submission file\n",
    "model = fit_and_submit(LogisticRegression(), predictors, 'Submission1.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Building Decision Tree Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictors = ['Credit_History', 'Education', 'Gender']\n",
    "\n",
    "# Create the decision tree object, fit it and write the submission file\n",
    "model = fit_and_submit(DecisionTreeClassifier(random_state=SEED), predictors, 'Submission2.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Building Random Forest Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictors = ['ApplicantIncome', 'CoapplicantIncome', 'Credit_History', 'Dependents', 'Education', 'Gender', 'LoanAmount',\n",
    "              'Loan_Amount_Term', 'Married', 'Property_Area', 'Self_Employed', 'TotalIncome', 'Log_TotalIncome']\n",
    "\n",
    "# Create the random forest object, fit it and write the submission file\n",
    "model = fit_and_submit(RandomForestClassifier(random_state=SEED), predictors, 'Submission3.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature importances of the random forest fitted in the previous cell,\n",
    "# largest first. `model` and `predictors` must come from that cell.\n",
    "featimp = pd.Series(model.feature_importances_, index=predictors).sort_values(ascending=False)\n",
    "featimp"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exploring label encoding on the raw training data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode Gender of the raw (un-imputed) training data with its own encoder.\n",
    "# The codes go into a separate Series, so neither the raw frame nor\n",
    "# target_encoder is touched and the model cells above stay re-runnable.\n",
    "gender_encoder = LabelEncoder()\n",
    "gender_codes = pd.Series(gender_encoder.fit_transform(train['Gender'].astype('str')),\n",
    "                         index=train.index, name='Gender')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show a sample instead of dumping the whole Series\n",
    "gender_codes.head(10)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}