{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Machine Learning Using Python (MEAFA Workshop)\n", "\n", "## Lesson 10: Neural Networks (Regression)\n", "\n", "- Credit Card Data\n", "- Neural Networks\n", "- Model Evaluation
\n", "\n", "\n", "This notebook relies on the following imports and settings. We will load new functions and libraries in context to make clear what we are using them for. " ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Packages\n", "import numpy as np\n", "from scipy import stats\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import warnings\n", "warnings.filterwarnings('ignore') # this is to clear the warnings from this page, usually we should leave them on" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Plot settings\n", "sns.set_context('notebook') # optimise figures for notebook display\n", "sns.set_style('ticks') # set default plot style\n", "colours = ['#1F77B4', '#FF7F0E', '#2CA02C', '#DB2728', '#9467BD', '#8C564B', '#E377C2','#7F7F7F', '#BCBD22', '#17BECF']\n", "crayon = ['#4E79A7','#F28E2C','#E15759','#76B7B2','#59A14F', '#EDC949','#AF7AA1','#FF9DA7','#9C755F','#BAB0AB']\n", "sns.set_palette(colours) # set custom color scheme\n", "%matplotlib inline\n", "plt.rcParams['figure.figsize'] = (9, 6)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Methods\n", "from sklearn.linear_model import LinearRegression\n", "\n", "# Model selection and evaluation tools\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n", "\n", "# Data processing\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import StandardScaler" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Credit Card Data\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { 
"data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IncomeLimitRatingCardsAgeEducationGenderStudentMarriedEthnicityBalance
Obs
114.891360628323411MaleNoYesCaucasian333
2106.025664548338215FemaleYesYesAsian903
3104.593707551447111MaleNoNoAsian580
4148.924950468133611FemaleNoNoAsian964
555.882489735726816MaleNoYesCaucasian331
680.180804756947710MaleNoNoCaucasian1151
720.996338825923712FemaleNoNoAfrican American203
871.40871145122879MaleNoNoAsian872
915.125330026656613FemaleNoNoCaucasian279
1071.061681949134119FemaleYesYesAfrican American1350
\n", "
" ], "text/plain": [ " Income Limit Rating Cards Age Education Gender Student Married \\\n", "Obs \n", "1 14.891 3606 283 2 34 11 Male No Yes \n", "2 106.025 6645 483 3 82 15 Female Yes Yes \n", "3 104.593 7075 514 4 71 11 Male No No \n", "4 148.924 9504 681 3 36 11 Female No No \n", "5 55.882 4897 357 2 68 16 Male No Yes \n", "6 80.180 8047 569 4 77 10 Male No No \n", "7 20.996 3388 259 2 37 12 Female No No \n", "8 71.408 7114 512 2 87 9 Male No No \n", "9 15.125 3300 266 5 66 13 Female No No \n", "10 71.061 6819 491 3 41 19 Female Yes Yes \n", "\n", " Ethnicity Balance \n", "Obs \n", "1 Caucasian 333 \n", "2 Asian 903 \n", "3 Asian 580 \n", "4 Asian 964 \n", "5 Caucasian 331 \n", "6 Caucasian 1151 \n", "7 African American 203 \n", "8 Asian 872 \n", "9 Caucasian 279 \n", "10 African American 1350 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data=pd.read_csv('Datasets/Credit.csv', index_col='Obs')\n", "data.head(10)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "# Randomly split indexes\n", "index_train, index_test = train_test_split(np.array(data.index), train_size=0.7, random_state=10)\n", "\n", "# Write training and test sets \n", "train = data.loc[index_train,:].copy()\n", "test = data.loc[index_test,:].copy()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IncomeLimitCardsAgeEducationStudentMarriedBalanceMaleCaucasianAsian
Obs
40018.7015524564700966001
2614.09043235251601671000
28054.3193063359810269010
26167.93751844631201345101
13123.79338214561211868000
\n", "
" ], "text/plain": [ " Income Limit Cards Age Education Student Married Balance Male \\\n", "Obs \n", "400 18.701 5524 5 64 7 0 0 966 0 \n", "26 14.090 4323 5 25 16 0 1 671 0 \n", "280 54.319 3063 3 59 8 1 0 269 0 \n", "261 67.937 5184 4 63 12 0 1 345 1 \n", "131 23.793 3821 4 56 12 1 1 868 0 \n", "\n", " Caucasian Asian \n", "Obs \n", "400 0 1 \n", "26 0 0 \n", "280 1 0 \n", "261 0 1 \n", "131 0 0 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def prepare_data(df):\n", " df['Male']=(df['Gender'] ==' Male').astype(int) # create dummy variable for gender\n", " df['Student']=(df['Student'] =='Yes').astype(int)\n", " df['Married']=(df['Married'] =='Yes').astype(int)\n", " df['Caucasian']=(df['Ethnicity'] =='Caucasian').astype(int)\n", " df['Asian']=(df['Ethnicity'] =='Asian').astype(int)\n", " df=df.loc[:, df.dtypes!='object'] # discards the columns that are not numerical\n", " df=df.drop('Rating', axis=1) # collinear with limit\n", " return df\n", "\n", "train = prepare_data(train)\n", "test = prepare_data(test)\n", "\n", "train.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(120, 10)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Constructing response vector and design matrix (matrix of predictor values) \n", "response = 'Balance'\n", "predictors = list(train.columns.values)\n", "predictors.remove(response)\n", "\n", "y_train = train[response].copy()\n", "y_test = test[response].copy()\n", "\n", "X_train=train[predictors].copy() # selects the variables in the predictor list\n", "\n", "scaler = StandardScaler()\n", "scaler.fit(X_train)\n", "X_train = scaler.transform(X_train)\n", "X_test = scaler.transform(test[predictors].values)\n", "\n", "X_test.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Neural Networks" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, 
"outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] } ], "source": [ "from keras.models import Sequential\n", "from keras.layers import Dense\n", "from keras.layers import Dropout\n", "from keras.layers import Activation\n", "from keras.wrappers.scikit_learn import KerasRegressor" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Single Layer Perceptron" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "slp = Sequential()\n", "slp.add(Dense(24, input_dim=10, init='uniform', activation='relu'))\n", "slp.add(Dense(1))\n", "slp.compile(loss='mse', optimizer='adam')\n", "slp.fit(X_train, y_train, epochs=15000, verbose=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using the Scikit-Learn Wrapper" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([-16313.7029001 , -15914.1558259 , -14988.69404694, -29202.73997519,\n", " -16337.20212685])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def build_model():\n", " model = Sequential()\n", " model.add(Dense(24, input_dim=10, init='uniform', activation='relu'))\n", " model.add(Dense(1))\n", " model.compile(loss='mse', optimizer='adam')\n", " return model\n", "\n", "estimator = KerasRegressor(build_fn=build_model, epochs=1000, verbose=0)\n", "cross_val_score(estimator, X_train, y_train, cv=5, scoring = 'neg_mean_squared_error')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dropout" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nn = Sequential()\n", "nn.add(Dense(24, input_dim=10, init='uniform', 
activation='relu'))\n", "nn.add(Dropout(0.2)) # adding 20% dropout to the hidden layers\n", "nn.add(Dense(1))\n", "nn.compile(loss='mse', optimizer='adam')\n", "nn.fit(X_train, y_train, epochs=100, verbose=0) # few epochs just for illustration" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model evaluation\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RMSER-SquaredMAE
Linear Regression97.190.9680.03
Neural11.501.008.45
\n", "
" ], "text/plain": [ " RMSE R-Squared MAE\n", "Linear Regression 97.19 0.96 80.03\n", "Neural 11.50 1.00 8.45" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Benchmark\n", "ols = LinearRegression()\n", "ols.fit(X_train, y_train)\n", "\n", "\n", "# Initialise table\n", "columns=['RMSE', 'R-Squared', 'MAE']\n", "rows=['Linear Regression', 'Neural ']\n", "results =pd.DataFrame(0.0, columns=columns, index=rows)\n", "\n", "# List algorithms\n", "methods = [ols, slp] \n", "\n", "# Compute test predictions and metrics\n", "for i, method in enumerate(methods):\n", " \n", " y_pred = method.predict(X_test)\n", " results.iloc[i, 0] = np.sqrt(mean_squared_error(y_test, y_pred))\n", " results.iloc[i, 1] = r2_score(y_test, y_pred)\n", " results.iloc[i, 2] = mean_absolute_error(y_test, y_pred) \n", "\n", "results.round(2)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.2" } }, "nbformat": 4, "nbformat_minor": 1 }