{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"# Machine Learning Using Python (MEAFA Workshop)\n",
"\n",
"## Lesson 10: Neural Networks (Regression)\n",
"\n",
"Contents:\n",
"\n",
"1. [Credit Card Data](#Credit-Card-Data)\n",
"2. [Neural Networks](#Neural-Networks)\n",
"3. [Model Evaluation](#Model-evaluation)\n",
"\n",
"\n",
"This notebook relies on the following imports and setting. We will load new functions and libraries in context to make clear what we are using them for. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Packages\n",
"import numpy as np\n",
"from scipy import stats\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import warnings\n",
"warnings.filterwarnings('ignore') # this is to clear the warnings from this page, usually we should leave them on"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Plot settings\n",
"sns.set_context('notebook') # optimise figures for notebook display\n",
"sns.set_style('ticks') # set default plot style\n",
"# Matplotlib 'tab10' palette (the original had a typo: '#DB2728' instead of the tab10 red '#D62728')\n",
"colours = ['#1F77B4', '#FF7F0E', '#2CA02C', '#D62728', '#9467BD', '#8C564B', '#E377C2','#7F7F7F', '#BCBD22', '#17BECF']\n",
"crayon = ['#4E79A7','#F28E2C','#E15759','#76B7B2','#59A14F', '#EDC949','#AF7AA1','#FF9DA7','#9C755F','#BAB0AB']\n",
"sns.set_palette(colours) # set custom color scheme\n",
"%matplotlib inline\n",
"plt.rcParams['figure.figsize'] = (9, 6)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Methods\n",
"from sklearn.linear_model import LinearRegression\n",
"\n",
"# Model selection and evaluation tools\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
"\n",
"# Data processing\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import StandardScaler"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Credit Card Data\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Income | \n",
" Limit | \n",
" Rating | \n",
" Cards | \n",
" Age | \n",
" Education | \n",
" Gender | \n",
" Student | \n",
" Married | \n",
" Ethnicity | \n",
" Balance | \n",
"
\n",
" \n",
" Obs | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 14.891 | \n",
" 3606 | \n",
" 283 | \n",
" 2 | \n",
" 34 | \n",
" 11 | \n",
" Male | \n",
" No | \n",
" Yes | \n",
" Caucasian | \n",
" 333 | \n",
"
\n",
" \n",
" 2 | \n",
" 106.025 | \n",
" 6645 | \n",
" 483 | \n",
" 3 | \n",
" 82 | \n",
" 15 | \n",
" Female | \n",
" Yes | \n",
" Yes | \n",
" Asian | \n",
" 903 | \n",
"
\n",
" \n",
" 3 | \n",
" 104.593 | \n",
" 7075 | \n",
" 514 | \n",
" 4 | \n",
" 71 | \n",
" 11 | \n",
" Male | \n",
" No | \n",
" No | \n",
" Asian | \n",
" 580 | \n",
"
\n",
" \n",
" 4 | \n",
" 148.924 | \n",
" 9504 | \n",
" 681 | \n",
" 3 | \n",
" 36 | \n",
" 11 | \n",
" Female | \n",
" No | \n",
" No | \n",
" Asian | \n",
" 964 | \n",
"
\n",
" \n",
" 5 | \n",
" 55.882 | \n",
" 4897 | \n",
" 357 | \n",
" 2 | \n",
" 68 | \n",
" 16 | \n",
" Male | \n",
" No | \n",
" Yes | \n",
" Caucasian | \n",
" 331 | \n",
"
\n",
" \n",
" 6 | \n",
" 80.180 | \n",
" 8047 | \n",
" 569 | \n",
" 4 | \n",
" 77 | \n",
" 10 | \n",
" Male | \n",
" No | \n",
" No | \n",
" Caucasian | \n",
" 1151 | \n",
"
\n",
" \n",
" 7 | \n",
" 20.996 | \n",
" 3388 | \n",
" 259 | \n",
" 2 | \n",
" 37 | \n",
" 12 | \n",
" Female | \n",
" No | \n",
" No | \n",
" African American | \n",
" 203 | \n",
"
\n",
" \n",
" 8 | \n",
" 71.408 | \n",
" 7114 | \n",
" 512 | \n",
" 2 | \n",
" 87 | \n",
" 9 | \n",
" Male | \n",
" No | \n",
" No | \n",
" Asian | \n",
" 872 | \n",
"
\n",
" \n",
" 9 | \n",
" 15.125 | \n",
" 3300 | \n",
" 266 | \n",
" 5 | \n",
" 66 | \n",
" 13 | \n",
" Female | \n",
" No | \n",
" No | \n",
" Caucasian | \n",
" 279 | \n",
"
\n",
" \n",
" 10 | \n",
" 71.061 | \n",
" 6819 | \n",
" 491 | \n",
" 3 | \n",
" 41 | \n",
" 19 | \n",
" Female | \n",
" Yes | \n",
" Yes | \n",
" African American | \n",
" 1350 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Income Limit Rating Cards Age Education Gender Student Married \\\n",
"Obs \n",
"1 14.891 3606 283 2 34 11 Male No Yes \n",
"2 106.025 6645 483 3 82 15 Female Yes Yes \n",
"3 104.593 7075 514 4 71 11 Male No No \n",
"4 148.924 9504 681 3 36 11 Female No No \n",
"5 55.882 4897 357 2 68 16 Male No Yes \n",
"6 80.180 8047 569 4 77 10 Male No No \n",
"7 20.996 3388 259 2 37 12 Female No No \n",
"8 71.408 7114 512 2 87 9 Male No No \n",
"9 15.125 3300 266 5 66 13 Female No No \n",
"10 71.061 6819 491 3 41 19 Female Yes Yes \n",
"\n",
" Ethnicity Balance \n",
"Obs \n",
"1 Caucasian 333 \n",
"2 Asian 903 \n",
"3 Asian 580 \n",
"4 Asian 964 \n",
"5 Caucasian 331 \n",
"6 Caucasian 1151 \n",
"7 African American 203 \n",
"8 Asian 872 \n",
"9 Caucasian 279 \n",
"10 African American 1350 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the ISLR credit card dataset, using the observation number as the row index\n",
"data=pd.read_csv('Datasets/Credit.csv', index_col='Obs')\n",
"# Display the first ten observations to inspect the raw columns\n",
"data.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Randomly split the row indexes (train_test_split is already imported in the setup cell above;\n",
"# re-importing it here was redundant)\n",
"index_train, index_test = train_test_split(np.array(data.index), train_size=0.7, random_state=10)\n",
"\n",
"# Write training and test sets\n",
"train = data.loc[index_train, :].copy()\n",
"test = data.loc[index_test, :].copy()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Income | \n",
" Limit | \n",
" Cards | \n",
" Age | \n",
" Education | \n",
" Student | \n",
" Married | \n",
" Balance | \n",
" Male | \n",
" Caucasian | \n",
" Asian | \n",
"
\n",
" \n",
" Obs | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 400 | \n",
" 18.701 | \n",
" 5524 | \n",
" 5 | \n",
" 64 | \n",
" 7 | \n",
" 0 | \n",
" 0 | \n",
" 966 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 26 | \n",
" 14.090 | \n",
" 4323 | \n",
" 5 | \n",
" 25 | \n",
" 16 | \n",
" 0 | \n",
" 1 | \n",
" 671 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 280 | \n",
" 54.319 | \n",
" 3063 | \n",
" 3 | \n",
" 59 | \n",
" 8 | \n",
" 1 | \n",
" 0 | \n",
" 269 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 261 | \n",
" 67.937 | \n",
" 5184 | \n",
" 4 | \n",
" 63 | \n",
" 12 | \n",
" 0 | \n",
" 1 | \n",
" 345 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 131 | \n",
" 23.793 | \n",
" 3821 | \n",
" 4 | \n",
" 56 | \n",
" 12 | \n",
" 1 | \n",
" 1 | \n",
" 868 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Income Limit Cards Age Education Student Married Balance Male \\\n",
"Obs \n",
"400 18.701 5524 5 64 7 0 0 966 0 \n",
"26 14.090 4323 5 25 16 0 1 671 0 \n",
"280 54.319 3063 3 59 8 1 0 269 0 \n",
"261 67.937 5184 4 63 12 0 1 345 1 \n",
"131 23.793 3821 4 56 12 1 1 868 0 \n",
"\n",
" Caucasian Asian \n",
"Obs \n",
"400 0 1 \n",
"26 0 0 \n",
"280 1 0 \n",
"261 0 1 \n",
"131 0 0 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def prepare_data(df):\n",
"    \"\"\"Encode the categorical columns as 0/1 dummies and return a fully numerical frame.\n",
"\n",
"    Mutates df by adding the dummy columns, then returns a frame with the\n",
"    remaining object columns and the collinear 'Rating' column dropped.\n",
"    \"\"\"\n",
"    # Strip whitespace before comparing: the raw CSV stores gender as ' Male'\n",
"    # (with a leading space), which the original '== \" Male\"' comparison\n",
"    # silently relied on. Stripping makes the dummy robust to clean data too.\n",
"    df['Male'] = (df['Gender'].str.strip() == 'Male').astype(int)\n",
"    df['Student'] = (df['Student'] == 'Yes').astype(int)\n",
"    df['Married'] = (df['Married'] == 'Yes').astype(int)\n",
"    df['Caucasian'] = (df['Ethnicity'] == 'Caucasian').astype(int)\n",
"    df['Asian'] = (df['Ethnicity'] == 'Asian').astype(int)\n",
"    df = df.loc[:, df.dtypes != 'object']  # discard the remaining non-numerical columns\n",
"    df = df.drop('Rating', axis=1)  # Rating is collinear with Limit\n",
"    return df\n",
"\n",
"train = prepare_data(train)\n",
"test = prepare_data(test)\n",
"\n",
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(120, 10)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Constructing response vector and design matrix (matrix of predictor values)\n",
"response = 'Balance'\n",
"predictors = list(train.columns.values)\n",
"predictors.remove(response)\n",
"\n",
"y_train = train[response].copy()\n",
"y_test = test[response].copy()\n",
"\n",
"# Standardise the predictors: fit the scaler on the training data only, then\n",
"# apply the same transformation to the test data (avoids test-set leakage)\n",
"scaler = StandardScaler()\n",
"X_train = scaler.fit_transform(train[predictors])\n",
"X_test = scaler.transform(test[predictors])\n",
"\n",
"X_test.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Neural Networks"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"from keras.models import Sequential\n",
"from keras.layers import Dense\n",
"from keras.layers import Dropout\n",
"from keras.layers import Activation\n",
"from keras.wrappers.scikit_learn import KerasRegressor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Single Layer Perceptron"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Single hidden layer with 24 ReLU units; linear output for regression\n",
"slp = Sequential()\n",
"# 'kernel_initializer' is the Keras 2 name for the removed Keras 1 'init' argument\n",
"slp.add(Dense(24, input_dim=10, kernel_initializer='uniform', activation='relu'))\n",
"slp.add(Dense(1))\n",
"slp.compile(loss='mse', optimizer='adam')\n",
"slp.fit(X_train, y_train, epochs=15000, verbose=0)  # 15000 epochs is slow; reduce when experimenting"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using the Scikit-Learn Wrapper"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-16313.7029001 , -15914.1558259 , -14988.69404694, -29202.73997519,\n",
" -16337.20212685])"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def build_model():\n",
"    \"\"\"Build and compile the single-hidden-layer regression network for KerasRegressor.\"\"\"\n",
"    model = Sequential()\n",
"    # 'kernel_initializer' is the Keras 2 name for the removed Keras 1 'init' argument\n",
"    model.add(Dense(24, input_dim=10, kernel_initializer='uniform', activation='relu'))\n",
"    model.add(Dense(1))\n",
"    model.compile(loss='mse', optimizer='adam')\n",
"    return model\n",
"\n",
"# Wrap the Keras model so it can be used with scikit-learn tools such as cross_val_score\n",
"estimator = KerasRegressor(build_fn=build_model, epochs=1000, verbose=0)\n",
"cross_val_score(estimator, X_train, y_train, cv=5, scoring='neg_mean_squared_error')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Dropout"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nn = Sequential()\n",
"# 'kernel_initializer' is the Keras 2 name for the removed Keras 1 'init' argument\n",
"nn.add(Dense(24, input_dim=10, kernel_initializer='uniform', activation='relu'))\n",
"nn.add(Dropout(0.2)) # adding 20% dropout to the hidden layer\n",
"nn.add(Dense(1))\n",
"nn.compile(loss='mse', optimizer='adam')\n",
"nn.fit(X_train, y_train, epochs=100, verbose=0) # few epochs just for illustration"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model evaluation\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" RMSE | \n",
" R-Squared | \n",
" MAE | \n",
"
\n",
" \n",
" \n",
" \n",
" Linear Regression | \n",
" 97.19 | \n",
" 0.96 | \n",
" 80.03 | \n",
"
\n",
" \n",
" Neural | \n",
" 11.50 | \n",
" 1.00 | \n",
" 8.45 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" RMSE R-Squared MAE\n",
"Linear Regression 97.19 0.96 80.03\n",
"Neural 11.50 1.00 8.45"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Benchmark: ordinary least squares linear regression\n",
"ols = LinearRegression()\n",
"ols.fit(X_train, y_train)\n",
"\n",
"# Initialise results table (original row label 'Neural ' had a trailing space)\n",
"columns = ['RMSE', 'R-Squared', 'MAE']\n",
"rows = ['Linear Regression', 'Neural Network']\n",
"results = pd.DataFrame(0.0, columns=columns, index=rows)\n",
"\n",
"# List algorithms (must be in the same order as the rows above)\n",
"methods = [ols, slp]\n",
"\n",
"# Compute test predictions and metrics\n",
"for i, method in enumerate(methods):\n",
"    y_pred = method.predict(X_test)\n",
"    results.iloc[i, 0] = np.sqrt(mean_squared_error(y_test, y_pred))\n",
"    results.iloc[i, 1] = r2_score(y_test, y_pred)\n",
"    results.iloc[i, 2] = mean_absolute_error(y_test, y_pred)\n",
"\n",
"results.round(2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}