{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Machine Learning Using Python (MEAFA Workshop)\n",
"\n",
"## Lesson 10: Neural Networks (Classification)\n",
"\n",
"Contents:\n",
"\n",
"1. [Work Analytics Data](#Work-Analytics-Data)\n",
"2. [Neural Networks](#Neural-Networks)\n",
"3. [Model Evaluation](#Model-Evaluation)\n",
"\n",
"This notebook relies on the following libraries and settings."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Packages\n",
"import numpy as np\n",
"from scipy import stats\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import warnings\n",
"warnings.filterwarnings('ignore') "
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Plot settings\n",
"sns.set_context('notebook') \n",
"sns.set_style('ticks') \n",
"# Matplotlib 'tab10' colour cycle; the red entry is #D62728 (#DB2728 was a typo)\n",
"colours = ['#1F77B4', '#FF7F0E', '#2CA02C', '#D62728', '#9467BD', '#8C564B', '#E377C2','#7F7F7F', '#BCBD22', '#17BECF']\n",
"crayon = ['#4E79A7','#F28E2C','#E15759','#76B7B2','#59A14F', '#EDC949','#AF7AA1','#FF9DA7','#9C755F','#BAB0AB']\n",
"sns.set_palette(colours)\n",
"%matplotlib inline\n",
"plt.rcParams['figure.figsize'] = (9, 6)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.model_selection import GridSearchCV, RandomizedSearchCV\n",
"from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score\n",
"from sklearn.metrics import precision_score, average_precision_score, log_loss\n",
"\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.pipeline import Pipeline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Work Analytics Data\n",
"\n",
"We use the [Human Resources Analytics](https://www.kaggle.com/ludobenistant/hr-analytics) data available from [Kaggle Datasets](https://www.kaggle.com/datasets).\n",
"\n",
"**Business objective:** To predict which employees will leave the company."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" satisfaction_level | \n",
" last_evaluation | \n",
" number_project | \n",
" average_montly_hours | \n",
" time_spend_company | \n",
" Work_accident | \n",
" left | \n",
" promotion_last_5years | \n",
" role | \n",
" salary | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.38 | \n",
" 0.53 | \n",
" 2 | \n",
" 157 | \n",
" 3 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" sales | \n",
" low | \n",
"
\n",
" \n",
" 1 | \n",
" 0.80 | \n",
" 0.86 | \n",
" 5 | \n",
" 262 | \n",
" 6 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" sales | \n",
" medium | \n",
"
\n",
" \n",
" 2 | \n",
" 0.11 | \n",
" 0.88 | \n",
" 7 | \n",
" 272 | \n",
" 4 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" sales | \n",
" medium | \n",
"
\n",
" \n",
" 3 | \n",
" 0.72 | \n",
" 0.87 | \n",
" 5 | \n",
" 223 | \n",
" 5 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" sales | \n",
" low | \n",
"
\n",
" \n",
" 4 | \n",
" 0.37 | \n",
" 0.52 | \n",
" 2 | \n",
" 159 | \n",
" 3 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" sales | \n",
" low | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" satisfaction_level last_evaluation number_project average_montly_hours \\\n",
"0 0.38 0.53 2 157 \n",
"1 0.80 0.86 5 262 \n",
"2 0.11 0.88 7 272 \n",
"3 0.72 0.87 5 223 \n",
"4 0.37 0.52 2 159 \n",
"\n",
" time_spend_company Work_accident left promotion_last_5years role \\\n",
"0 3 0 1 0 sales \n",
"1 6 0 1 0 sales \n",
"2 4 0 1 0 sales \n",
"3 5 0 1 0 sales \n",
"4 3 0 1 0 sales \n",
"\n",
" salary \n",
"0 low \n",
"1 medium \n",
"2 medium \n",
"3 low \n",
"4 low "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.read_csv('Datasets/HR.csv')\n",
"data = data.rename(columns = {'sales' : 'role'})\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Response is the binary 'left' indicator; everything else is a predictor.\n",
"response = 'left'\n",
"predictors = list(data.columns.values)\n",
"predictors.remove(response)\n",
"\n",
"# Stratified split on the response. NOTE(review): train_size=0.2 keeps only\n",
"# ~20% of rows (2999) for training -- presumably deliberate so the neural\n",
"# networks below fit quickly in a workshop; confirm before reusing.\n",
"index_train, index_test = train_test_split(np.array(data.index), stratify=data[response], train_size=0.2, random_state=5)\n",
"\n",
"train = data.loc[index_train, :].copy()\n",
"test = data.loc[index_test, :].copy()\n",
"\n",
"y_train = train[response]\n",
"y_test = test[response]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before estimating the models, we need to convert the categorical variables into binary variables. "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Replace each categorical column with drop-first dummy indicators.\n",
"# 'role' keeps the raw category names; 'salary' levels are prefixed to\n",
"# avoid ambiguous column names like 'low'/'medium'.\n",
"for column, prefix in [('role', None), ('salary', 'salary')]:\n",
"    dummies = pd.get_dummies(data[column], prefix=prefix, drop_first=True)\n",
"    data = data.join(dummies)\n",
"    data = data.drop(column, axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We then update our list of predictors accordingly and construct the train and test design matrices. "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2999, 18)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train = data.loc[index_train, :].copy()\n",
"test = data.loc[index_test, :].copy()\n",
"\n",
"# Rebuild the predictor list: dummy columns have replaced 'role' and 'salary'.\n",
"predictors = list(train.columns.values)\n",
"predictors.remove(response)\n",
"\n",
"# Use the train frame directly (previously read from `data`, which is the\n",
"# same rows but obscured the train/test separation).\n",
"X_train = train[predictors].copy()\n",
"\n",
"# Standardise with statistics estimated on the training set only, then apply\n",
"# the same transformation to the test set (avoids information leakage).\n",
"scaler = StandardScaler()\n",
"scaler.fit(X_train)\n",
"\n",
"X_train = scaler.transform(X_train)\n",
"X_test = scaler.transform(test[predictors])\n",
"\n",
"X_train.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Neural Networks"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"from keras.models import Sequential\n",
"from keras.layers import Dense\n",
"from keras.layers import Dropout\n",
"from keras.layers import Activation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Two Hidden Layers"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<keras.callbacks.History object at 0x...>"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Two hidden layers (128 and 64 ReLU units) and a sigmoid output for the\n",
"# binary 'left' response. The Keras 1 argument `init=` was renamed to\n",
"# `kernel_initializer=` in Keras 2 and later removed.\n",
"mlp = Sequential()\n",
"mlp.add(Dense(128, input_dim=X_train.shape[1], kernel_initializer='uniform', activation='relu'))\n",
"mlp.add(Dense(64, kernel_initializer='uniform', activation='relu'))\n",
"mlp.add(Dense(1, activation='sigmoid'))\n",
"mlp.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
"mlp.fit(X_train, y_train, epochs=500, verbose=0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Dropout"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<keras.callbacks.History object at 0x...>"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Same architecture as `mlp` but with 50% dropout after the first hidden\n",
"# layer to regularise. As above, Keras 2 uses `kernel_initializer=`, not the\n",
"# removed Keras 1 `init=` argument.\n",
"model = Sequential()\n",
"model.add(Dense(128, input_dim=X_train.shape[1], kernel_initializer='uniform', activation='relu'))\n",
"model.add(Dropout(0.5))\n",
"model.add(Dense(64, kernel_initializer='uniform', activation='relu'))\n",
"model.add(Dense(1, activation='sigmoid'))\n",
"model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
"model.fit(X_train, y_train, epochs=500, verbose=0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model Evaluation\n",
"\n",
"We estimate a logistic regression as a benchmark."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LogisticRegression(C=100000.0, class_weight=None, dual=False,\n",
" fit_intercept=True, intercept_scaling=1, max_iter=100,\n",
" multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,\n",
" solver='liblinear', tol=0.0001, verbose=0, warm_start=False)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"logit = LogisticRegression(C=1e5)\n",
"logit.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Error rate | \n",
" Sensitivity | \n",
" Specificity | \n",
" AUC | \n",
" Precision | \n",
"
\n",
" \n",
" \n",
" \n",
" Logistic regression | \n",
" 0.225 | \n",
" 0.279 | \n",
" 0.930 | \n",
" 0.822 | \n",
" 0.556 | \n",
"
\n",
" \n",
" Neural Net | \n",
" 0.039 | \n",
" 0.922 | \n",
" 0.973 | \n",
" 0.972 | \n",
" 0.915 | \n",
"
\n",
" \n",
" Neural Net (dropout) | \n",
" 0.034 | \n",
" 0.913 | \n",
" 0.982 | \n",
" 0.974 | \n",
" 0.941 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Error rate Sensitivity Specificity AUC Precision\n",
"Logistic regression 0.225 0.279 0.930 0.822 0.556\n",
"Neural Net 0.039 0.922 0.973 0.972 0.915\n",
"Neural Net (dropout) 0.034 0.913 0.982 0.974 0.941"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"columns = ['Error rate', 'Sensitivity', 'Specificity', 'AUC', 'Precision']\n",
"rows = ['Logistic regression', 'Neural Net', 'Neural Net (dropout)']\n",
"results = pd.DataFrame(0.0, columns=columns, index=rows)\n",
"\n",
"methods = [logit, mlp, model]\n",
"\n",
"for i, method in enumerate(methods):\n",
"\n",
"    # Identity check (`is`, not `!=`) selects the sklearn-vs-Keras API:\n",
"    # sklearn's .predict returns class labels and .predict_proba returns\n",
"    # probabilities; Keras's .predict returns P(left=1) directly.\n",
"    if method is logit:\n",
"        y_pred = method.predict(X_test)\n",
"        y_prob = method.predict_proba(X_test)[:, 1]\n",
"    else:\n",
"        y_prob = method.predict(X_test)\n",
"        y_pred = (y_prob > 0.5).astype(int)\n",
"\n",
"    confusion = confusion_matrix(y_test, y_pred)\n",
"    results.iloc[i, 0] = 1 - accuracy_score(y_test, y_pred)\n",
"    results.iloc[i, 1] = confusion[1, 1]/np.sum(confusion[1, :])  # sensitivity: recall on leavers\n",
"    results.iloc[i, 2] = confusion[0, 0]/np.sum(confusion[0, :])  # specificity: recall on stayers\n",
"    results.iloc[i, 3] = roc_auc_score(y_test, y_prob)\n",
"    results.iloc[i, 4] = precision_score(y_test, y_pred)\n",
"\n",
"results.round(3)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}