{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data retrieval"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Name | \n",
" OverallGrade | \n",
" Obedient | \n",
" ResearchScore | \n",
" ProjectScore | \n",
" Recommend | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Henry | \n",
" A | \n",
" Y | \n",
" 90 | \n",
" 85 | \n",
" Yes | \n",
"
\n",
" \n",
" 1 | \n",
" John | \n",
" C | \n",
" N | \n",
" 85 | \n",
" 51 | \n",
" Yes | \n",
"
\n",
" \n",
" 2 | \n",
" David | \n",
" F | \n",
" N | \n",
" 10 | \n",
" 17 | \n",
" No | \n",
"
\n",
" \n",
" 3 | \n",
" Holmes | \n",
" B | \n",
" Y | \n",
" 75 | \n",
" 71 | \n",
" No | \n",
"
\n",
" \n",
" 4 | \n",
" Marvin | \n",
" E | \n",
" N | \n",
" 20 | \n",
" 30 | \n",
" No | \n",
"
\n",
" \n",
" 5 | \n",
" Simon | \n",
" A | \n",
" Y | \n",
" 92 | \n",
" 79 | \n",
" Yes | \n",
"
\n",
" \n",
" 6 | \n",
" Robert | \n",
" B | \n",
" Y | \n",
" 60 | \n",
" 59 | \n",
" No | \n",
"
\n",
" \n",
" 7 | \n",
" Trent | \n",
" C | \n",
" Y | \n",
" 75 | \n",
" 33 | \n",
" No | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Name OverallGrade Obedient ResearchScore ProjectScore Recommend\n",
"0 Henry A Y 90 85 Yes\n",
"1 John C N 85 51 Yes\n",
"2 David F N 10 17 No\n",
"3 Holmes B Y 75 71 No\n",
"4 Marvin E N 20 30 No\n",
"5 Simon A Y 92 79 Yes\n",
"6 Robert B Y 60 59 No\n",
"7 Trent C Y 75 33 No"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"# turn of warning messages\n",
"pd.options.mode.chained_assignment = None # default='warn'\n",
"\n",
"# get data\n",
"df = pd.read_csv('student_records.csv')\n",
"df"
]
},
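{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "If `student_records.csv` is not present locally, the same frame can be recreated from the values displayed above and written out before re-running the cell above (a minimal sketch; the file name is assumed to match)."
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# optional: recreate the dataset if 'student_records.csv' is missing\n",
  "# (values copied from the frame shown above)\n",
  "import os\n",
  "if not os.path.exists('student_records.csv'):\n",
  "    records = [['Henry', 'A', 'Y', 90, 85, 'Yes'],\n",
  "               ['John', 'C', 'N', 85, 51, 'Yes'],\n",
  "               ['David', 'F', 'N', 10, 17, 'No'],\n",
  "               ['Holmes', 'B', 'Y', 75, 71, 'No'],\n",
  "               ['Marvin', 'E', 'N', 20, 30, 'No'],\n",
  "               ['Simon', 'A', 'Y', 92, 79, 'Yes'],\n",
  "               ['Robert', 'B', 'Y', 60, 59, 'No'],\n",
  "               ['Trent', 'C', 'Y', 75, 33, 'No']]\n",
  "    cols = ['Name', 'OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore', 'Recommend']\n",
  "    pd.DataFrame(records, columns=cols).to_csv('student_records.csv', index=False)"
 ]
},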
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data preparation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Feature extraction and engineering"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# get features and corresponding outcomes\n",
"feature_names = ['OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore']\n",
"training_features = df[feature_names]\n",
"\n",
"outcome_name = ['Recommend']\n",
"outcome_labels = df[outcome_name]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" OverallGrade | \n",
" Obedient | \n",
" ResearchScore | \n",
" ProjectScore | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" A | \n",
" Y | \n",
" 90 | \n",
" 85 | \n",
"
\n",
" \n",
" 1 | \n",
" C | \n",
" N | \n",
" 85 | \n",
" 51 | \n",
"
\n",
" \n",
" 2 | \n",
" F | \n",
" N | \n",
" 10 | \n",
" 17 | \n",
"
\n",
" \n",
" 3 | \n",
" B | \n",
" Y | \n",
" 75 | \n",
" 71 | \n",
"
\n",
" \n",
" 4 | \n",
" E | \n",
" N | \n",
" 20 | \n",
" 30 | \n",
"
\n",
" \n",
" 5 | \n",
" A | \n",
" Y | \n",
" 92 | \n",
" 79 | \n",
"
\n",
" \n",
" 6 | \n",
" B | \n",
" Y | \n",
" 60 | \n",
" 59 | \n",
"
\n",
" \n",
" 7 | \n",
" C | \n",
" Y | \n",
" 75 | \n",
" 33 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" OverallGrade Obedient ResearchScore ProjectScore\n",
"0 A Y 90 85\n",
"1 C N 85 51\n",
"2 F N 10 17\n",
"3 B Y 75 71\n",
"4 E N 20 30\n",
"5 A Y 92 79\n",
"6 B Y 60 59\n",
"7 C Y 75 33"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# view features\n",
"training_features"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Recommend | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Yes | \n",
"
\n",
" \n",
" 1 | \n",
" Yes | \n",
"
\n",
" \n",
" 2 | \n",
" No | \n",
"
\n",
" \n",
" 3 | \n",
" No | \n",
"
\n",
" \n",
" 4 | \n",
" No | \n",
"
\n",
" \n",
" 5 | \n",
" Yes | \n",
"
\n",
" \n",
" 6 | \n",
" No | \n",
"
\n",
" \n",
" 7 | \n",
" No | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Recommend\n",
"0 Yes\n",
"1 Yes\n",
"2 No\n",
"3 No\n",
"4 No\n",
"5 Yes\n",
"6 No\n",
"7 No"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# view outcome labels\n",
"outcome_labels"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# list down features based on type\n",
"numeric_feature_names = ['ResearchScore', 'ProjectScore']\n",
"categoricial_feature_names = ['OverallGrade', 'Obedient']"
]
},
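{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "The lists above are hard-coded; they can also be derived from the column dtypes. A quick sketch, assuming the object-typed columns are the categorical ones and using hypothetical names so the originals stay untouched:"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# derive the same lists from dtypes instead of hard-coding them\n",
  "import numpy as np\n",
  "numeric_cols = training_features.select_dtypes(include=[np.number]).columns.tolist()\n",
  "categorical_cols = training_features.select_dtypes(include=['object']).columns.tolist()\n",
  "print(numeric_cols, categorical_cols)"
 ]
},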
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Numeric Feature Scaling"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" OverallGrade | \n",
" Obedient | \n",
" ResearchScore | \n",
" ProjectScore | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" A | \n",
" Y | \n",
" 0.899583 | \n",
" 1.376650 | \n",
"
\n",
" \n",
" 1 | \n",
" C | \n",
" N | \n",
" 0.730648 | \n",
" -0.091777 | \n",
"
\n",
" \n",
" 2 | \n",
" F | \n",
" N | \n",
" -1.803390 | \n",
" -1.560203 | \n",
"
\n",
" \n",
" 3 | \n",
" B | \n",
" Y | \n",
" 0.392776 | \n",
" 0.772004 | \n",
"
\n",
" \n",
" 4 | \n",
" E | \n",
" N | \n",
" -1.465519 | \n",
" -0.998746 | \n",
"
\n",
" \n",
" 5 | \n",
" A | \n",
" Y | \n",
" 0.967158 | \n",
" 1.117516 | \n",
"
\n",
" \n",
" 6 | \n",
" B | \n",
" Y | \n",
" -0.114032 | \n",
" 0.253735 | \n",
"
\n",
" \n",
" 7 | \n",
" C | \n",
" Y | \n",
" 0.392776 | \n",
" -0.869179 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" OverallGrade Obedient ResearchScore ProjectScore\n",
"0 A Y 0.899583 1.376650\n",
"1 C N 0.730648 -0.091777\n",
"2 F N -1.803390 -1.560203\n",
"3 B Y 0.392776 0.772004\n",
"4 E N -1.465519 -0.998746\n",
"5 A Y 0.967158 1.117516\n",
"6 B Y -0.114032 0.253735\n",
"7 C Y 0.392776 -0.869179"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"ss = StandardScaler()\n",
"\n",
"# fit scaler on numeric features\n",
"ss.fit(training_features[numeric_feature_names])\n",
"\n",
"# scale numeric features now\n",
"training_features[numeric_feature_names] = ss.transform(training_features[numeric_feature_names])\n",
"\n",
"# view updated featureset\n",
"training_features"
]
},
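{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "`StandardScaler` applies the z-score transform z = (x - mean) / std per column, using the means and standard deviations learned during `fit`. A minimal sketch to verify that against the original scores (uses `ss`, `df` and `numeric_feature_names` from the cells above):"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# verify the z-score transform manually: z = (x - mean) / std\n",
  "import numpy as np\n",
  "\n",
  "manual = (df[numeric_feature_names].values - ss.mean_) / ss.scale_\n",
  "# should print True: matches the scaled columns computed above\n",
  "print(np.allclose(manual, training_features[numeric_feature_names].values))"
 ]
},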
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Engineering Categorical Features"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ResearchScore | \n",
" ProjectScore | \n",
" OverallGrade_A | \n",
" OverallGrade_B | \n",
" OverallGrade_C | \n",
" OverallGrade_E | \n",
" OverallGrade_F | \n",
" Obedient_N | \n",
" Obedient_Y | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.899583 | \n",
" 1.376650 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.730648 | \n",
" -0.091777 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" -1.803390 | \n",
" -1.560203 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.392776 | \n",
" 0.772004 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" -1.465519 | \n",
" -0.998746 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.967158 | \n",
" 1.117516 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 6 | \n",
" -0.114032 | \n",
" 0.253735 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 7 | \n",
" 0.392776 | \n",
" -0.869179 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ResearchScore ProjectScore OverallGrade_A OverallGrade_B \\\n",
"0 0.899583 1.376650 1 0 \n",
"1 0.730648 -0.091777 0 0 \n",
"2 -1.803390 -1.560203 0 0 \n",
"3 0.392776 0.772004 0 1 \n",
"4 -1.465519 -0.998746 0 0 \n",
"5 0.967158 1.117516 1 0 \n",
"6 -0.114032 0.253735 0 1 \n",
"7 0.392776 -0.869179 0 0 \n",
"\n",
" OverallGrade_C OverallGrade_E OverallGrade_F Obedient_N Obedient_Y \n",
"0 0 0 0 0 1 \n",
"1 1 0 0 1 0 \n",
"2 0 0 1 1 0 \n",
"3 0 0 0 0 1 \n",
"4 0 1 0 1 0 \n",
"5 0 0 0 0 1 \n",
"6 0 0 0 0 1 \n",
"7 1 0 0 0 1 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"training_features = pd.get_dummies(training_features, columns=categoricial_feature_names)\n",
"# view newly engineering features\n",
"training_features"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# get list of new categorical features\n",
"categorical_engineered_features = list(set(training_features.columns) - set(numeric_feature_names))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Modeling"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n",
" penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
" verbose=0, warm_start=False)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"import numpy as np\n",
"\n",
"# fit the model\n",
"lr = LogisticRegression() \n",
"model = lr.fit(training_features, np.array(outcome_labels['Recommend']))\n",
"# view model parameters\n",
"model"
]
},
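{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Beyond the hyperparameters printed above, the fitted model exposes the learned weights and per-class probabilities; a quick sketch using `model` and `training_features` from the cells above:"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# inspect what the logistic regression learned\n",
  "print(model.classes_)                 # label order used in predict_proba\n",
  "print(model.intercept_, model.coef_)  # one weight per engineered feature\n",
  "print(model.predict_proba(training_features)[:3])  # class probabilities for the first rows"
 ]
},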
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 100.0 %\n",
"Classification Stats:\n",
" precision recall f1-score support\n",
"\n",
" No 1.00 1.00 1.00 5\n",
" Yes 1.00 1.00 1.00 3\n",
"\n",
"avg / total 1.00 1.00 1.00 8\n",
"\n"
]
}
],
"source": [
"# simple evaluation on training data\n",
"pred_labels = model.predict(training_features)\n",
"actual_labels = np.array(outcome_labels['Recommend'])\n",
"\n",
"# evaluate model performance\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.metrics import classification_report\n",
"\n",
"print('Accuracy:', float(accuracy_score(actual_labels, pred_labels))*100, '%')\n",
"print('Classification Stats:')\n",
"print(classification_report(actual_labels, pred_labels))"
]
},
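{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Accuracy measured on the training data itself is optimistic. With only eight samples, a small stratified cross-validation gives a more honest estimate; a hedged sketch (cv=3 is an assumption, chosen because the rarer 'Yes' class has three members):"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# cross-validated accuracy instead of training accuracy\n",
  "from sklearn.model_selection import cross_val_score\n",
  "\n",
  "cv_scores = cross_val_score(LogisticRegression(), training_features,\n",
  "                            np.array(outcome_labels['Recommend']), cv=3)\n",
  "print('CV accuracy: %.2f +/- %.2f' % (cv_scores.mean(), cv_scores.std()))"
 ]
},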
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model Deployment "
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Scaler/scaler.pickle',\n",
" 'Scaler/scaler.pickle_01.npy',\n",
" 'Scaler/scaler.pickle_02.npy',\n",
" 'Scaler/scaler.pickle_03.npy']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.externals import joblib\n",
"import os\n",
"# save models to be deployed on your server\n",
"if not os.path.exists('Model'):\n",
" os.mkdir('Model')\n",
"if not os.path.exists('Scaler'):\n",
" os.mkdir('Scaler') \n",
" \n",
"joblib.dump(model, r'Model/model.pickle') \n",
"joblib.dump(ss, r'Scaler/scaler.pickle') "
]
},
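{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Note: `sklearn.externals.joblib` has been removed in newer scikit-learn releases; with recent versions the standalone `joblib` package provides the same `dump`/`load` API. A small sketch, writing the same file names as above:"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# on newer scikit-learn releases, use the standalone joblib package\n",
  "try:\n",
  "    import joblib as standalone_joblib\n",
  "    standalone_joblib.dump(model, r'Model/model.pickle')\n",
  "    standalone_joblib.dump(ss, r'Scaler/scaler.pickle')\n",
  "except ImportError:\n",
  "    pass  # fall back to the sklearn.externals import used above"
 ]
},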
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prediction in Action"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# load model and scaler objects\n",
"model = joblib.load(r'Model/model.pickle')\n",
"scaler = joblib.load(r'Scaler/scaler.pickle')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Name | \n",
" OverallGrade | \n",
" Obedient | \n",
" ResearchScore | \n",
" ProjectScore | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Nathan | \n",
" F | \n",
" N | \n",
" 30 | \n",
" 20 | \n",
"
\n",
" \n",
" 1 | \n",
" Thomas | \n",
" A | \n",
" Y | \n",
" 78 | \n",
" 80 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Name OverallGrade Obedient ResearchScore ProjectScore\n",
"0 Nathan F N 30 20\n",
"1 Thomas A Y 78 80"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## data retrieval\n",
"new_data = pd.DataFrame([{'Name': 'Nathan', 'OverallGrade': 'F', 'Obedient': 'N', 'ResearchScore': 30, 'ProjectScore': 20},\n",
" {'Name': 'Thomas', 'OverallGrade': 'A', 'Obedient': 'Y', 'ResearchScore': 78, 'ProjectScore': 80}])\n",
"new_data = new_data[['Name', 'OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore']]\n",
"new_data"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ResearchScore | \n",
" ProjectScore | \n",
" OverallGrade_A | \n",
" OverallGrade_F | \n",
" Obedient_N | \n",
" Obedient_Y | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" -1.127647 | \n",
" -1.430636 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.494137 | \n",
" 1.160705 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ResearchScore ProjectScore OverallGrade_A OverallGrade_F Obedient_N \\\n",
"0 -1.127647 -1.430636 0 1 1 \n",
"1 0.494137 1.160705 1 0 0 \n",
"\n",
" Obedient_Y \n",
"0 0 \n",
"1 1 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## data preparation\n",
"prediction_features = new_data[feature_names]\n",
"\n",
"# scaling\n",
"prediction_features[numeric_feature_names] = scaler.transform(prediction_features[numeric_feature_names])\n",
"\n",
"# engineering categorical variables\n",
"prediction_features = pd.get_dummies(prediction_features, columns=categoricial_feature_names)\n",
"\n",
"# view feature set\n",
"prediction_features"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ResearchScore | \n",
" ProjectScore | \n",
" OverallGrade_A | \n",
" OverallGrade_F | \n",
" Obedient_N | \n",
" Obedient_Y | \n",
" OverallGrade_B | \n",
" OverallGrade_E | \n",
" OverallGrade_C | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" -1.127647 | \n",
" -1.430636 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.494137 | \n",
" 1.160705 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ResearchScore ProjectScore OverallGrade_A OverallGrade_F Obedient_N \\\n",
"0 -1.127647 -1.430636 0 1 1 \n",
"1 0.494137 1.160705 1 0 0 \n",
"\n",
" Obedient_Y OverallGrade_B OverallGrade_E OverallGrade_C \n",
"0 0 0 0 0 \n",
"1 1 0 0 0 "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# add missing categorical feature columns\n",
"current_categorical_engineered_features = set(prediction_features.columns) - set(numeric_feature_names)\n",
"missing_features = set(categorical_engineered_features) - current_categorical_engineered_features\n",
"for feature in missing_features:\n",
" # add zeros since feature is absent in these data samples\n",
" prediction_features[feature] = [0] * len(prediction_features) \n",
"\n",
"# view final feature set\n",
"prediction_features"
]
},
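{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "The loop above zero-fills whatever dummy columns the new samples are missing; `DataFrame.reindex` can do the same alignment, and also put the columns in the training order, in one step. A minimal sketch (the name `aligned_features` is hypothetical, used so the original frame stays untouched):"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# equivalent alignment via reindex: missing dummy columns become 0,\n",
  "# and the column order matches the training feature set\n",
  "aligned_features = prediction_features.reindex(columns=training_features.columns,\n",
  "                                               fill_value=0)\n",
  "aligned_features"
 ]
},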
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Name | \n",
" OverallGrade | \n",
" Obedient | \n",
" ResearchScore | \n",
" ProjectScore | \n",
" Recommend | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Nathan | \n",
" F | \n",
" N | \n",
" 30 | \n",
" 20 | \n",
" No | \n",
"
\n",
" \n",
" 1 | \n",
" Thomas | \n",
" A | \n",
" Y | \n",
" 78 | \n",
" 80 | \n",
" Yes | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Name OverallGrade Obedient ResearchScore ProjectScore Recommend\n",
"0 Nathan F N 30 20 No\n",
"1 Thomas A Y 78 80 Yes"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## predict using model\n",
"predictions = model.predict(prediction_features)\n",
"\n",
"## display results\n",
"new_data['Recommend'] = predictions\n",
"new_data"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}