{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Data retrieval" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameOverallGradeObedientResearchScoreProjectScoreRecommend
0HenryAY9085Yes
1JohnCN8551Yes
2DavidFN1017No
3HolmesBY7571No
4MarvinEN2030No
5SimonAY9279Yes
6RobertBY6059No
7TrentCY7533No
\n", "
" ], "text/plain": [ " Name OverallGrade Obedient ResearchScore ProjectScore Recommend\n", "0 Henry A Y 90 85 Yes\n", "1 John C N 85 51 Yes\n", "2 David F N 10 17 No\n", "3 Holmes B Y 75 71 No\n", "4 Marvin E N 20 30 No\n", "5 Simon A Y 92 79 Yes\n", "6 Robert B Y 60 59 No\n", "7 Trent C Y 75 33 No" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "# turn of warning messages\n", "pd.options.mode.chained_assignment = None # default='warn'\n", "\n", "# get data\n", "df = pd.read_csv('student_records.csv')\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data preparation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Feature extraction and engineering" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# get features and corresponding outcomes\n", "feature_names = ['OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore']\n", "training_features = df[feature_names]\n", "\n", "outcome_name = ['Recommend']\n", "outcome_labels = df[outcome_name]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OverallGradeObedientResearchScoreProjectScore
0AY9085
1CN8551
2FN1017
3BY7571
4EN2030
5AY9279
6BY6059
7CY7533
\n", "
" ], "text/plain": [ " OverallGrade Obedient ResearchScore ProjectScore\n", "0 A Y 90 85\n", "1 C N 85 51\n", "2 F N 10 17\n", "3 B Y 75 71\n", "4 E N 20 30\n", "5 A Y 92 79\n", "6 B Y 60 59\n", "7 C Y 75 33" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# view features\n", "training_features" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Recommend
0Yes
1Yes
2No
3No
4No
5Yes
6No
7No
\n", "
" ], "text/plain": [ " Recommend\n", "0 Yes\n", "1 Yes\n", "2 No\n", "3 No\n", "4 No\n", "5 Yes\n", "6 No\n", "7 No" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# view outcome labels\n", "outcome_labels" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# list down features based on type\n", "numeric_feature_names = ['ResearchScore', 'ProjectScore']\n", "categoricial_feature_names = ['OverallGrade', 'Obedient']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Numeric Feature Scaling" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OverallGradeObedientResearchScoreProjectScore
0AY0.8995831.376650
1CN0.730648-0.091777
2FN-1.803390-1.560203
3BY0.3927760.772004
4EN-1.465519-0.998746
5AY0.9671581.117516
6BY-0.1140320.253735
7CY0.392776-0.869179
\n", "
" ], "text/plain": [ " OverallGrade Obedient ResearchScore ProjectScore\n", "0 A Y 0.899583 1.376650\n", "1 C N 0.730648 -0.091777\n", "2 F N -1.803390 -1.560203\n", "3 B Y 0.392776 0.772004\n", "4 E N -1.465519 -0.998746\n", "5 A Y 0.967158 1.117516\n", "6 B Y -0.114032 0.253735\n", "7 C Y 0.392776 -0.869179" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.preprocessing import StandardScaler\n", "ss = StandardScaler()\n", "\n", "# fit scaler on numeric features\n", "ss.fit(training_features[numeric_feature_names])\n", "\n", "# scale numeric features now\n", "training_features[numeric_feature_names] = ss.transform(training_features[numeric_feature_names])\n", "\n", "# view updated featureset\n", "training_features" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Engineering Categorical Features" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ResearchScoreProjectScoreOverallGrade_AOverallGrade_BOverallGrade_COverallGrade_EOverallGrade_FObedient_NObedient_Y
00.8995831.3766501000001
10.730648-0.0917770010010
2-1.803390-1.5602030000110
30.3927760.7720040100001
4-1.465519-0.9987460001010
50.9671581.1175161000001
6-0.1140320.2537350100001
70.392776-0.8691790010001
\n", "
" ], "text/plain": [ " ResearchScore ProjectScore OverallGrade_A OverallGrade_B \\\n", "0 0.899583 1.376650 1 0 \n", "1 0.730648 -0.091777 0 0 \n", "2 -1.803390 -1.560203 0 0 \n", "3 0.392776 0.772004 0 1 \n", "4 -1.465519 -0.998746 0 0 \n", "5 0.967158 1.117516 1 0 \n", "6 -0.114032 0.253735 0 1 \n", "7 0.392776 -0.869179 0 0 \n", "\n", " OverallGrade_C OverallGrade_E OverallGrade_F Obedient_N Obedient_Y \n", "0 0 0 0 0 1 \n", "1 1 0 0 1 0 \n", "2 0 0 1 1 0 \n", "3 0 0 0 0 1 \n", "4 0 1 0 1 0 \n", "5 0 0 0 0 1 \n", "6 0 0 0 0 1 \n", "7 1 0 0 0 1 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "training_features = pd.get_dummies(training_features, columns=categoricial_feature_names)\n", "# view newly engineering features\n", "training_features" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# get list of new categorical features\n", "categorical_engineered_features = list(set(training_features.columns) - set(numeric_feature_names))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Modeling" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", " verbose=0, warm_start=False)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.linear_model import LogisticRegression\n", "import numpy as np\n", "\n", "# fit the model\n", "lr = LogisticRegression() \n", "model = lr.fit(training_features, np.array(outcome_labels['Recommend']))\n", "# view model parameters\n", "model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model Evaluation" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 100.0 %\n", "Classification Stats:\n", " precision recall f1-score support\n", "\n", " No 1.00 1.00 1.00 5\n", " Yes 1.00 1.00 1.00 3\n", "\n", "avg / total 1.00 1.00 1.00 8\n", "\n" ] } ], "source": [ "# simple evaluation on training data\n", "pred_labels = model.predict(training_features)\n", "actual_labels = np.array(outcome_labels['Recommend'])\n", "\n", "# evaluate model performance\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import classification_report\n", "\n", "print('Accuracy:', float(accuracy_score(actual_labels, pred_labels))*100, '%')\n", "print('Classification Stats:')\n", "print(classification_report(actual_labels, pred_labels))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model Deployment " ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Scaler/scaler.pickle',\n", " 'Scaler/scaler.pickle_01.npy',\n", " 'Scaler/scaler.pickle_02.npy',\n", " 'Scaler/scaler.pickle_03.npy']" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.externals import joblib\n", "import os\n", "# save models to be deployed on your server\n", "if not os.path.exists('Model'):\n", " os.mkdir('Model')\n", "if not os.path.exists('Scaler'):\n", " os.mkdir('Scaler') \n", " \n", "joblib.dump(model, r'Model/model.pickle') \n", "joblib.dump(ss, r'Scaler/scaler.pickle') " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Prediction in Action" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# load model and scaler objects\n", "model = joblib.load(r'Model/model.pickle')\n", "scaler = joblib.load(r'Scaler/scaler.pickle')" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameOverallGradeObedientResearchScoreProjectScore
0NathanFN3020
1ThomasAY7880
\n", "
" ], "text/plain": [ " Name OverallGrade Obedient ResearchScore ProjectScore\n", "0 Nathan F N 30 20\n", "1 Thomas A Y 78 80" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## data retrieval\n", "new_data = pd.DataFrame([{'Name': 'Nathan', 'OverallGrade': 'F', 'Obedient': 'N', 'ResearchScore': 30, 'ProjectScore': 20},\n", " {'Name': 'Thomas', 'OverallGrade': 'A', 'Obedient': 'Y', 'ResearchScore': 78, 'ProjectScore': 80}])\n", "new_data = new_data[['Name', 'OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore']]\n", "new_data" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ResearchScoreProjectScoreOverallGrade_AOverallGrade_FObedient_NObedient_Y
0-1.127647-1.4306360110
10.4941371.1607051001
\n", "
" ], "text/plain": [ " ResearchScore ProjectScore OverallGrade_A OverallGrade_F Obedient_N \\\n", "0 -1.127647 -1.430636 0 1 1 \n", "1 0.494137 1.160705 1 0 0 \n", "\n", " Obedient_Y \n", "0 0 \n", "1 1 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## data preparation\n", "prediction_features = new_data[feature_names]\n", "\n", "# scaling\n", "prediction_features[numeric_feature_names] = scaler.transform(prediction_features[numeric_feature_names])\n", "\n", "# engineering categorical variables\n", "prediction_features = pd.get_dummies(prediction_features, columns=categoricial_feature_names)\n", "\n", "# view feature set\n", "prediction_features" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ResearchScoreProjectScoreOverallGrade_AOverallGrade_FObedient_NObedient_YOverallGrade_BOverallGrade_EOverallGrade_C
0-1.127647-1.4306360110000
10.4941371.1607051001000
\n", "
" ], "text/plain": [ " ResearchScore ProjectScore OverallGrade_A OverallGrade_F Obedient_N \\\n", "0 -1.127647 -1.430636 0 1 1 \n", "1 0.494137 1.160705 1 0 0 \n", "\n", " Obedient_Y OverallGrade_B OverallGrade_E OverallGrade_C \n", "0 0 0 0 0 \n", "1 1 0 0 0 " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# add missing categorical feature columns\n", "current_categorical_engineered_features = set(prediction_features.columns) - set(numeric_feature_names)\n", "missing_features = set(categorical_engineered_features) - current_categorical_engineered_features\n", "for feature in missing_features:\n", " # add zeros since feature is absent in these data samples\n", " prediction_features[feature] = [0] * len(prediction_features) \n", "\n", "# view final feature set\n", "prediction_features" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameOverallGradeObedientResearchScoreProjectScoreRecommend
0NathanFN3020No
1ThomasAY7880Yes
\n", "
" ], "text/plain": [ " Name OverallGrade Obedient ResearchScore ProjectScore Recommend\n", "0 Nathan F N 30 20 No\n", "1 Thomas A Y 78 80 Yes" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "## predict using model\n", "predictions = model.predict(prediction_features)\n", "\n", "## display results\n", "new_data['Recommend'] = predictions\n", "new_data" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }