{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.19.1\n", "0.20.3\n" ] } ], "source": [ "import pandas as pd\n", "import os\n", "import sklearn\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.ensemble import RandomForestClassifier\n", "from xgboost import XGBClassifier\n", "from mlxtend.classifier import StackingClassifier\n", "from sklearn.cross_validation import train_test_split\n", "\n", "print sklearn.__version__\n", "print pd.__version__" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>ID</th>\n", " <th>JobFamily</th>\n", " <th>JobFamilyDescription</th>\n", " <th>JobClass</th>\n", " <th>JobClassDescription</th>\n", " <th>PayGrade</th>\n", " <th>EducationLevel</th>\n", " <th>Experience</th>\n", " <th>OrgImpact</th>\n", " <th>ProblemSolving</th>\n", " <th>Supervision</th>\n", " <th>ContactLevel</th>\n", " <th>FinancialBudget</th>\n", " <th>PG</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>Accounting And Finance</td>\n", " <td>1</td>\n", " <td>Accountant I</td>\n", " <td>5</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>3</td>\n", " <td>5</td>\n", " <td>PG05</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2</td>\n", " 
<td>1</td>\n", " <td>Accounting And Finance</td>\n", " <td>2</td>\n", " <td>Accountant II</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>1</td>\n", " <td>5</td>\n", " <td>4</td>\n", " <td>5</td>\n", " <td>7</td>\n", " <td>7</td>\n", " <td>PG06</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>Accounting And Finance</td>\n", " <td>3</td>\n", " <td>Accountant III</td>\n", " <td>8</td>\n", " <td>4</td>\n", " <td>2</td>\n", " <td>6</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>7</td>\n", " <td>10</td>\n", " <td>PG08</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>4</td>\n", " <td>1</td>\n", " <td>Accounting And Finance</td>\n", " <td>4</td>\n", " <td>Accountant IV</td>\n", " <td>10</td>\n", " <td>5</td>\n", " <td>5</td>\n", " <td>6</td>\n", " <td>6</td>\n", " <td>7</td>\n", " <td>8</td>\n", " <td>11</td>\n", " <td>PG10</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>5</td>\n", " <td>2</td>\n", " <td>Administrative Support</td>\n", " <td>5</td>\n", " <td>Admin Support I</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>PG01</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " ID JobFamily JobFamilyDescription JobClass JobClassDescription \\\n", "0 1 1 Accounting And Finance 1 Accountant I \n", "1 2 1 Accounting And Finance 2 Accountant II \n", "2 3 1 Accounting And Finance 3 Accountant III \n", "3 4 1 Accounting And Finance 4 Accountant IV \n", "4 5 2 Administrative Support 5 Admin Support I \n", "\n", " PayGrade EducationLevel Experience OrgImpact ProblemSolving \\\n", "0 5 3 1 3 3 \n", "1 6 4 1 5 4 \n", "2 8 4 2 6 5 \n", "3 10 5 5 6 6 \n", "4 1 1 0 1 1 \n", "\n", " Supervision ContactLevel FinancialBudget PG \n", "0 4 3 5 PG05 \n", "1 5 7 7 PG06 \n", "2 6 7 10 PG08 \n", "3 7 8 11 PG10 \n", "4 1 1 1 PG01 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], 
"source": [ "# reading data to df\n", "DATA_DIR = '../data'\n", "\n", "df = pd.read_csv(os.path.abspath(os.path.join(DATA_DIR, 'day15/jobclass.csv')))\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# target is to predict the column PG using rest (all to one)\n", "target = df['PG']\n", "\n", "# dropping unnecessary columns and keeping just the concerned features\n", "df.drop(['ID', 'JobFamilyDescription', 'JobClassDescription', 'PG'], axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>JobFamily</th>\n", " <th>JobClass</th>\n", " <th>PayGrade</th>\n", " <th>EducationLevel</th>\n", " <th>Experience</th>\n", " <th>OrgImpact</th>\n", " <th>ProblemSolving</th>\n", " <th>Supervision</th>\n", " <th>ContactLevel</th>\n", " <th>FinancialBudget</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>5</td>\n", " <td>3</td>\n", " <td>1</td>\n", " <td>3</td>\n", " <td>3</td>\n", " <td>4</td>\n", " <td>3</td>\n", " <td>5</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>6</td>\n", " <td>4</td>\n", " <td>1</td>\n", " <td>5</td>\n", " <td>4</td>\n", " <td>5</td>\n", " <td>7</td>\n", " <td>7</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " JobFamily JobClass PayGrade EducationLevel Experience OrgImpact \\\n", "0 1 1 5 3 1 3 \n", "1 1 2 6 4 1 5 \n", "\n", " ProblemSolving Supervision ContactLevel 
FinancialBudget \n", "0 3 4 3 5 \n", "1 4 5 7 7 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(2)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "JobFamily 0\n", "JobClass 0\n", "PayGrade 0\n", "EducationLevel 0\n", "Experience 0\n", "OrgImpact 0\n", "ProblemSolving 0\n", "Supervision 0\n", "ContactLevel 0\n", "FinancialBudget 0\n", "dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Count missing values (NaN) per feature column; a count of 0 everywhere means no imputation is needed.\n",
 "df.isnull().sum()"
 ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [
 "# Plain arrays for the estimators: X holds the feature columns, Y the PG labels.\n",
 "X = df.values\n",
 "Y = target.values"
 ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "def data_split(X, Y, test_size=0.2, random_state=10):\n",
 "    \"\"\"Split features and labels into train and test subsets.\n",
 "\n",
 "    Thin wrapper around train_test_split that pins the split used in this notebook.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    X : array-like\n",
 "        Feature matrix, one row per sample.\n",
 "    Y : array-like\n",
 "        Labels aligned with the rows of X.\n",
 "    test_size : float, optional\n",
 "        Fraction of the rows held out for testing; the default 0.2 keeps 20%\n",
 "        (20-30% of the full data set is the usual range).\n",
 "    random_state : int, optional\n",
 "        Seed for the shuffle. It does not remove ordering bias (the shuffle itself does);\n",
 "        it only makes the same split come out on every run.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    tuple\n",
 "        (X_train, X_test, Y_train, Y_test)\n",
 "    \"\"\"\n",
 "    X_train, X_test, Y_train, Y_test = train_test_split(\n",
 "        X, Y, test_size=test_size, random_state=random_state)\n",
 "    return X_train, X_test, Y_train, Y_test\n",
 "\n",
 "\n",
 "X_train, X_test, Y_train, Y_test = data_split(X, Y)"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model - 1 [Random Forest]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "# Fixed seed so the forest is identical on every run.\n",
 "clf1 = RandomForestClassifier(random_state=1)"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Model - 2 [XGBoost]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "# Library defaults for every hyper-parameter.\n",
 "clf2 = XGBClassifier()"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Meta Model [Logit]" ] }, { "cell_type": "code", "execution_count": 10, 
"metadata": { "collapsed": true }, "outputs": [], "source": [ "lr = LogisticRegression()\n", "sclf = StackingClassifier(\n", " classifiers = [clf1, clf2], \n", " meta_classifier=lr,\n", " use_probas=True,\n", " average_probas=False,\n", " )" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5-fold cross validation:\n", "\n", "Accuracy: 0.70 (+/- 0.10) [Random Forest]\n", "Accuracy: 0.92 (+/- 0.09) [XGBoost]\n", "Accuracy: 0.86 (+/- 0.14) [StackingClassifier]\n" ] } ], "source": [ "print('5-fold cross validation:\\n')\n", "\n", "for clf, label in zip([clf1, clf2, sclf], \n", " ['Random Forest', \n", " 'XGBoost',\n", " 'StackingClassifier']):\n", "\n", " scores = sklearn.model_selection.cross_val_score(clf, X_train, Y_train, \n", " cv=5, scoring='accuracy')\n", " print(\"Accuracy: %0.2f (+/- %0.2f) [%s]\" \n", " % (scores.mean(), scores.std(), label))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }