{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.19.1\n",
      "0.20.3\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "import sklearn\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from xgboost import XGBClassifier\n",
    "from mlxtend.classifier import StackingClassifier\n",
    "from sklearn.cross_validation import train_test_split\n",
    "\n",
    "print sklearn.__version__\n",
    "print pd.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>JobFamily</th>\n",
       "      <th>JobFamilyDescription</th>\n",
       "      <th>JobClass</th>\n",
       "      <th>JobClassDescription</th>\n",
       "      <th>PayGrade</th>\n",
       "      <th>EducationLevel</th>\n",
       "      <th>Experience</th>\n",
       "      <th>OrgImpact</th>\n",
       "      <th>ProblemSolving</th>\n",
       "      <th>Supervision</th>\n",
       "      <th>ContactLevel</th>\n",
       "      <th>FinancialBudget</th>\n",
       "      <th>PG</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Accounting And Finance</td>\n",
       "      <td>1</td>\n",
       "      <td>Accountant I</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>PG05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>Accounting And Finance</td>\n",
       "      <td>2</td>\n",
       "      <td>Accountant II</td>\n",
       "      <td>6</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>7</td>\n",
       "      <td>7</td>\n",
       "      <td>PG06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>Accounting And Finance</td>\n",
       "      <td>3</td>\n",
       "      <td>Accountant III</td>\n",
       "      <td>8</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>10</td>\n",
       "      <td>PG08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>Accounting And Finance</td>\n",
       "      <td>4</td>\n",
       "      <td>Accountant IV</td>\n",
       "      <td>10</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>11</td>\n",
       "      <td>PG10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>2</td>\n",
       "      <td>Administrative Support</td>\n",
       "      <td>5</td>\n",
       "      <td>Admin Support I</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>PG01</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   ID  JobFamily    JobFamilyDescription  JobClass JobClassDescription  \\\n",
       "0   1          1  Accounting And Finance         1        Accountant I   \n",
       "1   2          1  Accounting And Finance         2       Accountant II   \n",
       "2   3          1  Accounting And Finance         3      Accountant III   \n",
       "3   4          1  Accounting And Finance         4       Accountant IV   \n",
       "4   5          2  Administrative Support         5     Admin Support I   \n",
       "\n",
       "   PayGrade  EducationLevel  Experience  OrgImpact  ProblemSolving  \\\n",
       "0         5               3           1          3               3   \n",
       "1         6               4           1          5               4   \n",
       "2         8               4           2          6               5   \n",
       "3        10               5           5          6               6   \n",
       "4         1               1           0          1               1   \n",
       "\n",
       "   Supervision  ContactLevel  FinancialBudget    PG  \n",
       "0            4             3                5  PG05  \n",
       "1            5             7                7  PG06  \n",
       "2            6             7               10  PG08  \n",
       "3            7             8               11  PG10  \n",
       "4            1             1                1  PG01  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# reading data to df\n",
    "DATA_DIR = '../data'\n",
    "\n",
    "df = pd.read_csv(os.path.abspath(os.path.join(DATA_DIR, 'day15/jobclass.csv')))\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# target is to predict the column PG using rest (all to one)\n",
    "target = df['PG']\n",
    "\n",
    "# dropping unnecessary columns and keeping just the concerned features\n",
    "df.drop(['ID', 'JobFamilyDescription', 'JobClassDescription', 'PG'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>JobFamily</th>\n",
       "      <th>JobClass</th>\n",
       "      <th>PayGrade</th>\n",
       "      <th>EducationLevel</th>\n",
       "      <th>Experience</th>\n",
       "      <th>OrgImpact</th>\n",
       "      <th>ProblemSolving</th>\n",
       "      <th>Supervision</th>\n",
       "      <th>ContactLevel</th>\n",
       "      <th>FinancialBudget</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>7</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   JobFamily  JobClass  PayGrade  EducationLevel  Experience  OrgImpact  \\\n",
       "0          1         1         5               3           1          3   \n",
       "1          1         2         6               4           1          5   \n",
       "\n",
       "   ProblemSolving  Supervision  ContactLevel  FinancialBudget  \n",
       "0               3            4             3                5  \n",
       "1               4            5             7                7  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "JobFamily          0\n",
       "JobClass           0\n",
       "PayGrade           0\n",
       "EducationLevel     0\n",
       "Experience         0\n",
       "OrgImpact          0\n",
       "ProblemSolving     0\n",
       "Supervision        0\n",
       "ContactLevel       0\n",
       "FinancialBudget    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check for NaN (Missing values)\n",
    "df.isnull().sum()\n",
    "\n",
    "# Luckily not a single missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X = df[:].values\n",
    "Y = target.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# ideal practice is to use test as 20% - 30% of training data\n",
    "# defined by test_size in train_test_split()\n",
    "# random_state is required to avoid sequential biasness in the data distribution\n",
    "def data_split(X, Y):\n",
    "    X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size=0.2, random_state = 10)\n",
    "    return X_train, X_test, Y_train, Y_test\n",
    "\n",
    "X_train, X_test, Y_train, Y_test = data_split(X, Y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model - 1 [Random Forest]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "clf1 = RandomForestClassifier(random_state=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model - 2 [XGBoost]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "clf2 = XGBClassifier()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Meta Model [Logit]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "lr = LogisticRegression()\n",
    "sclf = StackingClassifier(\n",
    "                          classifiers = [clf1, clf2], \n",
    "                          meta_classifier=lr,\n",
    "                          use_probas=True,\n",
    "                          average_probas=False,\n",
    "                         )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5-fold cross validation:\n",
      "\n",
      "Accuracy: 0.70 (+/- 0.10) [Random Forest]\n",
      "Accuracy: 0.92 (+/- 0.09) [XGBoost]\n",
      "Accuracy: 0.86 (+/- 0.14) [StackingClassifier]\n"
     ]
    }
   ],
   "source": [
    "print('5-fold cross validation:\\n')\n",
    "\n",
    "for clf, label in zip([clf1, clf2, sclf], \n",
    "                      ['Random Forest', \n",
    "                       'XGBoost',\n",
    "                       'StackingClassifier']):\n",
    "\n",
    "    scores = sklearn.model_selection.cross_val_score(clf, X_train, Y_train, \n",
    "                                              cv=5, scoring='accuracy')\n",
    "    print(\"Accuracy: %0.2f (+/- %0.2f) [%s]\" \n",
    "          % (scores.mean(), scores.std(), label))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}