{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- **Author：** 马肖\n",
    "- **E-Mail：** maxiaoscut@aliyun.com\n",
    "- **GitHub：**  https://github.com/Albertsr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy.sparse import hstack\n",
    "from sklearn.datasets import make_classification\n",
    "# Use the public package path: the private `sklearn.ensemble.gradient_boosting`\n",
    "# module was deprecated in scikit-learn 0.22 and removed in 0.24.\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import roc_auc_score, accuracy_score, f1_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from xgboost import XGBClassifier\n",
    "from lightgbm import LGBMClassifier\n",
    "\n",
    "# Synthetic binary-classification data: 10,000 samples, 20 features\n",
    "# (18 informative + 2 redundant), 3 clusters per class, fixed seed.\n",
    "X, y = make_classification(n_samples=10000, n_features=20, n_informative=18, n_redundant=2,\n",
    "                           n_classes=2, n_clusters_per_class=3, random_state=2017)\n",
    "# Hold out 25% of the samples as the test set (fixed seed for a repeatable split).\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf_gbdt = GradientBoostingClassifier(n_estimators=50)\n",
    "clf_xgb = XGBClassifier(n_estimators=50)\n",
    "clf_lgb = LGBMClassifier(n_estimators=50)\n",
    "# The same LR instance is refitted from scratch for every tree model in the loop below.\n",
    "lr = LogisticRegression(max_iter=500, solver='lbfgs')\n",
    "\n",
    "models = [clf_gbdt, clf_xgb, clf_lgb]\n",
    "names = ['GBDT', 'XGBoost', 'LightGBM']\n",
    "\n",
    "\n",
    "def get_leaf_indices(model, name, X):\n",
    "    \"\"\"Return, as an int32 array, the leaf index each sample of X reaches in every tree.\n",
    "\n",
    "    `name` selects the library-specific call: 'GBDT' (scikit-learn), 'LightGBM',\n",
    "    or anything else, which is treated like XGBoost (`model.apply`).\n",
    "    \"\"\"\n",
    "    if name == 'GBDT':\n",
    "        # scikit-learn's apply() output is 3-D; keep slice 0 of the last axis\n",
    "        # (the only one for a binary problem) to get one column per tree.\n",
    "        leaves = model.apply(X)[:, :, 0]\n",
    "    elif name == 'LightGBM':\n",
    "        leaves = model.predict(X, pred_leaf=True)\n",
    "    else:\n",
    "        leaves = model.apply(X)\n",
    "    return np.asarray(leaves).astype(np.int32)\n",
    "\n",
    "\n",
    "metric_scores = []\n",
    "for model, name in zip(models, names):\n",
    "    # Baseline: the tree model alone on the original features.\n",
    "    model.fit(X_train, y_train)\n",
    "    y_pred = model.predict(X_test)\n",
    "    y_pred_prob = model.predict_proba(X_test)[:, 1]\n",
    "    acc = accuracy_score(y_test, y_pred)\n",
    "    auc = roc_auc_score(y_test, y_pred_prob)\n",
    "    fscore = f1_score(y_test, y_pred)\n",
    "\n",
    "    X_train_leaves = get_leaf_indices(model, name, X_train)\n",
    "    X_test_leaves = get_leaf_indices(model, name, X_test)\n",
    "\n",
    "    # Fit the encoder on the training leaves only, so the test set plays no part in\n",
    "    # building the features. A leaf that appears only in the test set is encoded as\n",
    "    # all zeros for that tree instead of raising (handle_unknown='ignore').\n",
    "    enc = OneHotEncoder(categories='auto', handle_unknown='ignore')\n",
    "    X_train_new = enc.fit_transform(X_train_leaves)\n",
    "    X_test_new = enc.transform(X_test_leaves)\n",
    "\n",
    "    # New feature matrix = one-hot leaf indicators followed by the original features.\n",
    "    X_train_hstack = hstack([X_train_new, X_train])\n",
    "    X_test_hstack = hstack([X_test_new, X_test])\n",
    "\n",
    "    lr.fit(X_train_hstack, y_train)\n",
    "    y_pred_2 = lr.predict(X_test_hstack)\n",
    "    y_pred_prob_2 = lr.predict_proba(X_test_hstack)[:, 1]\n",
    "\n",
    "    new_acc = accuracy_score(y_test, y_pred_2)\n",
    "    new_auc = roc_auc_score(y_test, y_pred_prob_2)\n",
    "    new_fscore = f1_score(y_test, y_pred_2)\n",
    "\n",
    "    # Row order (F1, ACC, AUC) must match the metric labels used when the table is built.\n",
    "    score = {'OriginalFeature': [fscore, acc, auc], 'NewFeature': [new_fscore, new_acc, new_auc]}\n",
    "    result = pd.DataFrame(score)\n",
    "    metric_scores.append(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>OriginalFeature</th>\n",
       "      <th>NewFeature</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">GBDT + LR</th>\n",
       "      <th>F1</th>\n",
       "      <td>0.841070</td>\n",
       "      <td>0.875536</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ACC</th>\n",
       "      <td>0.838400</td>\n",
       "      <td>0.872400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AUC</th>\n",
       "      <td>0.925139</td>\n",
       "      <td>0.946116</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">XGBoost + LR</th>\n",
       "      <th>F1</th>\n",
       "      <td>0.837136</td>\n",
       "      <td>0.872116</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ACC</th>\n",
       "      <td>0.834400</td>\n",
       "      <td>0.869200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AUC</th>\n",
       "      <td>0.921574</td>\n",
       "      <td>0.943909</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"3\" valign=\"top\">LightGBM + LR</th>\n",
       "      <th>F1</th>\n",
       "      <td>0.910658</td>\n",
       "      <td>0.921269</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ACC</th>\n",
       "      <td>0.908800</td>\n",
       "      <td>0.919600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AUC</th>\n",
       "      <td>0.969011</td>\n",
       "      <td>0.971790</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   OriginalFeature  NewFeature\n",
       "GBDT + LR     F1          0.841070    0.875536\n",
       "              ACC         0.838400    0.872400\n",
       "              AUC         0.925139    0.946116\n",
       "XGBoost + LR  F1          0.837136    0.872116\n",
       "              ACC         0.834400    0.869200\n",
       "              AUC         0.921574    0.943909\n",
       "LightGBM + LR F1          0.910658    0.921269\n",
       "              ACC         0.908800    0.919600\n",
       "              AUC         0.969011    0.971790"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One (pipeline, metric) label pair per row, in the order the score tables were appended.\n",
    "pipeline_labels = ['GBDT + LR', 'XGBoost + LR', 'LightGBM + LR']\n",
    "metric_labels = ['F1', 'ACC', 'AUC']\n",
    "row_index = pd.MultiIndex.from_product([pipeline_labels, metric_labels])\n",
    "\n",
    "# Stack the per-model score tables vertically, then attach the two-level row index.\n",
    "df_contrast = pd.concat(metric_scores, axis=0)\n",
    "df_contrast.index = row_index\n",
    "df_contrast"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}