{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Boosted-tree leaf features + logistic regression\n",
    "\n",
    "Each boosted-tree model (GBDT, XGBoost, LightGBM) is trained on a synthetic binary classification task. The index of the leaf that every tree assigns to a sample is one-hot encoded, concatenated with the raw features, and fed to a logistic regression. The final table compares the booster on its own (`OriginalFeature`) with the logistic regression trained on the augmented features (`NewFeature`).\n",
    "\n",
    "- **Author:** 马肖\n",
    "- **E-Mail:** maxiaoscut@aliyun.com\n",
    "- **GitHub:** https://github.com/Albertsr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy.sparse import hstack\n",
    "from sklearn.datasets import make_classification\n",
    "# Public import path; the private `sklearn.ensemble.gradient_boosting` module was removed from scikit-learn.\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import roc_auc_score, accuracy_score, f1_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from xgboost import XGBClassifier\n",
    "from lightgbm import LGBMClassifier"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration and data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "DATA_SEED = 2017     # seed of the synthetic dataset\n",
    "SPLIT_SEED = 0       # seed of the train/test split\n",
    "MODEL_SEED = 2017    # seed passed to every booster so a full re-run is repeatable\n",
    "N_ESTIMATORS = 50    # number of trees per booster (= number of leaf-index columns)\n",
    "METRIC_NAMES = ['F1', 'ACC', 'AUC']\n",
    "\n",
    "X, y = make_classification(n_samples=10000, n_features=20, n_informative=18, n_redundant=2,\n",
    "                           n_classes=2, n_clusters_per_class=3, random_state=DATA_SEED)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=SPLIT_SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def leaf_indices(model, X):\n",
    "    \"\"\"Return an int32 array (n_samples, n_trees) with the leaf each tree of `model` assigns to each row of `X`.\n",
    "\n",
    "    `model` must already be fitted. The three supported libraries expose leaf indices differently,\n",
    "    so the call is dispatched on the estimator type.\n",
    "    \"\"\"\n",
    "    if isinstance(model, GradientBoostingClassifier):\n",
    "        # scikit-learn returns (n_samples, n_estimators, n_classes); keep the first (only, for a\n",
    "        # binary problem) class column.\n",
    "        leaves = model.apply(X)[:, :, 0]\n",
    "    elif isinstance(model, LGBMClassifier):\n",
    "        leaves = model.predict(X, pred_leaf=True)\n",
    "    else:\n",
    "        # XGBoost's scikit-learn wrapper\n",
    "        leaves = model.apply(X)\n",
    "    return np.asarray(leaves, dtype=np.int32)\n",
    "\n",
    "\n",
    "def leaf_augmented_features(model, X_fit, X_eval):\n",
    "    \"\"\"Build `[one-hot(leaf indices) | raw features]` matrices for the fit and evaluation splits.\n",
    "\n",
    "    The encoder is fitted on the leaves of `X_fit` only, so nothing about `X_eval` leaks into\n",
    "    the feature construction. A leaf index that appears only in `X_eval` is encoded as an\n",
    "    all-zero group (`handle_unknown='ignore'`).\n",
    "\n",
    "    Returns a pair of CSR sparse matrices `(fit_matrix, eval_matrix)`.\n",
    "    \"\"\"\n",
    "    encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')\n",
    "    fit_onehot = encoder.fit_transform(leaf_indices(model, X_fit))\n",
    "    eval_onehot = encoder.transform(leaf_indices(model, X_eval))\n",
    "    fit_matrix = hstack([fit_onehot, X_fit], format='csr')\n",
    "    eval_matrix = hstack([eval_onehot, X_eval], format='csr')\n",
    "    return fit_matrix, eval_matrix\n",
    "\n",
    "\n",
    "def evaluate(model, X_eval, y_true):\n",
    "    \"\"\"Score a fitted binary classifier; returns a Series indexed by `METRIC_NAMES` (F1, ACC, AUC).\"\"\"\n",
    "    y_pred = model.predict(X_eval)\n",
    "    y_prob = model.predict_proba(X_eval)[:, 1]  # probability of the positive class\n",
    "    scores = [f1_score(y_true, y_pred), accuracy_score(y_true, y_pred), roc_auc_score(y_true, y_prob)]\n",
    "    return pd.Series(scores, index=METRIC_NAMES)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train the boosters and the leaf-feature logistic regressions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "boosters = {\n",
    "    'GBDT': GradientBoostingClassifier(n_estimators=N_ESTIMATORS, random_state=MODEL_SEED),\n",
    "    'XGBoost': XGBClassifier(n_estimators=N_ESTIMATORS, random_state=MODEL_SEED),\n",
    "    'LightGBM': LGBMClassifier(n_estimators=N_ESTIMATORS, random_state=MODEL_SEED),\n",
    "}\n",
    "\n",
    "metric_scores = {}\n",
    "for name, booster in boosters.items():\n",
    "    booster.fit(X_train, y_train)\n",
    "    booster_scores = evaluate(booster, X_test, y_test)\n",
    "\n",
    "    X_train_aug, X_test_aug = leaf_augmented_features(booster, X_train, X_test)\n",
    "    assert X_train_aug.shape[0] == X_train.shape[0] and X_test_aug.shape[0] == X_test.shape[0]\n",
    "    assert X_train_aug.shape[1] == X_test_aug.shape[1]\n",
    "\n",
    "    # A fresh estimator per booster: no fitted state is shared between iterations.\n",
    "    lr = LogisticRegression(max_iter=500, solver='lbfgs')\n",
    "    lr.fit(X_train_aug, y_train)\n",
    "    lr_scores = evaluate(lr, X_test_aug, y_test)\n",
    "\n",
    "    metric_scores[name + ' + LR'] = pd.DataFrame({'OriginalFeature': booster_scores,\n",
    "                                                  'NewFeature': lr_scores})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results\n",
    "\n",
    "`OriginalFeature` is the booster evaluated directly on the raw features; `NewFeature` is the logistic regression trained on one-hot leaf indices plus the raw features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The dict keys become the outer index level, so row labels always match the models that were run.\n",
    "df_contrast = pd.concat(metric_scores, axis=0)\n",
    "df_contrast"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Previously recorded results\n",
    "\n",
    "The numbers below were recorded from an earlier run of this notebook, before the boosters were seeded and before the one-hot encoder was restricted to the training split. Re-run the notebook to refresh them; small differences are expected.\n",
    "\n",
    "| Model | Metric | OriginalFeature | NewFeature |\n",
    "|---|---|---|---|\n",
    "| GBDT + LR | F1 | 0.841070 | 0.875536 |\n",
    "| GBDT + LR | ACC | 0.838400 | 0.872400 |\n",
    "| GBDT + LR | AUC | 0.925139 | 0.946116 |\n",
    "| XGBoost + LR | F1 | 0.837136 | 0.872116 |\n",
    "| XGBoost + LR | ACC | 0.834400 | 0.869200 |\n",
    "| XGBoost + LR | AUC | 0.921574 | 0.943909 |\n",
    "| LightGBM + LR | F1 | 0.910658 | 0.921269 |\n",
    "| LightGBM + LR | ACC | 0.908800 | 0.919600 |\n",
    "| LightGBM + LR | AUC | 0.969011 | 0.971790 |"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}