{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- **Author:** 马肖\n",
    "- **E-Mail:** maxiaoscut@aliyun.com\n",
    "- **GitHub:** https://github.com/Albertsr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy.sparse import hstack\n",
    "from sklearn.datasets import make_classification\n",
    "# Use the public import path: the private `sklearn.ensemble.gradient_boosting`\n",
    "# module was deprecated and later removed from scikit-learn.\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import roc_auc_score, accuracy_score, f1_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from xgboost import XGBClassifier\n",
    "from lightgbm import LGBMClassifier\n",
    "\n",
    "# Synthetic binary classification task: 10,000 samples, 20 features\n",
    "# (18 informative, 2 redundant), 3 clusters per class.\n",
    "X, y = make_classification(n_samples=10000, n_features=20, n_informative=18, n_redundant=2,\n",
    "                           n_classes=2, n_clusters_per_class=3, random_state=2017)\n",
    "# Hold out 25% of the samples for evaluation.\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fixed seeds so that Restart & Run All reproduces the same scores.\n",
    "clf_gbdt = GradientBoostingClassifier(n_estimators=50, random_state=0)\n",
    "clf_xgb = XGBClassifier(n_estimators=50, random_state=0)\n",
    "clf_lgb = LGBMClassifier(n_estimators=50, random_state=0)\n",
    "lr = LogisticRegression(max_iter=500, solver='lbfgs')\n",
    "\n",
    "models = [clf_gbdt, clf_xgb, clf_lgb]\n",
    "names = ['GBDT', 'XGBoost', 'LightGBM']\n",
    "\n",
    "\n",
    "def leaf_indices(model, name, X):\n",
    "    \"\"\"Return, as int32, the index of the leaf each sample reaches in every tree.\n",
    "\n",
    "    `name` selects the library-specific call: 'GBDT' (scikit-learn),\n",
    "    'LightGBM', or anything else (treated like XGBoost's `apply`).\n",
    "    \"\"\"\n",
    "    if name == 'GBDT':\n",
    "        # scikit-learn's apply() has a trailing per-class axis; keep its first slice.\n",
    "        leaves = model.apply(X)[:, :, 0]\n",
    "    elif name == 'LightGBM':\n",
    "        leaves = model.predict(X, pred_leaf=True)\n",
    "    else:\n",
    "        leaves = model.apply(X)\n",
    "    return leaves.astype(np.int32)\n",
    "\n",
    "\n",
    "# One DataFrame per model; rows are [F1, ACC, AUC], columns compare the\n",
    "# booster alone ('OriginalFeature') with booster leaves + LR ('NewFeature').\n",
    "metric_scores = []\n",
    "for model, name in zip(models, names):\n",
    "    # Baseline: the boosted model on the raw features.\n",
    "    model.fit(X_train, y_train)\n",
    "    y_pred = model.predict(X_test)\n",
    "    y_pred_prob = model.predict_proba(X_test)[:, 1]\n",
    "    acc = accuracy_score(y_test, y_pred)\n",
    "    auc = roc_auc_score(y_test, y_pred_prob)\n",
    "    fscore = f1_score(y_test, y_pred)\n",
    "\n",
    "    X_train_leaves = leaf_indices(model, name, X_train)\n",
    "    X_test_leaves = leaf_indices(model, name, X_test)\n",
    "\n",
    "    # Fit the encoder on the training leaves only so the test set cannot shape the\n",
    "    # feature space; leaf ids seen only at test time are encoded as all zeros.\n",
    "    enc = OneHotEncoder(categories='auto', handle_unknown='ignore')\n",
    "    X_train_new = enc.fit_transform(X_train_leaves)\n",
    "    X_test_new = enc.transform(X_test_leaves)\n",
    "\n",
    "    # Concatenate the one-hot leaf features with the original features.\n",
    "    X_train_hstack = hstack([X_train_new, X_train])\n",
    "    X_test_hstack = hstack([X_test_new, X_test])\n",
    "\n",
    "    # `lr` is refit from scratch for every booster.\n",
    "    lr.fit(X_train_hstack, y_train)\n",
    "    y_pred_2 = lr.predict(X_test_hstack)\n",
    "    y_pred_prob_2 = lr.predict_proba(X_test_hstack)[:, 1]\n",
    "\n",
    "    new_acc = accuracy_score(y_test, y_pred_2)\n",
    "    new_auc = roc_auc_score(y_test, y_pred_prob_2)\n",
    "    new_fscore = f1_score(y_test, y_pred_2)\n",
    "    score = {'OriginalFeature':[fscore, acc, auc], 'NewFeature':[ new_fscore, new_acc, new_auc]}\n",
    "    result = pd.DataFrame(score)\n",
    "    metric_scores.append(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
| \n", " | \n", " | OriginalFeature | \n", "NewFeature | \n", "
|---|---|---|---|
| GBDT + LR | \n", "F1 | \n", "0.841070 | \n", "0.875536 | \n", "
| ACC | \n", "0.838400 | \n", "0.872400 | \n", "|
| AUC | \n", "0.925139 | \n", "0.946116 | \n", "|
| XGBoost + LR | \n", "F1 | \n", "0.837136 | \n", "0.872116 | \n", "
| ACC | \n", "0.834400 | \n", "0.869200 | \n", "|
| AUC | \n", "0.921574 | \n", "0.943909 | \n", "|
| LightGBM + LR | \n", "F1 | \n", "0.910658 | \n", "0.921269 | \n", "
| ACC | \n", "0.908800 | \n", "0.919600 | \n", "|
| AUC | \n", "0.969011 | \n", "0.971790 | \n", "