- **Author:** 马肖
- **E-Mail:** maxiaoscut@aliyun.com
- **GitHub:** https://github.com/Albertsr

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.datasets import make_classification
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Synthetic binary-classification data set: 10,000 samples with 20 features
# (18 informative, 2 redundant), seeded so the generated data is repeatable.
X, y = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=18,
    n_redundant=2,
    n_classes=2,
    n_clusters_per_class=3,
    random_state=2017,
)

# Hold out 25% of the samples for evaluation; the split itself is seeded too.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [2]:
# Boosted-tree learners to compare, each limited to 50 estimators. They are
# declared next to their display names so the two lists below stay aligned.
named_models = [
    ('GBDT', GradientBoostingClassifier(n_estimators=50)),
    ('XGBoost', XGBClassifier(n_estimators=50)),
    ('LightGBM', LGBMClassifier(n_estimators=50)),
]
names = [label for label, _ in named_models]
models = [estimator for _, estimator in named_models]
clf_gbdt, clf_xgb, clf_lgb = models

# Linear model that is later trained on leaf-index features plus raw features.
lr = LogisticRegression(max_iter=500, solver='lbfgs')

def _leaf_indices(model, name, data):
    """Return the leaf index that each tree of `model` assigns to each row of `data`.

    `name` selects the library-specific call: 'GBDT' (scikit-learn),
    'LightGBM', or anything else (treated like XGBoost, i.e. `model.apply`).
    The result is cast to int32 so it can be one-hot encoded as categories.
    """
    if name == 'GBDT':
        # scikit-learn's apply() returns a 3-D array; keep slice 0 of the last
        # axis to obtain a 2-D (samples x estimators) matrix.
        leaves = model.apply(data)[:, :, 0]
    elif name == 'LightGBM':
        leaves = model.predict(data, pred_leaf=True)
    else:
        leaves = model.apply(data)
    return leaves.astype(np.int32)


# One DataFrame per model, holding [F1, ACC, AUC] for the tree model alone
# ('OriginalFeature') and for LR on leaf + raw features ('NewFeature').
metric_scores = []
for model, name in zip(models, names):
    # Baseline: the boosted-tree model evaluated on the raw features.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_prob)
    fscore = f1_score(y_test, y_pred)

    # Leaf indices act as learned categorical features.
    X_train_leaves = _leaf_indices(model, name, X_train)
    X_test_leaves = _leaf_indices(model, name, X_test)

    # Fit the encoder on the training leaves only, so the test set does not
    # influence the feature space. Leaves that never occur in training are
    # encoded as all zeros for that tree instead of raising an error.
    enc = OneHotEncoder(categories='auto', handle_unknown='ignore')
    X_train_new = enc.fit_transform(X_train_leaves)
    X_test_new = enc.transform(X_test_leaves)

    # Sparse one-hot leaf features followed by the original dense features.
    X_train_hstack = hstack([X_train_new, X_train])
    X_test_hstack = hstack([X_test_new, X_test])

    # The same LR instance is refit from scratch for every tree model.
    lr.fit(X_train_hstack, y_train)
    y_pred_2 = lr.predict(X_test_hstack)
    y_pred_prob_2 = lr.predict_proba(X_test_hstack)[:, 1]

    new_acc = accuracy_score(y_test, y_pred_2)
    new_auc = roc_auc_score(y_test, y_pred_prob_2)
    new_fscore = f1_score(y_test, y_pred_2)

    # Row order (F1, ACC, AUC) must match the metric labels used when the
    # per-model frames are concatenated into the comparison table.
    score = {
        'OriginalFeature': [fscore, acc, auc],
        'NewFeature': [new_fscore, new_acc, new_auc],
    }
    result = pd.DataFrame(score)
    metric_scores.append(result)

In [3]:
# Row labels for the comparison table: every pipeline crossed with every
# metric, in the same order in which the per-model frames were appended.
model_metrics = ['F1', 'ACC', 'AUC']
model_names = ['GBDT + LR', 'XGBoost + LR', 'LightGBM + LR']
col_idx = pd.MultiIndex.from_product([model_names, model_metrics])

# Stack the per-model score frames vertically and attach the two-level index.
df_contrast = pd.concat(metric_scores, axis=0)
df_contrast.index = col_idx
df_contrast

Unnamed: 0,Unnamed: 1,OriginalFeature,NewFeature
GBDT + LR,F1,0.84107,0.875536
GBDT + LR,ACC,0.8384,0.8724
GBDT + LR,AUC,0.925139,0.946116
XGBoost + LR,F1,0.837136,0.872116
XGBoost + LR,ACC,0.8344,0.8692
XGBoost + LR,AUC,0.921574,0.943909
LightGBM + LR,F1,0.910658,0.921269
LightGBM + LR,ACC,0.9088,0.9196
LightGBM + LR,AUC,0.969011,0.97179
