# A New Tree Booster: PART

__12 Feb 2018, marugari__

PART (Peeking Additive Regression Trees) aims to
* optimize non-differentiable metrics
* avoid over-fitting

To train a PART booster, we need to split the training data into 3 parts.
1. training set: to search optimal splits
2. peeking set: to determine whether a new tree is committed
3. validation set: to get validation score

[Repository (https://github.com/marugari/LightGBM/tree/part)](https://github.com/marugari/LightGBM/tree/part)

[Main contribution (part.hpp)](https://github.com/marugari/LightGBM/blob/part/src/boosting/part.hpp)

This is implemented as a LightGBM custom booster.
The following is a fork of [the Kaggle Zillow Prize Kernel](https://www.kaggle.com/guolinke/simple-lightgbm-starter-lb-0-06487/code).

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import gc

In [2]:
# Load the Zillow transaction log (train) and the per-parcel property table
# (prop). Both files are parsed with pandas' pure-Python engine.
train, prop = (
    pd.read_csv(csv_path, engine='python')
    for csv_path in (
        'input/zillow/train_2016_v2.csv',
        'input/zillow/properties_2016.csv',
    )
)

In [3]:
# Downcast every float64 property column to float32 before merging.
for name, kind in prop.dtypes.items():
    if kind == np.float64:
        prop[name] = prop[name].astype(np.float32)

# Attach the property features to each transaction row via the parcel id.
df_train = pd.merge(train, prop, how='left', on='parcelid')

# Columns that are excluded from the feature matrix.
col = [
    'parcelid',
    'logerror',
    'transactiondate',
    'propertyzoningdesc',
    'propertycountylandusecode',
]
x_train = df_train.drop(col, axis=1)
y_train = df_train['logerror'].values  # regression target
print(x_train.shape, y_train.shape)
train_columns = x_train.columns

# Turn every remaining object-dtype column into a boolean column by comparing
# it element-wise with True.
# NOTE(review): any value that is not equal to True (including strings and
# NaN) becomes False here -- confirm this is the intended encoding.
object_columns = [name for name, kind in x_train.dtypes.items() if kind == object]
for name in object_columns:
    x_train[name] = x_train[name].eq(True)

# The merged frame is no longer needed once features and target are extracted.
del df_train

(90275, 55) (90275,)


In [4]:
# Positional hold-out: the first `split` rows are used for training, the
# remaining rows form the validation set used for the final comparison.
split = 80000

xt = x_train.iloc[:split].values.astype(np.float32, copy=False)
xv = x_train.iloc[split:].values.astype(np.float32, copy=False)
yt = y_train[:split]
yv = y_train[split:]

# free_raw_data=False keeps the raw arrays attached to the Dataset objects.
ds_train = lgb.Dataset(data=xt, label=yt, free_raw_data=False)
ds_valid = lgb.Dataset(data=xv, label=yv, free_raw_data=False)

In [5]:
# Baseline booster configuration (plain GBDT, L2 regression objective,
# evaluated with MAE). The PART run later starts from these same settings.
prm = dict(
    learning_rate=0.002,
    boosting_type='gbdt',
    objective='regression',
    metric='mae',
    sub_feature=0.5,
    num_leaves=60,
    min_data=500,
    min_hessian=1,
)

# Number of boosting rounds used for both boosters.
num_round = 500

In [6]:
# Baseline: ordinary GBDT trained on the full training slice, no validation set.
clf_gbdt = lgb.train(params=prm, train_set=ds_train, num_boost_round=num_round)

In [7]:
# Build the PART configuration from a *copy* of the GBDT parameters.
# A plain `prm_part = prm` would only create a second name for the same dict,
# so the assignments below would silently turn `prm` itself into a PART
# configuration and corrupt any later re-run of the GBDT baseline cell.
prm_part = dict(prm)
prm_part['boosting_type'] = 'part'
prm_part['learning_rate'] = 0.002
# NOTE(review): drop_rate / skip_drop are presumably read by the custom PART
# booster in the fork -- confirm against part.hpp.
prm_part['drop_rate'] = 0.0
prm_part['skip_drop'] = 0.0

# Fixed seed so the random partition below is reproducible.
np.random.seed(20180212)

# Randomly assign each training row to the split-search set (probability 0.7)
# or to the peeking set (probability 0.3); the two masks are complementary.
flg_part = np.random.choice([True, False], len(yt), replace=True, p=[0.7, 0.3])
flg_peek = np.logical_not(flg_part)

ds_part = lgb.Dataset(xt[flg_part], label=yt[flg_part], free_raw_data=False)
ds_peek = lgb.Dataset(xt[flg_peek], label=yt[flg_peek], free_raw_data=False)

In [8]:
# Train the PART booster on the split-search subset. The peeking set is
# supplied as the validation set, so the per-iteration log below reports its
# MAE ("valid_0's l1").
clf_part = lgb.train(
    params=prm_part,
    train_set=ds_part,
    num_boost_round=num_round,
    valid_sets=ds_peek,
)

[1]	valid_0's l1: 0.0683414
[2]	valid_0's l1: 0.0683379
[3]	valid_0's l1: 0.0683343
[4]	valid_0's l1: 0.068331
[5]	valid_0's l1: 0.0683291
[6]	valid_0's l1: 0.0683264
[7]	valid_0's l1: 0.0683249
[8]	valid_0's l1: 0.0683225
[9]	valid_0's l1: 0.06832
[10]	valid_0's l1: 0.0683163
[11]	valid_0's l1: 0.0683139
[12]	valid_0's l1: 0.0683106
[13]	valid_0's l1: 0.0683076
[14]	valid_0's l1: 0.0683049
[15]	valid_0's l1: 0.0683014
[16]	valid_0's l1: 0.0682984
[17]	valid_0's l1: 0.0682964
[18]	valid_0's l1: 0.0682937
[19]	valid_0's l1: 0.0682904
[20]	valid_0's l1: 0.0682873
[21]	valid_0's l1: 0.0682854
[22]	valid_0's l1: 0.0682819
[23]	valid_0's l1: 0.0682799
[24]	valid_0's l1: 0.068277
[25]	valid_0's l1: 0.0682755
[26]	valid_0's l1: 0.0682727
[27]	valid_0's l1: 0.0682709
[28]	valid_0's l1: 0.0682689
[29]	valid_0's l1: 0.068267
[30]	valid_0's l1: 0.0682641
[31]	valid_0's l1: 0.0682614
[32]	valid_0's l1: 0.0682589
[33]	valid_0's l1: 0.0682562
[34]	valid_0's l1: 0.0682533
[35]	valid_0's l1: 0.0682514

[281]	valid_0's l1: 0.0679003
[282]	valid_0's l1: 0.067899
[283]	valid_0's l1: 0.0678986
[284]	valid_0's l1: 0.0678976
[285]	valid_0's l1: 0.0678962
[286]	valid_0's l1: 0.0678956
[287]	valid_0's l1: 0.0678946
[288]	valid_0's l1: 0.0678936
[289]	valid_0's l1: 0.0678932
[290]	valid_0's l1: 0.067892
[291]	valid_0's l1: 0.067891
[292]	valid_0's l1: 0.0678904
[293]	valid_0's l1: 0.0678898
[294]	valid_0's l1: 0.0678894
[295]	valid_0's l1: 0.0678881
[296]	valid_0's l1: 0.0678875
[297]	valid_0's l1: 0.0678871
[298]	valid_0's l1: 0.0678866
[299]	valid_0's l1: 0.0678863
[300]	valid_0's l1: 0.0678855
[301]	valid_0's l1: 0.0678842
[302]	valid_0's l1: 0.0678832
[303]	valid_0's l1: 0.0678825
[304]	valid_0's l1: 0.0678819
[305]	valid_0's l1: 0.0678813
[306]	valid_0's l1: 0.0678804
[307]	valid_0's l1: 0.0678798
[308]	valid_0's l1: 0.0678791
[309]	valid_0's l1: 0.0678784
[310]	valid_0's l1: 0.0678776
[311]	valid_0's l1: 0.0678769
[312]	valid_0's l1: 0.0678758
[313]	valid_0's l1: 0.0678749
[314]	valid_0

In [9]:
from sklearn.metrics import mean_absolute_error
def get_score(x, y, clf, ii):
    """Return the mean absolute error of ``clf`` on ``(x, y)``.

    Parameters
    ----------
    x : feature matrix handed to ``clf.predict``.
    y : true target values.
    clf : model exposing ``predict(x, num_iteration=...)``.
    ii : value forwarded as ``num_iteration`` to ``clf.predict``.
    """
    predicted = clf.predict(x, num_iteration=ii)
    return mean_absolute_error(y, predicted)
# Score both boosters on the validation set at every 5th iteration count,
# starting at 70% of the boosting rounds and going up to num_round inclusive.
lab = list(range(int(0.7 * num_round), num_round + 1, 5))
val_gbdt = [get_score(xv, yv, clf_gbdt, n_iter) for n_iter in lab]
val_part = [get_score(xv, yv, clf_part, n_iter) for n_iter in lab]

In [10]:
# Report the best (lowest) validation MAE reached by each booster over the
# evaluated iteration counts.
print(f'GBDT: {np.min(val_gbdt)}')
print(f'PART: {np.min(val_part)}')

GBDT: 0.06612165068883384
PART: 0.06612067704950389
