In [1]:
import os
import time

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn
import xgboost as xgb
from sklearn import preprocessing

%matplotlib inline

In [2]:
# Read the training table from the local input directory and show its
# (rows, columns) shape as a quick sanity check.
train = pd.read_csv(filepath_or_buffer='input/otto_train.csv')
print(train.shape)

(61878, 95)


In [3]:
def encode_features(dat):
    """Replace every column of ``dat`` with integer codes.

    For each column, the distinct values are sorted (via ``np.unique``) and
    each cell is replaced by the zero-based rank of its value among those
    distinct values.

    Parameters
    ----------
    dat : pandas.DataFrame
        Input table; every column is encoded independently.

    Returns
    -------
    pandas.DataFrame
        Same column labels and order as ``dat``, indexed by
        ``dat.index.values``, with integer codes in every column.

    Notes
    -----
    The codes come straight from ``np.unique(..., return_inverse=True)``,
    so each column is processed in a single pass instead of one full scan
    per distinct value. A value that does not compare equal to itself
    (e.g. NaN) therefore receives whatever code ``np.unique`` assigns to it
    rather than silently sharing code 0 with the smallest real value.
    """
    encoded = {}
    for c in dat.columns.values:
        # return_inverse yields, for every row, the position of its value
        # in the sorted array of distinct values -- exactly the code we want.
        _, codes = np.unique(dat[c], return_inverse=True)
        encoded[c] = codes.astype(int)
    # Build the frame once; the dict preserves the original column order.
    return pd.DataFrame(encoded, index=dat.index.values)

In [4]:
# Feature matrix: every column except the row id and the label, integer-coded.
x = encode_features(train.drop(columns=['id', 'target']))
# Labels: take the piece after the first '_' of each target string, parse it
# as an integer and shift it down by one so the classes start at 0.
y = np.array([int(label.split('_')[1]) - 1 for label in train['target']])
print(x.shape, y.shape)

(61878, 93) (61878,)


In [5]:
# Number of distinct class labels; passed to both boosters as num_class.
distinct_labels = np.unique(y)
num_cls = len(distinct_labels)
print(num_cls)

9


In [6]:
# XGBoost settings: multiclass probabilities from depth-5 trees, scored by
# multiclass log loss.
prm_xgb = dict(
    booster='gbtree',
    objective='multi:softprob',
    num_class=num_cls,
    max_depth=5,
    learning_rate=0.1,
    colsample_bytree=0.9,
    subsample=0.9,
    eval_metric='mlogloss',
)

# LightGBM settings: same learning rate and 0.9 feature/row sampling
# fractions as above, with trees capped at 2**5 - 1 = 31 leaves.
prm_lgb = dict(
    boosting_type='gbdt',
    objective='multiclass',
    num_class=num_cls,
    num_leaves=2 ** 5 - 1,
    learning_rate=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=1,
    metric='multi_logloss',
)

# Boosting rounds used for every model in this notebook.
num_round = 100

In [7]:
# Fixed seed so the train/validation assignment is reproducible.
np.random.seed(20161218)
# Each row independently lands in the training set with probability 0.7.
flg_train = np.random.choice([False, True], size=len(y), p=[0.3, 0.7])
# Everything not used for training is held out for validation.
flg_valid = ~flg_train

In [8]:
# XGBoost containers for the training and validation rows.
dt_xgb = xgb.DMatrix(x[flg_train], y[flg_train])
dv_xgb = xgb.DMatrix(x[flg_valid], y[flg_valid])

# LightGBM containers for the plain (numeric-feature) run; the validation set
# references its training set.
dt_lgb = lgb.Dataset(x[flg_train], y[flg_train])
dv_lgb = lgb.Dataset(x[flg_valid], y[flg_valid], reference=dt_lgb)

# Separate LightGBM containers for the categorical-feature run. They keep the
# raw data (free_raw_data=False) -- presumably so categorical_feature can
# still be applied when training starts; confirm against the LightGBM docs.
# The validation set must reference *its own* training set (dt_lgb_c), not
# the plain dt_lgb used by the other run.
dt_lgb_c = lgb.Dataset(x[flg_train], y[flg_train], free_raw_data=False)
dv_lgb_c = lgb.Dataset(x[flg_valid], y[flg_valid], free_raw_data=False,
                       reference=dt_lgb_c)

In [9]:
# Time the XGBoost training call only. perf_counter is a monotonic,
# high-resolution clock meant for measuring intervals, so the result is not
# affected by system clock adjustments.
time_s = time.perf_counter()
obj_xgb = xgb.train(
    prm_xgb, dt_xgb, num_boost_round=num_round,
    # Passed by keyword: newer XGBoost releases treat `evals` as keyword-only.
    evals=[(dt_xgb, 'train'), (dv_xgb, 'valid')])
time_t = time.perf_counter()
# Elapsed training time in seconds.
print(time_t - time_s)

[0]	train-mlogloss:1.97769	valid-mlogloss:1.98167
[1]	train-mlogloss:1.81571	valid-mlogloss:1.82305
[2]	train-mlogloss:1.68693	valid-mlogloss:1.69735
[3]	train-mlogloss:1.58784	valid-mlogloss:1.60077
[4]	train-mlogloss:1.49629	valid-mlogloss:1.51075
[5]	train-mlogloss:1.41958	valid-mlogloss:1.43592
[6]	train-mlogloss:1.35004	valid-mlogloss:1.368
[7]	train-mlogloss:1.28977	valid-mlogloss:1.30914
[8]	train-mlogloss:1.23794	valid-mlogloss:1.25887
[9]	train-mlogloss:1.18878	valid-mlogloss:1.21078
[10]	train-mlogloss:1.14482	valid-mlogloss:1.16806
[11]	train-mlogloss:1.10467	valid-mlogloss:1.12878
[12]	train-mlogloss:1.06779	valid-mlogloss:1.09323
[13]	train-mlogloss:1.03423	valid-mlogloss:1.06093
[14]	train-mlogloss:1.00338	valid-mlogloss:1.03145
[15]	train-mlogloss:0.975446	valid-mlogloss:1.00447
[16]	train-mlogloss:0.950178	valid-mlogloss:0.980001
[17]	train-mlogloss:0.927154	valid-mlogloss:0.957823
[18]	train-mlogloss:0.904816	valid-mlogloss:0.936438
[19]	train-mlogloss:0.88535	valid-ml

In [10]:
# Time the LightGBM training call only. perf_counter is a monotonic,
# high-resolution clock meant for measuring intervals.
time_s = time.perf_counter()
obj_lgb = lgb.train(
    prm_lgb, dt_lgb, num_boost_round=num_round,
    valid_sets=[dv_lgb])
time_t = time.perf_counter()
# Elapsed training time in seconds (model saving below is not included).
print(time_t - time_s)
# Make sure the target directory exists so a fresh checkout can run this cell.
os.makedirs('output', exist_ok=True)
obj_lgb.save_model('output/lgb.txt')

[1]	valid_0's multi_logloss:1.96589
[2]	valid_0's multi_logloss:1.80069
[3]	valid_0's multi_logloss:1.67048
[4]	valid_0's multi_logloss:1.56283
[5]	valid_0's multi_logloss:1.47489
[6]	valid_0's multi_logloss:1.39621
[7]	valid_0's multi_logloss:1.32853
[8]	valid_0's multi_logloss:1.26781
[9]	valid_0's multi_logloss:1.21342
[10]	valid_0's multi_logloss:1.16519
[11]	valid_0's multi_logloss:1.12163
[12]	valid_0's multi_logloss:1.08238
[13]	valid_0's multi_logloss:1.0463
[14]	valid_0's multi_logloss:1.01403
[15]	valid_0's multi_logloss:0.984063
[16]	valid_0's multi_logloss:0.956702
[17]	valid_0's multi_logloss:0.931502
[18]	valid_0's multi_logloss:0.909192
[19]	valid_0's multi_logloss:0.887876
[20]	valid_0's multi_logloss:0.868728
[21]	valid_0's multi_logloss:0.850413
[22]	valid_0's multi_logloss:0.833369
[23]	valid_0's multi_logloss:0.817609
[24]	valid_0's multi_logloss:0.802913
[25]	valid_0's multi_logloss:0.789397
[26]	valid_0's multi_logloss:0.776372
[27]	valid_0's multi_logloss:0.76423

In [11]:
# Time LightGBM training with every feature column declared categorical.
# perf_counter is a monotonic, high-resolution clock meant for intervals.
time_s = time.perf_counter()
obj_lgb = lgb.train(
    prm_lgb, dt_lgb_c, num_boost_round=num_round,
    valid_sets=[dv_lgb_c],
    # Column positions 0 .. n_features-1, i.e. all columns of x.
    categorical_feature=list(range(x.shape[1])))
time_t = time.perf_counter()
# Elapsed training time in seconds (model saving below is not included).
print(time_t - time_s)
# Make sure the target directory exists so a fresh checkout can run this cell.
os.makedirs('output', exist_ok=True)
obj_lgb.save_model('output/lgb_cat.txt')

[1]	valid_0's multi_logloss:2.01067
[2]	valid_0's multi_logloss:1.87489
[3]	valid_0's multi_logloss:1.76438
[4]	valid_0's multi_logloss:1.67353
[5]	valid_0's multi_logloss:1.59879
[6]	valid_0's multi_logloss:1.53027
[7]	valid_0's multi_logloss:1.47235
[8]	valid_0's multi_logloss:1.41975
[9]	valid_0's multi_logloss:1.37189
[10]	valid_0's multi_logloss:1.32934
[11]	valid_0's multi_logloss:1.29121
[12]	valid_0's multi_logloss:1.25617
[13]	valid_0's multi_logloss:1.22375
[14]	valid_0's multi_logloss:1.19445
[15]	valid_0's multi_logloss:1.1664
[16]	valid_0's multi_logloss:1.14143
[17]	valid_0's multi_logloss:1.11888
[18]	valid_0's multi_logloss:1.09807
[19]	valid_0's multi_logloss:1.07792
[20]	valid_0's multi_logloss:1.05924
[21]	valid_0's multi_logloss:1.04119
[22]	valid_0's multi_logloss:1.02435
[23]	valid_0's multi_logloss:1.00902
[24]	valid_0's multi_logloss:0.994402
[25]	valid_0's multi_logloss:0.981219
[26]	valid_0's multi_logloss:0.967938
[27]	valid_0's multi_logloss:0.955806
[28]	va