## Hyperparameter search

In [22]:
# Core H2O client plus plotting support used in this tutorial.
import h2o
# `plt` is the conventional alias for the pyplot interface; aliasing the top-level
# `matplotlib` package under that name would make calls such as plt.plot() fail.
import matplotlib.pyplot as plt

# Estimators explored in the hyperparameter searches below.
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Grid-search driver (used for both Cartesian and random search).
import h2o.grid
from h2o.grid.grid_search import H2OGridSearch

In [23]:
# Attach to (or start) the local H2O cluster used by every cell below.
# NOTE(review): `h2o` is already imported in the first cell; this repeat is redundant but harmless.
import h2o
# NOTE(review): per the H2O docs an integer max_mem_size is read as gigabytes and only applies when
# a new server is launched; the recorded output shows this call connected to an existing instance.
h2o.init(max_mem_size=16)

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,21 hours 35 mins
H2O cluster timezone:,Etc/UTC
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.2
H2O cluster version age:,1 month and 14 days
H2O cluster name:,H2O_from_python_unknownUser_b8im2o
H2O cluster total nodes:,1
H2O cluster free memory:,2.931 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [24]:
# Parse the loan-level CSV straight from its public S3 URL into an H2O frame.
loan_level = h2o.import_file("https://s3.amazonaws.com/data.h2o.ai/DAI-Tutorials/loan_level_500k.csv")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [25]:
# Split the data into train / validation / test partitions. The two ratios size
# the first two partitions (70% and 15%); the remaining rows form the test set.
# The fixed seed keeps the split reproducible.
train, valid, test = loan_level.split_frame([0.7, 0.15], seed=42)
print("train:%d valid:%d test:%d" % (train.nrows, valid.nrows, test.nrows))

# Response column, and the columns that must never be used as predictors
# (the response itself plus the other excluded columns).
y = "DELINQUENT"
ignore = ["DELINQUENT", "PREPAID", "PREPAYMENT_PENALTY_MORTGAGE_FLAG", "PRODUCT_TYPE"]

# Filter in the frame's own column order so the predictor list is identical on
# every run; a set difference yields an arbitrary order that can change between
# kernel sessions.
x = [name for name in train.names if name not in ignore]

train:350268 valid:74971 test:74898


## Grid Search (Cartesian search, the default when no strategy is specified)

In [26]:

glm_grid = h2o.grid.H2OGridSearch (
    H2OGeneralizedLinearEstimator( 
        family = "binomial",
        lambda_search = True),
    
    hyper_params = {
        "alpha": [x*0.01 for x in range(0, 4)],
        "lambda": [x*1e-6 for x in range(0, 4)],
        },
    
    grid_id = "glm_grid_2",
    
)
%time glm_grid.train(x=x, y=y, training_frame=train, validation_frame = valid)

glm Grid Build progress: |████████████████████████████████████████████████| 100%
CPU times: user 755 ms, sys: 55.9 ms, total: 811 ms
Wall time: 35.5 s


## Random Search

In [27]:

glm_grid = h2o.grid.H2OGridSearch (
    H2OGeneralizedLinearEstimator( 
        family = "binomial",
        lambda_search = True),
    
    hyper_params = {
        "alpha": [x*0.01 for x in range(0, 100)],
        "lambda": [x*1e-6 for x in range(0, 1000)],
        },
    
    grid_id = "glm_grid",
    
    search_criteria = {
        "strategy":"RandomDiscrete", 
        "max_models":100,
        "max_runtime_secs":300,
        "seed":42
        }
)
%time glm_grid.train(x=x, y=y, training_frame=train, validation_frame = valid)

glm Grid Build progress: |████████████████████████████████████████████████| 100%
CPU times: user 4.73 s, sys: 504 ms, total: 5.23 s
Wall time: 3min 26s


In [28]:
# List the keys of the objects (models, grids, ...) currently held by the H2O cluster.
h2o.ls()

Unnamed: 0,key
0,GLM_model_python_1583377547966_1
1,GLM_model_python_1583377547966_11
2,GLM_model_python_1583377547966_14
3,GLM_model_python_1583377547966_3
4,GLM_model_python_1583377547966_5
5,GLM_model_python_1583377547966_8
6,glm_grid
7,glm_grid_2
8,glm_grid_2_model_1
9,glm_grid_2_model_10


In [29]:
# Show the built-in documentation for H2OGridSearch, including the search_criteria options.
# NOTE(review): interactive aid only; consider dropping this cell from a finished notebook.
help(h2o.grid.H2OGridSearch)

Help on class H2OGridSearch in module h2o.grid.grid_search:

class H2OGridSearch(H2OGridSearch)
 |  Grid Search of a Hyper-Parameter Space for a Model
 |  
 |  :param model: The type of model to be explored initialized with optional parameters that will be
 |      unchanged across explored models.
 |  :param hyper_params: A dictionary of string parameters (keys) and a list of values to be explored by grid
 |      search (values).
 |  :param str grid_id: The unique id assigned to the resulting grid object. If none is given, an id will
 |      automatically be generated.
 |  :param search_criteria:  The optional dictionary of directives which control the search of the hyperparameter space.
 |      The dictionary can include values for: ``strategy``, ``max_models``, ``max_runtime_secs``, ``stopping_metric``, 
 |      ``stopping_tolerance``, ``stopping_rounds`` and ``seed``. The default strategy, "Cartesian", covers the entire space of 
 |      hyperparameter combinations. If you want to u

In [30]:
# Display the random-search grid ranked by AUC, highest first.
glm_grid.get_grid(sort_by='auc',decreasing=True)

                       alpha                   lambda           model_ids  \
0                     [0.87]  [4.9999999999999996E-6]   glm_grid_model_61   
1                      [0.4]                 [1.8E-5]   glm_grid_model_46   
2                     [0.07]                 [3.7E-5]   glm_grid_model_48   
3                     [0.07]                 [5.6E-5]   glm_grid_model_72   
4                     [0.48]                 [2.9E-5]   glm_grid_model_96   
5                     [0.88]                 [3.1E-5]   glm_grid_model_37   
6                     [0.18]   [8.099999999999999E-5]   glm_grid_model_86   
7                     [0.15]  [1.3099999999999999E-4]   glm_grid_model_30   
8                      [0.1]                [1.59E-4]   glm_grid_model_38   
9                     [0.06]                [2.12E-4]   glm_grid_model_78   
10                    [0.53]   [9.499999999999999E-5]   glm_grid_model_28   
11     [0.41000000000000003]                [1.37E-4]   glm_grid_model_67   



In [36]:
# Show the full report (training and validation metrics) of the grid's first model.
# NOTE(review): models[0] follows the grid's own ordering; the recorded summary output lists models
# in a slightly different order than the AUC ranking, so use
# glm_grid.get_grid(sort_by='auc', decreasing=True)[0] if the AUC-best model is what is intended.
glm_grid.models[0]

Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  glm_grid_model_61


GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,binomial,logit,"Elastic Net (alpha = 0.87, lambda = 5.0E-6 )","nlambda = 100, lambda.max = 0.03808, lambda.min = 5.0E-6, lambda.1...",161,143,7,py_15_sid_9664




ModelMetricsBinomialGLM: glm
** Reported on train data. **

MSE: 0.031344275605674536
RMSE: 0.17704314616972477
LogLoss: 0.12279979897845819
Null degrees of freedom: 350267
Residual degrees of freedom: 350124
Null deviance: 108932.13150368733
Residual deviance: 86025.67997717319
AIC: 86313.67997717319
AUC: 0.8519842670925402
AUCPR: 0.21046685420921254
Gini: 0.7039685341850803

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.13993235618594693: 


Unnamed: 0,Unnamed: 1,FALSE,TRUE,Error,Rate
0,FALSE,323805.0,13802.0,0.0409,(13802.0/337607.0)
1,TRUE,8129.0,4532.0,0.6421,(8129.0/12661.0)
2,Total,331934.0,18334.0,0.0626,(21931.0/350268.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.139932,0.292434,200.0
1,max f2,0.068698,0.389555,264.0
2,max f0point5,0.212665,0.290398,157.0
3,max accuracy,0.981772,0.963851,0.0
4,max precision,0.562303,0.421203,45.0
5,max recall,0.000789,1.0,398.0
6,max specificity,0.981772,0.999997,0.0
7,max absolute_mcc,0.099058,0.269921,234.0
8,max min_per_class_accuracy,0.038519,0.77424,305.0
9,max mean_per_class_accuracy,0.036145,0.775428,309.0



Gains/Lift Table: Avg response rate:  3.61 %, avg score:  3.61 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010001,0.33225,10.724871,10.724871,0.387668,0.46616,0.387668,0.46616,0.107259,0.107259,972.487123,972.487123
1,,2,0.020002,0.241718,7.629032,9.176952,0.275764,0.280435,0.331716,0.373297,0.076297,0.183556,662.903211,817.695167
2,,3,0.030003,0.195284,6.452297,8.268734,0.233229,0.216827,0.298887,0.32114,0.064529,0.248085,545.229734,726.873356
3,,4,0.040001,0.165076,5.166472,7.493334,0.18675,0.179199,0.270859,0.285663,0.051655,0.299739,416.647177,649.333418
4,,5,0.050002,0.143363,4.770119,6.94866,0.172424,0.153562,0.25117,0.259241,0.047706,0.347445,377.011946,594.866013
5,,6,0.100001,0.087455,3.342624,5.145694,0.120825,0.110895,0.185999,0.18507,0.167127,0.514572,234.262433,414.569371
6,,7,0.150002,0.062108,2.397833,4.229723,0.086674,0.073579,0.15289,0.147906,0.119896,0.634468,139.783271,322.972261
7,,8,0.200001,0.047315,1.600226,3.572367,0.057843,0.054179,0.129129,0.124475,0.080009,0.714478,60.022611,257.236725
8,,9,0.300002,0.030452,1.121548,2.755427,0.04054,0.037965,0.099599,0.095638,0.112155,0.826633,12.154798,175.542749
9,,10,0.399999,0.020842,0.645303,2.227908,0.023326,0.025266,0.080531,0.078045,0.064529,0.891162,-35.469658,122.790777




ModelMetricsBinomialGLM: glm
** Reported on validation data. **

MSE: 0.031018805729749764
RMSE: 0.17612156520355413
LogLoss: 0.12242815235268398
Null degrees of freedom: 74970
Residual degrees of freedom: 74827
Null deviance: 22974.597464481732
Residual deviance: 18357.12202006614
AIC: 18645.12202006614
AUC: 0.8460502420206815
AUCPR: 0.2009137545141779
Gini: 0.6921004840413629

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.13091994899463488: 


Unnamed: 0,Unnamed: 1,FALSE,TRUE,Error,Rate
0,FALSE,69013.0,3300.0,0.0456,(3300.0/72313.0)
1,TRUE,1672.0,986.0,0.629,(1672.0/2658.0)
2,Total,70685.0,4286.0,0.0663,(4972.0/74971.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.13092,0.283986,201.0
1,max f2,0.071763,0.375903,256.0
2,max f0point5,0.242376,0.295791,133.0
3,max accuracy,0.974097,0.964533,0.0
4,max precision,0.376628,0.396364,83.0
5,max recall,0.000735,1.0,398.0
6,max specificity,0.974097,0.999986,0.0
7,max absolute_mcc,0.119399,0.260856,210.0
8,max min_per_class_accuracy,0.03762,0.764108,304.0
9,max mean_per_class_accuracy,0.034897,0.767316,309.0



Gains/Lift Table: Avg response rate:  3.55 %, avg score:  3.61 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010004,0.334125,10.793417,10.793417,0.382667,0.473002,0.382667,0.473002,0.107976,0.107976,979.341711,979.341711
1,,2,0.020008,0.243096,8.348915,9.571166,0.296,0.282541,0.339333,0.377771,0.083521,0.191497,734.891497,857.116604
2,,3,0.030012,0.193143,5.829197,8.323843,0.206667,0.216035,0.295111,0.323859,0.058315,0.249812,482.919739,732.384316
3,,4,0.040002,0.162847,4.895532,7.467623,0.173565,0.176602,0.264755,0.287082,0.048909,0.298721,389.553164,646.762264
4,,5,0.050006,0.141925,3.948811,6.763673,0.14,0.152013,0.239797,0.260061,0.039503,0.338224,294.881114,576.367262
5,,6,0.100012,0.087175,3.310363,5.037018,0.117365,0.110232,0.178581,0.185146,0.165538,0.503762,231.036257,403.70176
6,,7,0.150005,0.0618,2.227565,4.1007,0.078975,0.073379,0.145385,0.147897,0.111362,0.615124,122.756536,310.070006
7,,8,0.200011,0.047553,1.707846,3.502447,0.060549,0.054259,0.124175,0.124486,0.085403,0.700527,70.784615,250.244669
8,,9,0.300009,0.030349,1.10611,2.703703,0.039216,0.03807,0.095856,0.095682,0.110609,0.811136,10.610956,170.370316
9,,10,0.400008,0.020669,0.714833,2.206502,0.025343,0.025116,0.078229,0.078041,0.071482,0.882619,-28.516729,120.650213




Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,iteration,lambda,predictors,deviance_train,deviance_test
0,,2020-03-06 00:44:12,0.000 sec,7,5e-06,144,0.2456,0.244856




In [32]:
# Print the grid summary table: one row per model with its regularization settings,
# predictor counts and iteration count.
glm_grid.summary()


Grid Summary:



0,1,2,3,4,5,6,7,8
Model Id,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
glm_grid_model_61,binomial,logit,"Elastic Net (alpha = 0.87, lambda = 5.0E-6 )","nlambda = 100, lambda.max = 0.03808, lambda.min = 5.0E-6, lambda.1se = -1.0",161,143,7,py_15_sid_9664
glm_grid_model_46,binomial,logit,"Elastic Net (alpha = 0.4, lambda = 1.8E-5 )","nlambda = 100, lambda.max = 0.08282, lambda.min = 1.8E-5, lambda.1se = -1.0",161,137,7,py_15_sid_9664
glm_grid_model_48,binomial,logit,"Elastic Net (alpha = 0.07, lambda = 3.7E-5 )","nlambda = 100, lambda.max = 0.4733, lambda.min = 3.7E-5, lambda.1se = -1.0",161,145,5,py_15_sid_9664
glm_grid_model_96,binomial,logit,"Elastic Net (alpha = 0.48, lambda = 2.9E-5 )","nlambda = 100, lambda.max = 0.06902, lambda.min = 2.9E-5, lambda.1se = -1.0",161,110,7,py_15_sid_9664
glm_grid_model_72,binomial,logit,"Elastic Net (alpha = 0.07, lambda = 5.6E-5 )","nlambda = 100, lambda.max = 0.4733, lambda.min = 5.6E-5, lambda.1se = -1.0",161,137,5,py_15_sid_9664
glm_grid_model_37,binomial,logit,"Elastic Net (alpha = 0.88, lambda = 3.1E-5 )","nlambda = 100, lambda.max = 0.03765, lambda.min = 3.1E-5, lambda.1se = -1.0",161,93,7,py_15_sid_9664
glm_grid_model_86,binomial,logit,"Elastic Net (alpha = 0.18, lambda = 8.1E-5 )","nlambda = 100, lambda.max = 0.184, lambda.min = 8.1E-5, lambda.1se = -1.0",161,108,7,py_15_sid_9664
glm_grid_model_30,binomial,logit,"Elastic Net (alpha = 0.15, lambda = 1.31E-4 )","nlambda = 100, lambda.max = 0.2209, lambda.min = 1.31E-4, lambda.1se = -1.0",161,105,7,py_15_sid_9664
glm_grid_model_38,binomial,logit,"Elastic Net (alpha = 0.1, lambda = 1.59E-4 )","nlambda = 100, lambda.max = 0.3313, lambda.min = 1.59E-4, lambda.1se = -1.0",161,110,7,py_15_sid_9664


In [37]:
# Keep a handle on the grid re-sorted by AUC (best first) so models can be picked by rank below.
sorted_glm_grid = glm_grid.get_grid(sort_by='auc',decreasing=True)

In [38]:
# Inspect the parameter values actually used by the top-ranked (highest-AUC) model.
sorted_glm_grid[0].actual_params

{'model_id': 'glm_grid_model_61',
 'training_frame': 'py_15_sid_9664',
 'validation_frame': 'py_16_sid_9664',
 'nfolds': 0,
 'seed': 202,
 'keep_cross_validation_models': True,
 'keep_cross_validation_predictions': False,
 'keep_cross_validation_fold_assignment': False,
 'fold_assignment': 'AUTO',
 'fold_column': None,
 'response_column': 'DELINQUENT',
 'ignored_columns': ['PRODUCT_TYPE',
  'PREPAYMENT_PENALTY_MORTGAGE_FLAG',
  'PREPAID'],
 'random_columns': None,
 'ignore_const_cols': True,
 'score_each_iteration': False,
 'offset_column': None,
 'weights_column': None,
 'family': 'binomial',
 'rand_family': None,
 'tweedie_variance_power': 0.0,
 'tweedie_link_power': 1.0,
 'theta': 1e-10,
 'solver': 'COORDINATE_DESCENT',
 'alpha': [0.87],
 'lambda': [4.9999999999999996e-06],
 'lambda_search': True,
 'early_stopping': True,
 'nlambdas': 100,
 'standardize': True,
 'missing_values_handling': 'MeanImputation',
 'plug_values': None,
 'compute_p_values': False,
 'remove_collinear_columns'

In [39]:
# Compare the F1 of the two best models by AUC. Judging by the recorded output, each result is a
# [[threshold, F1]] pair that matches the "max f1" row of the training metrics -- TODO confirm
# against the H2O F1() documentation.
print(sorted_glm_grid[0].F1())
sorted_glm_grid[1].F1()

[[0.13993235618594693, 0.29243426359090174]]


[[0.1388919375261923, 0.29188911043931304]]

In [40]:
# Score the AUC-best grid model on the held-out test frame.
sorted_glm_grid[0].model_performance(test) # expected test AUC is about 0.8524, versus 0.8523 for the untuned model (figures quoted by the original author)


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.031143376101575086
RMSE: 0.17647485968708146
LogLoss: 0.12199693111453563
Null degrees of freedom: 74897
Residual degrees of freedom: 74754
Null deviance: 23061.156287645877
Residual deviance: 18274.652293232975
AIC: 18562.652293232975
AUC: 0.8524158062119054
AUCPR: 0.20258611034476104
Gini: 0.7048316124238108

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.13069466877003805: 


Unnamed: 0,Unnamed: 1,FALSE,TRUE,Error,Rate
0,FALSE,68851.0,3375.0,0.0467,(3375.0/72226.0)
1,TRUE,1676.0,996.0,0.6272,(1676.0/2672.0)
2,Total,70527.0,4371.0,0.0674,(5051.0/74898.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.130695,0.282834,202.0
1,max f2,0.064974,0.386713,264.0
2,max f0point5,0.206442,0.283731,151.0
3,max accuracy,0.939516,0.964311,0.0
4,max precision,0.602416,0.438596,32.0
5,max recall,0.000945,1.0,398.0
6,max specificity,0.939516,0.999986,0.0
7,max absolute_mcc,0.072018,0.263347,255.0
8,max min_per_class_accuracy,0.038326,0.773336,305.0
9,max mean_per_class_accuracy,0.032702,0.777303,315.0



Gains/Lift Table: Avg response rate:  3.57 %, avg score:  3.60 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.01,0.323153,9.954824,9.954824,0.35514,0.464324,0.35514,0.464324,0.099551,0.099551,895.4824,895.4824
1,,2,0.020001,0.238585,7.859072,8.906948,0.280374,0.275583,0.317757,0.369954,0.078593,0.178144,685.907158,790.694779
2,,3,0.030001,0.194006,6.362106,8.058667,0.226969,0.214539,0.287494,0.318149,0.063623,0.241766,536.210556,705.866705
3,,4,0.040001,0.164906,4.939988,7.278997,0.176235,0.178489,0.25968,0.283234,0.049401,0.291168,393.998785,627.899725
4,,5,0.050001,0.143184,4.378626,6.698923,0.156208,0.153711,0.238985,0.257329,0.043787,0.334955,337.862559,569.892292
5,,6,0.100003,0.087476,3.428052,5.063488,0.122296,0.111145,0.180641,0.184237,0.171407,0.506362,242.805217,406.348754
6,,7,0.150004,0.062147,2.492448,4.206475,0.088919,0.073591,0.150067,0.147355,0.124626,0.630988,149.244841,320.64745
7,,8,0.200005,0.047494,1.639178,3.56465,0.058478,0.054272,0.12717,0.124084,0.081961,0.712949,63.917779,256.465032
8,,9,0.300008,0.0303,1.205058,2.778119,0.042991,0.037927,0.09911,0.095365,0.120509,0.833458,20.505764,177.811943
9,,10,0.399997,0.020667,0.591381,2.231489,0.021098,0.025088,0.079609,0.077797,0.059132,0.89259,-40.861947,123.148945







## Random Forest

In [41]:
# Grid Search/ Cartesian Search by default or not specified
rf_grid = h2o.grid.H2OGridSearch (
    H2ORandomForestEstimator(nfolds=10),
    
    hyper_params = {
        "ntrees": [50,100],
        "max_depth": [10,20],
        },
    
     search_criteria = {
        "strategy":"RandomDiscrete", # Random Search 
        "max_models":100,
        "max_runtime_secs":300,
        "seed":42
        },
    
    grid_id = "rf_grid_2",
    
)
%time rf_grid.train(x=x, y=y, training_frame=train, validation_frame = valid)

drf Grid Build progress: |████████████████████████████████████████████████| 100%
CPU times: user 2.33 s, sys: 432 ms, total: 2.76 s
Wall time: 6min 45s


In [42]:
rf_grid.get_grid(sort_by='auc', decreasing=True)

    max_depth ntrees          model_ids                auc
0          20     28  rf_grid_2_model_1  0.818864830103598




### Get the best model and continue training from it (checkpoint restart)

In [43]:
best_model = rf_grid.get_grid(sort_by="auc", decreasing=True)[0]

rf = H2ORandomForestEstimator (seed=42, model_id='default_random_forest', checkpoint=best_model.model_id)
%time rf.train(x=x, y=y, training_frame=train, validation_frame=valid)

drf Model Build progress: |███████████████████████████████████████████████| 100%
CPU times: user 347 ms, sys: 109 ms, total: 456 ms
Wall time: 39.7 s


In [45]:
# Summarise the resumed forest: tree count, model size, and depth/leaf statistics.
rf.summary()


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,50.0,50.0,7675073.0,20.0,20.0,8.8,9881.0,11724.0,4780.06


