{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# This is a demo of class balancing in H2O's Distributed Random Forest (DRF)\n", "# It uploads the airlines training data and splits it into training and validation frames\n", "# Then, it trains DRF with and without balance_classes and scores both models on a test set\n", "import h2o\n", "from h2o.estimators.random_forest import H2ORandomForestEstimator" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Warning: Version mismatch. H2O is version 3.5.0.99999, but the python package is version UNKNOWN.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
H2O cluster uptime: 44 minutes 50 seconds 74 milliseconds
H2O cluster version: 3.5.0.99999
H2O cluster name: ludirehak
H2O cluster total nodes: 1
H2O cluster total memory: 3.56 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321
" ], "text/plain": [ "-------------------------- -------------------------------------\n", "H2O cluster uptime: 44 minutes 50 seconds 74 milliseconds\n", "H2O cluster version: 3.5.0.99999\n", "H2O cluster name: ludirehak\n", "H2O cluster total nodes: 1\n", "H2O cluster total memory: 3.56 GB\n", "H2O cluster total cores: 8\n", "H2O cluster allowed cores: 8\n", "H2O cluster healthy: True\n", "H2O Connection ip: 127.0.0.1\n", "H2O Connection port: 54321\n", "-------------------------- -------------------------------------" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "h2o.init()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Parse Progress: [##################################################] 100%\n", "Uploaded pya01a74e5-0aa6-4ef0-ae1a-0d3fe860eee9 into cluster with 24,421 rows and 12 cols\n" ] } ], "source": [ "from h2o.utils.shared_utils import _locate # private function, used to find files within the h2o git project directory.\n", "\n", "air = h2o.upload_file(path=_locate(\"smalldata/airlines/AirlinesTrain.csv.zip\"))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "r = air[0].runif()\n", "air_train = air[r < 0.8]\n", "air_valid = air[r >= 0.8]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "myX = [\"Origin\", \"Dest\", \"Distance\", \"UniqueCarrier\", \"fMonth\", \"fDayofMonth\", \"fDayOfWeek\"]\n", "myY = \"IsDepDelayed\"" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "drf Model Build Progress: [##################################################] 100%\n", "Model Details\n", "=============\n", "H2ORandomForestEstimator : Distributed RF\n", "Model Key: DRF_model_python_1445557087082_2742\n", "\n", "Model Summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
number_of_treesmodel_size_in_bytesmin_depthmax_depthmean_depthmin_leavesmax_leavesmean_leaves
10.0287650.020.020.020.01664.02418.02103.5
" ], "text/plain": [ " number_of_trees model_size_in_bytes min_depth max_depth mean_depth min_leaves max_leaves mean_leaves\n", "-- ----------------- --------------------- ----------- ----------- ------------ ------------ ------------ -------------\n", " 10 287650 20 20 20 1664 2418 2103.5" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "ModelMetricsBinomial: drf\n", "** Reported on train data. **\n", "\n", "MSE: 0.269503006052\n", "R^2: -0.0873991649123\n", "LogLoss: 2.43382549553\n", "AUC: 0.646622642412\n", "Gini: 0.293245284825\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.402941766395:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
NOYESErrorRate
NO1948.06780.00.7768 (6780.0/8728.0)
YES936.09580.00.089 (936.0/10516.0)
Total2884.016360.00.401 (7716.0/19244.0)
" ], "text/plain": [ " NO YES Error Rate\n", "----- ---- ----- ------- ----------------\n", "NO 1948 6780 0.7768 (6780.0/8728.0)\n", "YES 936 9580 0.089 (936.0/10516.0)\n", "Total 2884 16360 0.401 (7716.0/19244.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Maximum Metrics: Maximum metrics at their respective thresholds\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
metricthresholdvalueidx
max f10.40.7299.0
max f20.00.9399.0
max f0point50.60.7190.0
max accuracy0.60.6193.0
max precision0.90.730.0
max absolute_MCC0.60.2190.0
max min_per_class_accuracy0.70.6140.0
" ], "text/plain": [ "metric threshold value idx\n", "-------------------------- ----------- -------- -----\n", "max f1 0.402942 0.712904 299\n", "max f2 0 0.857637 399\n", "max f0point5 0.649503 0.653173 190\n", "max accuracy 0.643019 0.624662 193\n", "max precision 0.938886 0.697052 30\n", "max absolute_MCC 0.649503 0.234208 190\n", "max min_per_class_accuracy 0.737524 0.611449 140" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "ModelMetricsBinomial: drf\n", "** Reported on validation data. **\n", "\n", "MSE: 0.245293478794\n", "R^2: 0.00968032826017\n", "LogLoss: 0.758757679035\n", "AUC: 0.685987609758\n", "Gini: 0.371975219515\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.42132409513:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
NOYESErrorRate
NO467.01781.00.7923 (1781.0/2248.0)
YES160.02566.00.0587 (160.0/2726.0)
Total627.04347.00.3902 (1941.0/4974.0)
" ], "text/plain": [ " NO YES Error Rate\n", "----- ---- ----- ------- ---------------\n", "NO 467 1781 0.7923 (1781.0/2248.0)\n", "YES 160 2566 0.0587 (160.0/2726.0)\n", "Total 627 4347 0.3902 (1941.0/4974.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Maximum Metrics: Maximum metrics at their respective thresholds\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
metricthresholdvalueidx
max f10.40.7315.0
max f20.20.9396.0
max f0point50.70.7174.0
max accuracy0.70.6200.0
max precision1.00.90.0
max absolute_MCC0.70.3174.0
max min_per_class_accuracy0.70.6165.0
" ], "text/plain": [ "metric threshold value idx\n", "-------------------------- ----------- -------- -----\n", "max f1 0.421324 0.725576 315\n", "max f2 0.150007 0.858637 396\n", "max f0point5 0.711122 0.674532 174\n", "max accuracy 0.668563 0.645758 200\n", "max precision 1 0.907895 0\n", "max absolute_MCC 0.711122 0.279686 174\n", "max min_per_class_accuracy 0.726159 0.63573 165" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Scoring History:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
timestampdurationnumber_of_treestraining_MSEtraining_loglosstraining_AUCtraining_classification_errorvalidation_MSEvalidation_loglossvalidation_AUCvalidation_classification_error
2015-10-22 17:22:58 0.074 sec1.00.38.40.60.40.38.10.60.5
2015-10-22 17:22:58 0.163 sec2.00.37.40.60.40.34.00.60.4
2015-10-22 17:22:58 0.245 sec3.00.36.50.60.40.32.60.60.4
2015-10-22 17:22:58 0.311 sec4.00.35.60.60.50.31.90.70.4
2015-10-22 17:22:58 0.391 sec5.00.34.80.60.40.31.40.70.4
2015-10-22 17:22:58 0.480 sec6.00.34.00.60.40.31.10.70.4
2015-10-22 17:22:58 0.565 sec7.00.33.60.60.40.21.00.70.4
2015-10-22 17:22:58 0.659 sec8.00.33.10.60.40.20.90.70.4
2015-10-22 17:22:58 0.751 sec9.00.32.70.60.40.20.80.70.4
2015-10-22 17:22:58 0.851 sec10.00.32.40.60.40.20.80.70.4
" ], "text/plain": [ " timestamp duration number_of_trees training_MSE training_logloss training_AUC training_classification_error validation_MSE validation_logloss validation_AUC validation_classification_error\n", "-- ------------------- ---------- ----------------- -------------- ------------------ -------------- ------------------------------- ---------------- -------------------- ---------------- ---------------------------------\n", " 2015-10-22 17:22:58 0.074 sec 1 0.336005 8.38071 0.593241 0.447603 0.328738 8.12159 0.601799 0.45195\n", " 2015-10-22 17:22:58 0.163 sec 2 0.324066 7.38312 0.593177 0.448151 0.284953 3.9759 0.623903 0.400483\n", " 2015-10-22 17:22:58 0.245 sec 3 0.313138 6.50304 0.604313 0.449285 0.271995 2.58926 0.637956 0.414154\n", " 2015-10-22 17:22:58 0.311 sec 4 0.303166 5.59741 0.612553 0.4509 0.262972 1.87702 0.651933 0.415963\n", " 2015-10-22 17:22:58 0.391 sec 5 0.293768 4.79832 0.621609 0.423103 0.256723 1.38846 0.662515 0.392843\n", " 2015-10-22 17:22:58 0.480 sec 6 0.285971 4.01094 0.629809 0.412818 0.251944 1.11198 0.67238 0.388219\n", " 2015-10-22 17:22:58 0.565 sec 7 0.28125 3.55467 0.636323 0.399323 0.249657 0.991554 0.67852 0.382589\n", " 2015-10-22 17:22:58 0.659 sec 8 0.277031 3.11835 0.639363 0.399567 0.246953 0.880982 0.682305 0.386811\n", " 2015-10-22 17:22:58 0.751 sec 9 0.271668 2.73216 0.645331 0.39977 0.245542 0.831393 0.68429 0.384801\n", " 2015-10-22 17:22:58 0.851 sec 10 0.269503 2.43383 0.646623 0.400956 0.245293 0.758758 0.685988 0.390229" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Variable Importances:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
variablerelative_importancescaled_importancepercentage
Origin6152.21.00.3
fDayofMonth5583.60.90.3
Dest4203.40.70.2
UniqueCarrier1609.30.30.1
fDayOfWeek1556.20.30.1
Distance1493.00.20.1
fMonth131.70.00.0
" ], "text/plain": [ "variable relative_importance scaled_importance percentage\n", "------------- --------------------- ------------------- ------------\n", "Origin 6152.21 1 0.296788\n", "fDayofMonth 5583.59 0.907575 0.269357\n", "Dest 4203.39 0.683233 0.202775\n", "UniqueCarrier 1609.28 0.261578 0.077633\n", "fDayOfWeek 1556.19 0.252948 0.0750719\n", "Distance 1492.99 0.242675 0.072023\n", "fMonth 131.683 0.0214043 0.00635252" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "rf_no_bal = H2ORandomForestEstimator(seed=12, ntrees=10, max_depth=20, balance_classes=False)\n", "rf_no_bal.train(x=myX, y=myY, training_frame=air_train, validation_frame=air_valid)\n", "rf_no_bal.show()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "drf Model Build Progress: [##################################################] 100%\n", "Model Details\n", "=============\n", "H2ORandomForestEstimator : Distributed RF\n", "Model Key: DRF_model_python_1445557087082_2744\n", "\n", "Model Summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
number_of_treesmodel_size_in_bytesmin_depthmax_depthmean_depthmin_leavesmax_leavesmean_leaves
10.0299144.020.020.020.01750.02460.02168.2
" ], "text/plain": [ " number_of_trees model_size_in_bytes min_depth max_depth mean_depth min_leaves max_leaves mean_leaves\n", "-- ----------------- --------------------- ----------- ----------- ------------ ------------ ------------ -------------\n", " 10 299144 20 20 20 1750 2460 2168.2" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "ModelMetricsBinomial: drf\n", "** Reported on train data. **\n", "\n", "MSE: 0.268874582249\n", "R^2: -0.0754992978501\n", "LogLoss: 2.09200342169\n", "AUC: 0.685292136376\n", "Gini: 0.370584272753\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.538182890839:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
NOYESErrorRate
NO3925.06621.00.6278 (6621.0/10546.0)
YES1574.08952.00.1495 (1574.0/10526.0)
Total5499.015573.00.3889 (8195.0/21072.0)
" ], "text/plain": [ " NO YES Error Rate\n", "----- ---- ----- ------- ----------------\n", "NO 3925 6621 0.6278 (6621.0/10546.0)\n", "YES 1574 8952 0.1495 (1574.0/10526.0)\n", "Total 5499 15573 0.3889 (8195.0/21072.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Maximum Metrics: Maximum metrics at their respective thresholds\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
metricthresholdvalueidx
max f10.50.7226.0
max f20.00.8399.0
max f0point50.80.6124.0
max accuracy0.70.6140.0
max precision0.90.728.0
max absolute_MCC0.70.3151.0
max min_per_class_accuracy0.70.6140.0
" ], "text/plain": [ "metric threshold value idx\n", "-------------------------- ----------- -------- -----\n", "max f1 0.538183 0.686003 226\n", "max f2 0 0.83307 399\n", "max f0point5 0.761166 0.646627 124\n", "max accuracy 0.731244 0.645976 140\n", "max precision 0.939299 0.70679 28\n", "max absolute_MCC 0.707454 0.292387 151\n", "max min_per_class_accuracy 0.731244 0.645069 140" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "ModelMetricsBinomial: drf\n", "** Reported on validation data. **\n", "\n", "MSE: 0.249809873778\n", "R^2: -0.00855364526058\n", "LogLoss: 0.770654128805\n", "AUC: 0.682375448104\n", "Gini: 0.364750896207\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.56328826827:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
NOYESErrorRate
NO822.01426.00.6343 (1426.0/2248.0)
YES367.02359.00.1346 (367.0/2726.0)
Total1189.03785.00.3605 (1793.0/4974.0)
" ], "text/plain": [ " NO YES Error Rate\n", "----- ---- ----- ------- ---------------\n", "NO 822 1426 0.6343 (1426.0/2248.0)\n", "YES 367 2359 0.1346 (367.0/2726.0)\n", "Total 1189 3785 0.3605 (1793.0/4974.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Maximum Metrics: Maximum metrics at their respective thresholds\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
metricthresholdvalueidx
max f10.60.7261.0
max f20.10.9399.0
max f0point50.70.7179.0
max accuracy0.60.6235.0
max precision1.00.86.0
max absolute_MCC0.70.3194.0
max min_per_class_accuracy0.70.6167.0
" ], "text/plain": [ "metric threshold value idx\n", "-------------------------- ----------- -------- -----\n", "max f1 0.563288 0.72462 261\n", "max f2 0.119724 0.85842 399\n", "max f0point5 0.725001 0.671988 179\n", "max accuracy 0.616361 0.644954 235\n", "max precision 0.984071 0.844037 6\n", "max absolute_MCC 0.694824 0.275787 194\n", "max min_per_class_accuracy 0.743618 0.632117 167" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Scoring History:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
timestampdurationnumber_of_treestraining_MSEtraining_loglosstraining_AUCtraining_classification_errorvalidation_MSEvalidation_loglossvalidation_AUCvalidation_classification_error
2015-10-22 17:22:59 0.093 sec1.00.37.30.60.40.37.90.60.5
2015-10-22 17:22:59 0.152 sec2.00.36.80.60.40.33.70.60.4
2015-10-22 17:22:59 0.210 sec3.00.35.90.60.40.32.20.60.4
2015-10-22 17:22:59 0.287 sec4.00.35.20.60.40.31.60.70.4
2015-10-22 17:22:59 0.377 sec5.00.34.30.70.40.31.30.70.4
2015-10-22 17:22:59 0.469 sec6.00.33.70.70.40.31.00.70.4
2015-10-22 17:22:59 0.571 sec7.00.33.20.70.40.30.90.70.4
2015-10-22 17:22:59 0.678 sec8.00.32.80.70.40.30.90.70.4
2015-10-22 17:22:59 0.784 sec9.00.32.40.70.40.20.80.70.4
2015-10-22 17:22:59 0.894 sec10.00.32.10.70.40.20.80.70.4
" ], "text/plain": [ " timestamp duration number_of_trees training_MSE training_logloss training_AUC training_classification_error validation_MSE validation_logloss validation_AUC validation_classification_error\n", "-- ------------------- ---------- ----------------- -------------- ------------------ -------------- ------------------------------- ---------------- -------------------- ---------------- ---------------------------------\n", " 2015-10-22 17:22:59 0.093 sec 1 0.316234 7.2617 0.637354 0.391226 0.329137 7.87255 0.591265 0.45195\n", " 2015-10-22 17:22:59 0.152 sec 2 0.31632 6.78219 0.637055 0.404918 0.287988 3.72898 0.623305 0.397467\n", " 2015-10-22 17:22:59 0.210 sec 3 0.308719 5.88527 0.641048 0.41827 0.26868 2.16698 0.648628 0.405308\n", " 2015-10-22 17:22:59 0.287 sec 4 0.305607 5.19153 0.641915 0.421194 0.263474 1.63783 0.656156 0.40953\n", " 2015-10-22 17:22:59 0.377 sec 5 0.293851 4.31934 0.655025 0.410047 0.258099 1.26669 0.664182 0.385203\n", " 2015-10-22 17:22:59 0.469 sec 6 0.286361 3.66732 0.662667 0.412603 0.254859 0.98441 0.670485 0.385806\n", " 2015-10-22 17:22:59 0.571 sec 7 0.281092 3.23939 0.670316 0.406857 0.253256 0.94761 0.675664 0.382589\n", " 2015-10-22 17:22:59 0.678 sec 8 0.276215 2.80048 0.676027 0.408681 0.252291 0.878195 0.678797 0.38842\n", " 2015-10-22 17:22:59 0.784 sec 9 0.271224 2.40468 0.681395 0.390547 0.249776 0.825156 0.681276 0.372336\n", " 2015-10-22 17:22:59 0.894 sec 10 0.268875 2.092 0.685292 0.388905 0.24981 0.770654 0.682375 0.360474" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Variable Importances:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
variablerelative_importancescaled_importancepercentage
Origin6811.11.00.3
fDayofMonth6129.00.90.3
Dest4860.00.70.2
UniqueCarrier1824.50.30.1
fDayOfWeek1634.10.20.1
Distance1591.50.20.1
fMonth129.60.00.0
" ], "text/plain": [ "variable relative_importance scaled_importance percentage\n", "------------- --------------------- ------------------- ------------\n", "Origin 6811.08 1 0.296394\n", "fDayofMonth 6128.96 0.899851 0.266711\n", "Dest 4860.05 0.71355 0.211492\n", "UniqueCarrier 1824.52 0.267874 0.0793964\n", "fDayOfWeek 1634.07 0.239913 0.0711088\n", "Distance 1591.55 0.23367 0.0692584\n", "fMonth 129.597 0.0190274 0.00563962" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "rf_bal = H2ORandomForestEstimator(seed=12, ntrees=10, max_depth=20, balance_classes=True)\n", "rf_bal.train(x=myX, y=myY, training_frame=air_train, validation_frame=air_valid)\n", "rf_bal.show()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Parse Progress: [##################################################] 100%\n", "Imported /Users/ludirehak/h2o-3/smalldata/airlines/AirlinesTest.csv.zip. Parsed 2,691 rows and 12 cols\n" ] } ], "source": [ "air_test = h2o.import_file(path=_locate(\"smalldata/airlines/AirlinesTest.csv.zip\"))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def model(model_object, test):\n", " # Predict on the test frame and preview the predictions\n", " pred = model_object.predict(test)\n", " pred.head()\n", " # Compute performance metrics on the test frame, then print the confusion matrix, precision, accuracy and AUC\n", " perf = model_object.model_performance(test)\n", " perf.show()\n", " print(perf.confusion_matrix())\n", " print(perf.precision())\n", " print(perf.accuracy())\n", " print(perf.auc())" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "WITHOUT CLASS BALANCING\n", "\n", "H2OFrame with 2691 rows and 3 columns: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
predictYESYESYESYESYESYESNOYESYESYES
NO0.10.00.2250.1750.50.40.60.30.30.4
YES0.91.00.7750.8250.50.60.40.70.70.6
" ], "text/plain": [ "predict YES YES YES YES YES YES NO YES YES YES\n", "--------- ----- --------- ----- ----- -------- -------- -------- -------- -------- -------\n", "NO 0.14 0.0242857 0.225 0.175 0.453293 0.388391 0.598466 0.271046 0.307406 0.42179\n", "YES 0.86 0.975714 0.775 0.825 0.546707 0.611609 0.401534 0.728954 0.692594 0.57821" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "ModelMetricsBinomial: drf\n", "** Reported on test data. **\n", "\n", "MSE: 0.242134967995\n", "R^2: 0.0225448334417\n", "LogLoss: 0.818660036508\n", "AUC: 0.705312795104\n", "Gini: 0.410625590208\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.51742125228:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
NOYESErrorRate
NO377.0840.00.6902 (840.0/1217.0)
YES143.01331.00.097 (143.0/1474.0)
Total520.02171.00.3653 (983.0/2691.0)
" ], "text/plain": [ " NO YES Error Rate\n", "----- ---- ----- ------- --------------\n", "NO 377 840 0.6902 (840.0/1217.0)\n", "YES 143 1331 0.097 (143.0/1474.0)\n", "Total 520 2171 0.3653 (983.0/2691.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Maximum Metrics: Maximum metrics at their respective thresholds\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
metricthresholdvalueidx
max f10.50.7276.0
max f20.20.9381.0
max f0point50.70.7174.0
max accuracy0.70.7186.0
max precision1.00.97.0
max absolute_MCC0.70.3174.0
max min_per_class_accuracy0.70.7162.0
" ], "text/plain": [ "metric threshold value idx\n", "-------------------------- ----------- -------- -----\n", "max f1 0.517421 0.730316 276\n", "max f2 0.247669 0.859932 381\n", "max f0point5 0.716854 0.693575 174\n", "max accuracy 0.693919 0.66518 186\n", "max precision 0.98545 0.85567 7\n", "max absolute_MCC 0.716854 0.322241 174\n", "max min_per_class_accuracy 0.737149 0.654003 162" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.51742125228:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
NOYESErrorRate
NO377.0840.00.6902 (840.0/1217.0)
YES143.01331.00.097 (143.0/1474.0)
Total520.02171.00.3653 (983.0/2691.0)
" ], "text/plain": [ " NO YES Error Rate\n", "----- ---- ----- ------- --------------\n", "NO 377 840 0.6902 (840.0/1217.0)\n", "YES 143 1331 0.097 (143.0/1474.0)\n", "Total 520 2171 0.3653 (983.0/2691.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "[[0.985450211376883, 0.8556701030927835]]\n", "[[0.6939187561627477, 0.6651802303976218]]\n", "0.705312795104\n" ] } ], "source": [ "print(\"\\n\\nWITHOUT CLASS BALANCING\\n\")\n", "model(rf_no_bal, air_test)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "WITH CLASS BALANCING\n", "\n", "H2OFrame with 2691 rows and 3 columns: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
predictYESYESYESYESNONONOYESYESNO
NO0.00.30.10.00.40.50.70.10.30.5
YES1.00.70.91.00.60.50.30.90.70.5
" ], "text/plain": [ "predict YES YES YES YES NO NO NO YES YES NO\n", "--------- --------- -------- --------- --------- -------- -------- -------- --------- -------- --------\n", "NO 0.0116536 0.255432 0.0877282 0.0487275 0.447038 0.491235 0.669041 0.0670805 0.338563 0.484582\n", "YES 0.988346 0.744568 0.912272 0.951272 0.552962 0.508765 0.330959 0.932919 0.661437 0.515418" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "ModelMetricsBinomial: drf\n", "** Reported on test data. **\n", "\n", "MSE: 0.24831550935\n", "R^2: -0.00240489657592\n", "LogLoss: 0.758488823047\n", "AUC: 0.693547371085\n", "Gini: 0.38709474217\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.475092852495:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
NOYESErrorRate
NO269.0948.00.779 (948.0/1217.0)
YES85.01389.00.0577 (85.0/1474.0)
Total354.02337.00.3839 (1033.0/2691.0)
" ], "text/plain": [ " NO YES Error Rate\n", "----- ---- ----- ------- ---------------\n", "NO 269 948 0.779 (948.0/1217.0)\n", "YES 85 1389 0.0577 (85.0/1474.0)\n", "Total 354 2337 0.3839 (1033.0/2691.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Maximum Metrics: Maximum metrics at their respective thresholds\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
metricthresholdvalueidx
max f10.50.7307.0
max f20.30.9379.0
max f0point50.70.7184.0
max accuracy0.70.7210.0
max precision1.00.851.0
max absolute_MCC0.70.3210.0
max min_per_class_accuracy0.70.6164.0
" ], "text/plain": [ "metric threshold value idx\n", "-------------------------- ----------- -------- -----\n", "max f1 0.475093 0.728943 307\n", "max f2 0.256539 0.859284 379\n", "max f0point5 0.7144 0.6802 184\n", "max accuracy 0.667305 0.654032 210\n", "max precision 0.996238 0.85 1\n", "max absolute_MCC 0.667305 0.29437 210\n", "max min_per_class_accuracy 0.749175 0.637634 164" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.475092852495:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
NOYESErrorRate
NO269.0948.00.779 (948.0/1217.0)
YES85.01389.00.0577 (85.0/1474.0)
Total354.02337.00.3839 (1033.0/2691.0)
" ], "text/plain": [ " NO YES Error Rate\n", "----- ---- ----- ------- ---------------\n", "NO 269 948 0.779 (948.0/1217.0)\n", "YES 85 1389 0.0577 (85.0/1474.0)\n", "Total 354 2337 0.3839 (1033.0/2691.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "[[0.9962384300103982, 0.85]]\n", "[[0.6673053431289202, 0.6540319583797845]]\n", "0.693547371085\n" ] } ], "source": [ "print(\"\\n\\nWITH CLASS BALANCING\\n\")\n", "model(rf_bal, air_test)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.5" } }, "nbformat": 4, "nbformat_minor": 0 }