{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#----------------------------------------------------------------------\n", "# Purpose: Condition an Airline dataset by filtering out NAs where the\n", "# departure delay in the input dataset is unknown.\n", "#\n", "# Then treat anything longer than minutesOfDelayWeTolerate\n", "# as delayed.\n", "#----------------------------------------------------------------------" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import h2o\n", "from h2o.estimators.gbm import H2OGradientBoostingEstimator" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
H2O cluster uptime: 19 minutes 25 seconds 953 milliseconds
H2O cluster version: 3.5.0.99999
H2O cluster name: spIdea
H2O cluster total nodes: 1
H2O cluster total memory: 12.44 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321
" ], "text/plain": [ "-------------------------- --------------------------------------\n", "H2O cluster uptime: 19 minutes 25 seconds 953 milliseconds\n", "H2O cluster version: 3.5.0.99999\n", "H2O cluster name: spIdea\n", "H2O cluster total nodes: 1\n", "H2O cluster total memory: 12.44 GB\n", "H2O cluster total cores: 8\n", "H2O cluster allowed cores: 8\n", "H2O cluster healthy: True\n", "H2O Connection ip: 127.0.0.1\n", "H2O Connection port: 54321\n", "-------------------------- --------------------------------------" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "h2o.init()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Parse Progress: [##################################################] 100%\n" ] } ], "source": [ "from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.\n", "\n", "air = h2o.import_file(_locate(\"smalldata/airlines/allyears2k_headers.zip\"))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original dataset rows: 43978, columns: 31\n", "New dataset rows: 42892, columns: 31\n" ] } ], "source": [ "numRows, numCols = air.dim\n", "print(\"Original dataset rows: {0}, columns: {1}\".format(numRows, numCols))\n", "\n", "x_cols = [\"Month\", \"DayofMonth\", \"DayOfWeek\", \"CRSDepTime\", \"CRSArrTime\", \"UniqueCarrier\", \"CRSElapsedTime\", \"Origin\", \"Dest\", \"Distance\"]\n", "y_col = \"SynthDepDelayed\"\n", "\n", "noDepDelayedNAs = air[air[\"DepDelay\"].isna() == 0]\n", "rows, cols = noDepDelayedNAs.dim\n", "print(\"New dataset rows: {0}, columns: {1}\".format(rows, cols))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime UniqueCarrier FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay IsArrDelayed IsDepDelayed SynthDepDelayed
1987 10 14 3 741 730 912 849 5 1451 3499 91 79 nan 23 11 106 118 447 nan nan 0 3 0 nan nan nan nan nan 1 1 1
1987 10 15 4 729 730 903 849 5 1451 3499 94 79 nan 14 -1 106 118 447 nan nan 0 3 0 nan nan nan nan nan 1 0 0
1987 10 17 6 741 730 918 849 5 1451 3499 97 79 nan 29 11 106 118 447 nan nan 0 3 0 nan nan nan nan nan 1 1 1
1987 10 18 7 729 730 847 849 5 1451 3499 78 79 nan -2 -1 106 118 447 nan nan 0 3 0 nan nan nan nan nan 0 0 0
1987 10 19 1 749 730 922 849 5 1451 3499 93 79 nan 33 19 106 118 447 nan nan 0 3 0 nan nan nan nan nan 1 1 1
1987 10 21 3 728 730 848 849 5 1451 3499 80 79 nan -1 -2 106 118 447 nan nan 0 3 0 nan nan nan nan nan 0 0 0
1987 10 22 4 728 730 852 849 5 1451 3499 84 79 nan 3 -2 106 118 447 nan nan 0 3 0 nan nan nan nan nan 1 0 0
1987 10 23 5 731 730 902 849 5 1451 3499 91 79 nan 13 1 106 118 447 nan nan 0 3 0 nan nan nan nan nan 1 1 1
1987 10 24 6 744 730 908 849 5 1451 3499 84 79 nan 19 14 106 118 447 nan nan 0 3 0 nan nan nan nan nan 1 1 1
1987 10 25 7 729 730 851 849 5 1451 3499 82 79 nan 2 -1 106 118 447 nan nan 0 3 0 nan nan nan nan nan 1 0 0
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "minutesOfDelayWeTolerate = 15\n", "noDepDelayedNAs = noDepDelayedNAs.cbind(noDepDelayedNAs[\"DepDelay\"] > minutesOfDelayWeTolerate)\n", "noDepDelayedNAs[numCols] = noDepDelayedNAs[numCols-1].asfactor()\n", "noDepDelayedNAs.set_name(numCols,y_col)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "gbm Model Build Progress: [##################################################] 100%\n", "Model Details\n", "=============\n", "H2OGradientBoostingEstimator : Gradient Boosting Machine\n", "Model Key: GBM_model_python_1445841486633_37\n", "\n", "Model Summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
number_of_treesmodel_size_in_bytesmin_depthmax_depthmean_depthmin_leavesmax_leavesmean_leaves
50.034338.05.05.05.018.032.028.62
" ], "text/plain": [ " number_of_trees model_size_in_bytes min_depth max_depth mean_depth min_leaves max_leaves mean_leaves\n", "-- ----------------- --------------------- ----------- ----------- ------------ ------------ ------------ -------------\n", " 50 34338 5 5 5 18 32 28.62" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "ModelMetricsBinomial: gbm\n", "** Reported on train data. **\n", "\n", "MSE: 0.191672191035\n", "R^2: 0.232789986813\n", "LogLoss: 0.565710073073\n", "AUC: 0.785428554449\n", "Gini: 0.570857108897\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.412557029006:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
NOYESErrorRate
NO11180.09707.00.4647 (9707.0/20887.0)
YES3402.018603.00.1546 (3402.0/22005.0)
Total14582.028310.00.3056 (13109.0/42892.0)
" ], "text/plain": [ " NO YES Error Rate\n", "----- ----- ----- ------- -----------------\n", "NO 11180 9707 0.4647 (9707.0/20887.0)\n", "YES 3402 18603 0.1546 (3402.0/22005.0)\n", "Total 14582 28310 0.3056 (13109.0/42892.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Maximum Metrics: Maximum metrics at their respective thresholds\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
metricthresholdvalueidx
max f10.40.7259.0
max f20.20.8347.0
max f0point50.60.7180.0
max accuracy0.50.7213.0
max precision1.01.00.0
max absolute_MCC0.50.4213.0
max min_per_class_accuracy0.50.7209.0
" ], "text/plain": [ "metric threshold value idx\n", "-------------------------- ----------- -------- -----\n", "max f1 0.412557 0.739461 259\n", "max f2 0.236201 0.8477 347\n", "max f0point5 0.559094 0.727325 180\n", "max accuracy 0.500453 0.711578 213\n", "max precision 0.956629 1 0\n", "max absolute_MCC 0.500453 0.422602 213\n", "max min_per_class_accuracy 0.508533 0.710825 209" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Scoring History:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
timestampdurationnumber_of_treestraining_MSEtraining_loglosstraining_AUCtraining_classification_error
2015-10-25 23:57:36 0.032 sec1.00.20.70.70.4
2015-10-25 23:57:36 0.059 sec2.00.20.70.70.4
2015-10-25 23:57:36 0.089 sec3.00.20.70.70.4
2015-10-25 23:57:36 0.117 sec4.00.20.70.70.4
2015-10-25 23:57:36 0.152 sec5.00.20.70.70.4
------------------------
2015-10-25 23:57:38 2.267 sec46.00.20.60.80.3
2015-10-25 23:57:38 2.323 sec47.00.20.60.80.3
2015-10-25 23:57:38 2.378 sec48.00.20.60.80.3
2015-10-25 23:57:38 2.435 sec49.00.20.60.80.3
2015-10-25 23:57:38 2.493 sec50.00.20.60.80.3
" ], "text/plain": [ " timestamp duration number_of_trees training_MSE training_logloss training_AUC training_classification_error\n", "--- ------------------- ---------- ----------------- -------------- ------------------ -------------- -------------------------------\n", " 2015-10-25 23:57:36 0.032 sec 1.0 0.244362718818 0.681856365041 0.692503203228 0.413433740558\n", " 2015-10-25 23:57:36 0.059 sec 2.0 0.239916651379 0.672915590687 0.700446640048 0.410845845379\n", " 2015-10-25 23:57:36 0.089 sec 3.0 0.235500532513 0.663968419721 0.712157594375 0.391051944419\n", " 2015-10-25 23:57:36 0.117 sec 4.0 0.231804609029 0.656396477796 0.717626212056 0.387158444465\n", " 2015-10-25 23:57:36 0.152 sec 5.0 0.228800636002 0.650232244088 0.72532125588 0.377040007461\n", "--- --- --- --- --- --- --- ---\n", " 2015-10-25 23:57:38 2.267 sec 46.0 0.192702810811 0.568263251703 0.783306118911 0.305721346638\n", " 2015-10-25 23:57:38 2.323 sec 47.0 0.192356155909 0.567421568826 0.783968341044 0.304695514315\n", " 2015-10-25 23:57:38 2.378 sec 48.0 0.192132888018 0.566844161055 0.784382761975 0.302247505362\n", " 2015-10-25 23:57:38 2.435 sec 49.0 0.191914363235 0.566306525033 0.784868956573 0.306327520284\n", " 2015-10-25 23:57:38 2.493 sec 50.0 0.191672191035 0.565710073073 0.785428554449 0.305628089154" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Variable Importances:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
variablerelative_importancescaled_importancepercentage
Origin6877.31.00.4
Dest4551.00.70.3
DayofMonth2025.60.30.1
UniqueCarrier1279.50.20.1
CRSArrTime724.80.10.0
CRSDepTime636.90.10.0
DayOfWeek408.20.10.0
CRSElapsedTime118.80.00.0
Month73.30.00.0
Distance31.10.00.0
" ], "text/plain": [ "variable relative_importance scaled_importance percentage\n", "-------------- --------------------- ------------------- ------------\n", "Origin 6877.27 1 0.411159\n", "Dest 4551.03 0.66175 0.272085\n", "DayofMonth 2025.55 0.294529 0.121098\n", "UniqueCarrier 1279.5 0.186048 0.0764954\n", "CRSArrTime 724.814 0.105393 0.0433331\n", "CRSDepTime 636.901 0.0926096 0.0380773\n", "DayOfWeek 408.238 0.0593605 0.0244066\n", "CRSElapsedTime 118.821 0.0172773 0.00710371\n", "Month 73.2614 0.0106527 0.00437995\n", "Distance 31.1477 0.00452908 0.00186217" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "gbm = H2OGradientBoostingEstimator(distribution=\"bernoulli\")\n", "gbm.train(x=x_cols, y=y_col, training_frame = noDepDelayedNAs)\n", "gbm.show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }