{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#----------------------------------------------------------------------\n",
"# Purpose: Condition an Airline dataset by filtering out NAs where the\n",
"# departure delay in the input dataset is unknown.\n",
"#\n",
"# Then treat any departure delay longer than minutesOfDelayWeTolerate\n",
"# minutes as delayed, and train a GBM to predict that label.\n",
"#----------------------------------------------------------------------"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import h2o\n",
"from h2o.estimators.gbm import H2OGradientBoostingEstimator"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
H2O cluster uptime: | \n",
"19 minutes 25 seconds 953 milliseconds |
\n",
"H2O cluster version: | \n",
"3.5.0.99999 |
\n",
"H2O cluster name: | \n",
"spIdea |
\n",
"H2O cluster total nodes: | \n",
"1 |
\n",
"H2O cluster total memory: | \n",
"12.44 GB |
\n",
"H2O cluster total cores: | \n",
"8 |
\n",
"H2O cluster allowed cores: | \n",
"8 |
\n",
"H2O cluster healthy: | \n",
"True |
\n",
"H2O Connection ip: | \n",
"127.0.0.1 |
\n",
"H2O Connection port: | \n",
"54321 |
"
],
"text/plain": [
"-------------------------- --------------------------------------\n",
"H2O cluster uptime: 19 minutes 25 seconds 953 milliseconds\n",
"H2O cluster version: 3.5.0.99999\n",
"H2O cluster name: spIdea\n",
"H2O cluster total nodes: 1\n",
"H2O cluster total memory: 12.44 GB\n",
"H2O cluster total cores: 8\n",
"H2O cluster allowed cores: 8\n",
"H2O cluster healthy: True\n",
"H2O Connection ip: 127.0.0.1\n",
"H2O Connection port: 54321\n",
"-------------------------- --------------------------------------"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Initialise H2O before any frames are imported or models are built.\n",
"h2o.init()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Parse Progress: [##################################################] 100%\n"
]
}
],
"source": [
"# _locate is a private h2o helper used to find files within the h2o git project directory.\n",
"from h2o.utils.shared_utils import _locate\n",
"\n",
"# Resolve the bundled airlines archive first, then import it as an H2O frame.\n",
"airlines_path = _locate(\"smalldata/airlines/allyears2k_headers.zip\")\n",
"air = h2o.import_file(airlines_path)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Original dataset rows: 43978, columns: 31\n",
"New dataset rows: 42892, columns: 31\n"
]
}
],
"source": [
"# Predictor columns and the name of the synthetic response column.\n",
"x_cols = [\n",
"    \"Month\", \"DayofMonth\", \"DayOfWeek\",\n",
"    \"CRSDepTime\", \"CRSArrTime\", \"UniqueCarrier\",\n",
"    \"CRSElapsedTime\", \"Origin\", \"Dest\", \"Distance\",\n",
"]\n",
"y_col = \"SynthDepDelayed\"\n",
"\n",
"# numCols is reused later as the index of the column appended to the frame.\n",
"numRows, numCols = air.dim\n",
"print(\"Original dataset rows: {0}, columns: {1}\".format(numRows, numCols))\n",
"\n",
"# Keep only the rows whose departure delay is known (isna() == 0).\n",
"hasDepDelay = air[\"DepDelay\"].isna() == 0\n",
"noDepDelayedNAs = air[hasDepDelay]\n",
"print(\"New dataset rows: {0}, columns: {1}\".format(*noDepDelayedNAs.dim))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" Year | Month | DayofMonth | DayOfWeek | DepTime | CRSDepTime | ArrTime | CRSArrTime | UniqueCarrier | FlightNum | TailNum | ActualElapsedTime | CRSElapsedTime | AirTime | ArrDelay | DepDelay | Origin | Dest | Distance | TaxiIn | TaxiOut | Cancelled | CancellationCode | Diverted | CarrierDelay | WeatherDelay | NASDelay | SecurityDelay | LateAircraftDelay | IsArrDelayed | IsDepDelayed | SynthDepDelayed |
\n",
" 1987 | 10 | 14 | 3 | 741 | 730 | 912 | 849 | 5 | 1451 | 3499 | 91 | 79 | nan | 23 | 11 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 1 | 1 | 1 |
\n",
" 1987 | 10 | 15 | 4 | 729 | 730 | 903 | 849 | 5 | 1451 | 3499 | 94 | 79 | nan | 14 | -1 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 1 | 0 | 0 |
\n",
" 1987 | 10 | 17 | 6 | 741 | 730 | 918 | 849 | 5 | 1451 | 3499 | 97 | 79 | nan | 29 | 11 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 1 | 1 | 1 |
\n",
" 1987 | 10 | 18 | 7 | 729 | 730 | 847 | 849 | 5 | 1451 | 3499 | 78 | 79 | nan | -2 | -1 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 0 | 0 | 0 |
\n",
" 1987 | 10 | 19 | 1 | 749 | 730 | 922 | 849 | 5 | 1451 | 3499 | 93 | 79 | nan | 33 | 19 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 1 | 1 | 1 |
\n",
" 1987 | 10 | 21 | 3 | 728 | 730 | 848 | 849 | 5 | 1451 | 3499 | 80 | 79 | nan | -1 | -2 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 0 | 0 | 0 |
\n",
" 1987 | 10 | 22 | 4 | 728 | 730 | 852 | 849 | 5 | 1451 | 3499 | 84 | 79 | nan | 3 | -2 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 1 | 0 | 0 |
\n",
" 1987 | 10 | 23 | 5 | 731 | 730 | 902 | 849 | 5 | 1451 | 3499 | 91 | 79 | nan | 13 | 1 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 1 | 1 | 1 |
\n",
" 1987 | 10 | 24 | 6 | 744 | 730 | 908 | 849 | 5 | 1451 | 3499 | 84 | 79 | nan | 19 | 14 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 1 | 1 | 1 |
\n",
" 1987 | 10 | 25 | 7 | 729 | 730 | 851 | 849 | 5 | 1451 | 3499 | 82 | 79 | nan | 2 | -1 | 106 | 118 | 447 | nan | nan | 0 | 3 | 0 | nan | nan | nan | nan | nan | 1 | 0 | 0 |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": []
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build the synthetic response: a flight counts as delayed only when its\n",
"# departure delay exceeds minutesOfDelayWeTolerate.\n",
"minutesOfDelayWeTolerate = 15\n",
"\n",
"# Append the thresholded DepDelay comparison as a new last column.\n",
"noDepDelayedNAs = noDepDelayedNAs.cbind(noDepDelayedNAs[\"DepDelay\"] > minutesOfDelayWeTolerate)\n",
"\n",
"# The appended column sits at zero-based index numCols (the original frame had\n",
"# numCols columns). Index numCols-1 is the last original column, IsDepDelayed;\n",
"# converting that one instead would overwrite the thresholded label with it.\n",
"noDepDelayedNAs[numCols] = noDepDelayedNAs[numCols].asfactor()\n",
"noDepDelayedNAs.set_name(numCols, y_col)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"gbm Model Build Progress: [##################################################] 100%\n",
"Model Details\n",
"=============\n",
"H2OGradientBoostingEstimator : Gradient Boosting Machine\n",
"Model Key: GBM_model_python_1445841486633_37\n",
"\n",
"Model Summary:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"number_of_trees | \n",
"model_size_in_bytes | \n",
"min_depth | \n",
"max_depth | \n",
"mean_depth | \n",
"min_leaves | \n",
"max_leaves | \n",
"mean_leaves |
\n",
" | \n",
"50.0 | \n",
"34338.0 | \n",
"5.0 | \n",
"5.0 | \n",
"5.0 | \n",
"18.0 | \n",
"32.0 | \n",
"28.62 |
"
],
"text/plain": [
" number_of_trees model_size_in_bytes min_depth max_depth mean_depth min_leaves max_leaves mean_leaves\n",
"-- ----------------- --------------------- ----------- ----------- ------------ ------------ ------------ -------------\n",
" 50 34338 5 5 5 18 32 28.62"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"ModelMetricsBinomial: gbm\n",
"** Reported on train data. **\n",
"\n",
"MSE: 0.191672191035\n",
"R^2: 0.232789986813\n",
"LogLoss: 0.565710073073\n",
"AUC: 0.785428554449\n",
"Gini: 0.570857108897\n",
"\n",
"Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.412557029006:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"NO | \n",
"YES | \n",
"Error | \n",
"Rate |
\n",
"NO | \n",
"11180.0 | \n",
"9707.0 | \n",
"0.4647 | \n",
" (9707.0/20887.0) |
\n",
"YES | \n",
"3402.0 | \n",
"18603.0 | \n",
"0.1546 | \n",
" (3402.0/22005.0) |
\n",
"Total | \n",
"14582.0 | \n",
"28310.0 | \n",
"0.3056 | \n",
" (13109.0/42892.0) |
"
],
"text/plain": [
" NO YES Error Rate\n",
"----- ----- ----- ------- -----------------\n",
"NO 11180 9707 0.4647 (9707.0/20887.0)\n",
"YES 3402 18603 0.1546 (3402.0/22005.0)\n",
"Total 14582 28310 0.3056 (13109.0/42892.0)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Maximum Metrics: Maximum metrics at their respective thresholds\n",
"\n"
]
},
{
"data": {
"text/html": [
"metric | \n",
"threshold | \n",
"value | \n",
"idx |
\n",
"max f1 | \n",
"0.4 | \n",
"0.7 | \n",
"259.0 |
\n",
"max f2 | \n",
"0.2 | \n",
"0.8 | \n",
"347.0 |
\n",
"max f0point5 | \n",
"0.6 | \n",
"0.7 | \n",
"180.0 |
\n",
"max accuracy | \n",
"0.5 | \n",
"0.7 | \n",
"213.0 |
\n",
"max precision | \n",
"1.0 | \n",
"1.0 | \n",
"0.0 |
\n",
"max absolute_MCC | \n",
"0.5 | \n",
"0.4 | \n",
"213.0 |
\n",
"max min_per_class_accuracy | \n",
"0.5 | \n",
"0.7 | \n",
"209.0 |
"
],
"text/plain": [
"metric threshold value idx\n",
"-------------------------- ----------- -------- -----\n",
"max f1 0.412557 0.739461 259\n",
"max f2 0.236201 0.8477 347\n",
"max f0point5 0.559094 0.727325 180\n",
"max accuracy 0.500453 0.711578 213\n",
"max precision 0.956629 1 0\n",
"max absolute_MCC 0.500453 0.422602 213\n",
"max min_per_class_accuracy 0.508533 0.710825 209"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Scoring History:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"timestamp | \n",
"duration | \n",
"number_of_trees | \n",
"training_MSE | \n",
"training_logloss | \n",
"training_AUC | \n",
"training_classification_error |
\n",
" | \n",
"2015-10-25 23:57:36 | \n",
" 0.032 sec | \n",
"1.0 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-25 23:57:36 | \n",
" 0.059 sec | \n",
"2.0 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-25 23:57:36 | \n",
" 0.089 sec | \n",
"3.0 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-25 23:57:36 | \n",
" 0.117 sec | \n",
"4.0 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-25 23:57:36 | \n",
" 0.152 sec | \n",
"5.0 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 |
\n",
"--- | \n",
"--- | \n",
"--- | \n",
"--- | \n",
"--- | \n",
"--- | \n",
"--- | \n",
"--- |
\n",
" | \n",
"2015-10-25 23:57:38 | \n",
" 2.267 sec | \n",
"46.0 | \n",
"0.2 | \n",
"0.6 | \n",
"0.8 | \n",
"0.3 |
\n",
" | \n",
"2015-10-25 23:57:38 | \n",
" 2.323 sec | \n",
"47.0 | \n",
"0.2 | \n",
"0.6 | \n",
"0.8 | \n",
"0.3 |
\n",
" | \n",
"2015-10-25 23:57:38 | \n",
" 2.378 sec | \n",
"48.0 | \n",
"0.2 | \n",
"0.6 | \n",
"0.8 | \n",
"0.3 |
\n",
" | \n",
"2015-10-25 23:57:38 | \n",
" 2.435 sec | \n",
"49.0 | \n",
"0.2 | \n",
"0.6 | \n",
"0.8 | \n",
"0.3 |
\n",
" | \n",
"2015-10-25 23:57:38 | \n",
" 2.493 sec | \n",
"50.0 | \n",
"0.2 | \n",
"0.6 | \n",
"0.8 | \n",
"0.3 |
"
],
"text/plain": [
" timestamp duration number_of_trees training_MSE training_logloss training_AUC training_classification_error\n",
"--- ------------------- ---------- ----------------- -------------- ------------------ -------------- -------------------------------\n",
" 2015-10-25 23:57:36 0.032 sec 1.0 0.244362718818 0.681856365041 0.692503203228 0.413433740558\n",
" 2015-10-25 23:57:36 0.059 sec 2.0 0.239916651379 0.672915590687 0.700446640048 0.410845845379\n",
" 2015-10-25 23:57:36 0.089 sec 3.0 0.235500532513 0.663968419721 0.712157594375 0.391051944419\n",
" 2015-10-25 23:57:36 0.117 sec 4.0 0.231804609029 0.656396477796 0.717626212056 0.387158444465\n",
" 2015-10-25 23:57:36 0.152 sec 5.0 0.228800636002 0.650232244088 0.72532125588 0.377040007461\n",
"--- --- --- --- --- --- --- ---\n",
" 2015-10-25 23:57:38 2.267 sec 46.0 0.192702810811 0.568263251703 0.783306118911 0.305721346638\n",
" 2015-10-25 23:57:38 2.323 sec 47.0 0.192356155909 0.567421568826 0.783968341044 0.304695514315\n",
" 2015-10-25 23:57:38 2.378 sec 48.0 0.192132888018 0.566844161055 0.784382761975 0.302247505362\n",
" 2015-10-25 23:57:38 2.435 sec 49.0 0.191914363235 0.566306525033 0.784868956573 0.306327520284\n",
" 2015-10-25 23:57:38 2.493 sec 50.0 0.191672191035 0.565710073073 0.785428554449 0.305628089154"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Variable Importances:\n"
]
},
{
"data": {
"text/html": [
"variable | \n",
"relative_importance | \n",
"scaled_importance | \n",
"percentage |
\n",
"Origin | \n",
"6877.3 | \n",
"1.0 | \n",
"0.4 |
\n",
"Dest | \n",
"4551.0 | \n",
"0.7 | \n",
"0.3 |
\n",
"DayofMonth | \n",
"2025.6 | \n",
"0.3 | \n",
"0.1 |
\n",
"UniqueCarrier | \n",
"1279.5 | \n",
"0.2 | \n",
"0.1 |
\n",
"CRSArrTime | \n",
"724.8 | \n",
"0.1 | \n",
"0.0 |
\n",
"CRSDepTime | \n",
"636.9 | \n",
"0.1 | \n",
"0.0 |
\n",
"DayOfWeek | \n",
"408.2 | \n",
"0.1 | \n",
"0.0 |
\n",
"CRSElapsedTime | \n",
"118.8 | \n",
"0.0 | \n",
"0.0 |
\n",
"Month | \n",
"73.3 | \n",
"0.0 | \n",
"0.0 |
\n",
"Distance | \n",
"31.1 | \n",
"0.0 | \n",
"0.0 |
"
],
"text/plain": [
"variable relative_importance scaled_importance percentage\n",
"-------------- --------------------- ------------------- ------------\n",
"Origin 6877.27 1 0.411159\n",
"Dest 4551.03 0.66175 0.272085\n",
"DayofMonth 2025.55 0.294529 0.121098\n",
"UniqueCarrier 1279.5 0.186048 0.0764954\n",
"CRSArrTime 724.814 0.105393 0.0433331\n",
"CRSDepTime 636.901 0.0926096 0.0380773\n",
"DayOfWeek 408.238 0.0593605 0.0244066\n",
"CRSElapsedTime 118.821 0.0172773 0.00710371\n",
"Month 73.2614 0.0106527 0.00437995\n",
"Distance 31.1477 0.00452908 0.00186217"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Fit a GBM with a Bernoulli distribution on the filtered frame, using x_cols\n",
"# as predictors and y_col as the response, then print the model summary.\n",
"gbm = H2OGradientBoostingEstimator(distribution=\"bernoulli\")\n",
"gbm.train(\n",
"    x=x_cols,\n",
"    y=y_col,\n",
"    training_frame=noDepDelayedNAs,\n",
")\n",
"gbm.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}