{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
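"# This is a demo of H2O's Distributed Random Forest (DRF) with and without class balancing\n",
"# It imports the airlines data set and splits it into training and validation frames\n",
"# Then, it trains two random forests (balance_classes=False / True) and scores both on a test set\n",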
"import h2o\n",
"from h2o.estimators.random_forest import H2ORandomForestEstimator\n",
"from h2o.utils.shared_utils import _locate  # private function. used to find files within h2o git project directory."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: Version mismatch. H2O is version 3.5.0.99999, but the python package is version UNKNOWN.\n"
]
},
{
"data": {
"text/html": [
"
H2O cluster uptime: | \n",
"44 minutes 50 seconds 74 milliseconds |
\n",
"H2O cluster version: | \n",
"3.5.0.99999 |
\n",
"H2O cluster name: | \n",
"ludirehak |
\n",
"H2O cluster total nodes: | \n",
"1 |
\n",
"H2O cluster total memory: | \n",
"3.56 GB |
\n",
"H2O cluster total cores: | \n",
"8 |
\n",
"H2O cluster allowed cores: | \n",
"8 |
\n",
"H2O cluster healthy: | \n",
"True |
\n",
"H2O Connection ip: | \n",
"127.0.0.1 |
\n",
"H2O Connection port: | \n",
"54321 |
"
],
"text/plain": [
"-------------------------- -------------------------------------\n",
"H2O cluster uptime: 44 minutes 50 seconds 74 milliseconds\n",
"H2O cluster version: 3.5.0.99999\n",
"H2O cluster name: ludirehak\n",
"H2O cluster total nodes: 1\n",
"H2O cluster total memory: 3.56 GB\n",
"H2O cluster total cores: 8\n",
"H2O cluster allowed cores: 8\n",
"H2O cluster healthy: True\n",
"H2O Connection ip: 127.0.0.1\n",
"H2O Connection port: 54321\n",
"-------------------------- -------------------------------------"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"h2o.init()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Parse Progress: [##################################################] 100%\n",
"Uploaded pya01a74e5-0aa6-4ef0-ae1a-0d3fe860eee9 into cluster with 24,421 rows and 12 cols\n"
]
}
],
"source": [
"# _locate is imported with the other dependencies in the first cell;\n",
"# it is used here to find the data file within the h2o git project directory.\n",
"air = h2o.upload_file(path=_locate(\"smalldata/airlines/AirlinesTrain.csv.zip\"))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# One uniform random draw per row decides which side of the split the row lands on.\n",
"# The seed is fixed so that the split (and every metric below) is reproducible on re-run.\n",
"split_draw = air[0].runif(seed=12)\n",
"# Rows with a draw below 0.8 go to training, the rest to validation.\n",
"air_train = air[split_draw < 0.8]\n",
"air_valid = air[split_draw >= 0.8]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Predictor columns passed as x to both random forests.\n",
"myX = [\n",
"    \"Origin\",\n",
"    \"Dest\",\n",
"    \"Distance\",\n",
"    \"UniqueCarrier\",\n",
"    \"fMonth\",\n",
"    \"fDayofMonth\",\n",
"    \"fDayOfWeek\"\n",
"]\n",
"# Response column passed as y to both random forests.\n",
"myY = \"IsDepDelayed\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"drf Model Build Progress: [##################################################] 100%\n",
"Model Details\n",
"=============\n",
"H2ORandomForestEstimator : Distributed RF\n",
"Model Key: DRF_model_python_1445557087082_2742\n",
"\n",
"Model Summary:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"number_of_trees | \n",
"model_size_in_bytes | \n",
"min_depth | \n",
"max_depth | \n",
"mean_depth | \n",
"min_leaves | \n",
"max_leaves | \n",
"mean_leaves |
\n",
" | \n",
"10.0 | \n",
"287650.0 | \n",
"20.0 | \n",
"20.0 | \n",
"20.0 | \n",
"1664.0 | \n",
"2418.0 | \n",
"2103.5 |
"
],
"text/plain": [
" number_of_trees model_size_in_bytes min_depth max_depth mean_depth min_leaves max_leaves mean_leaves\n",
"-- ----------------- --------------------- ----------- ----------- ------------ ------------ ------------ -------------\n",
" 10 287650 20 20 20 1664 2418 2103.5"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"ModelMetricsBinomial: drf\n",
"** Reported on train data. **\n",
"\n",
"MSE: 0.269503006052\n",
"R^2: -0.0873991649123\n",
"LogLoss: 2.43382549553\n",
"AUC: 0.646622642412\n",
"Gini: 0.293245284825\n",
"\n",
"Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.402941766395:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"NO | \n",
"YES | \n",
"Error | \n",
"Rate |
\n",
"NO | \n",
"1948.0 | \n",
"6780.0 | \n",
"0.7768 | \n",
" (6780.0/8728.0) |
\n",
"YES | \n",
"936.0 | \n",
"9580.0 | \n",
"0.089 | \n",
" (936.0/10516.0) |
\n",
"Total | \n",
"2884.0 | \n",
"16360.0 | \n",
"0.401 | \n",
" (7716.0/19244.0) |
"
],
"text/plain": [
" NO YES Error Rate\n",
"----- ---- ----- ------- ----------------\n",
"NO 1948 6780 0.7768 (6780.0/8728.0)\n",
"YES 936 9580 0.089 (936.0/10516.0)\n",
"Total 2884 16360 0.401 (7716.0/19244.0)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Maximum Metrics: Maximum metrics at their respective thresholds\n",
"\n"
]
},
{
"data": {
"text/html": [
"metric | \n",
"threshold | \n",
"value | \n",
"idx |
\n",
"max f1 | \n",
"0.4 | \n",
"0.7 | \n",
"299.0 |
\n",
"max f2 | \n",
"0.0 | \n",
"0.9 | \n",
"399.0 |
\n",
"max f0point5 | \n",
"0.6 | \n",
"0.7 | \n",
"190.0 |
\n",
"max accuracy | \n",
"0.6 | \n",
"0.6 | \n",
"193.0 |
\n",
"max precision | \n",
"0.9 | \n",
"0.7 | \n",
"30.0 |
\n",
"max absolute_MCC | \n",
"0.6 | \n",
"0.2 | \n",
"190.0 |
\n",
"max min_per_class_accuracy | \n",
"0.7 | \n",
"0.6 | \n",
"140.0 |
"
],
"text/plain": [
"metric threshold value idx\n",
"-------------------------- ----------- -------- -----\n",
"max f1 0.402942 0.712904 299\n",
"max f2 0 0.857637 399\n",
"max f0point5 0.649503 0.653173 190\n",
"max accuracy 0.643019 0.624662 193\n",
"max precision 0.938886 0.697052 30\n",
"max absolute_MCC 0.649503 0.234208 190\n",
"max min_per_class_accuracy 0.737524 0.611449 140"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"ModelMetricsBinomial: drf\n",
"** Reported on validation data. **\n",
"\n",
"MSE: 0.245293478794\n",
"R^2: 0.00968032826017\n",
"LogLoss: 0.758757679035\n",
"AUC: 0.685987609758\n",
"Gini: 0.371975219515\n",
"\n",
"Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.42132409513:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"NO | \n",
"YES | \n",
"Error | \n",
"Rate |
\n",
"NO | \n",
"467.0 | \n",
"1781.0 | \n",
"0.7923 | \n",
" (1781.0/2248.0) |
\n",
"YES | \n",
"160.0 | \n",
"2566.0 | \n",
"0.0587 | \n",
" (160.0/2726.0) |
\n",
"Total | \n",
"627.0 | \n",
"4347.0 | \n",
"0.3902 | \n",
" (1941.0/4974.0) |
"
],
"text/plain": [
" NO YES Error Rate\n",
"----- ---- ----- ------- ---------------\n",
"NO 467 1781 0.7923 (1781.0/2248.0)\n",
"YES 160 2566 0.0587 (160.0/2726.0)\n",
"Total 627 4347 0.3902 (1941.0/4974.0)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Maximum Metrics: Maximum metrics at their respective thresholds\n",
"\n"
]
},
{
"data": {
"text/html": [
"metric | \n",
"threshold | \n",
"value | \n",
"idx |
\n",
"max f1 | \n",
"0.4 | \n",
"0.7 | \n",
"315.0 |
\n",
"max f2 | \n",
"0.2 | \n",
"0.9 | \n",
"396.0 |
\n",
"max f0point5 | \n",
"0.7 | \n",
"0.7 | \n",
"174.0 |
\n",
"max accuracy | \n",
"0.7 | \n",
"0.6 | \n",
"200.0 |
\n",
"max precision | \n",
"1.0 | \n",
"0.9 | \n",
"0.0 |
\n",
"max absolute_MCC | \n",
"0.7 | \n",
"0.3 | \n",
"174.0 |
\n",
"max min_per_class_accuracy | \n",
"0.7 | \n",
"0.6 | \n",
"165.0 |
"
],
"text/plain": [
"metric threshold value idx\n",
"-------------------------- ----------- -------- -----\n",
"max f1 0.421324 0.725576 315\n",
"max f2 0.150007 0.858637 396\n",
"max f0point5 0.711122 0.674532 174\n",
"max accuracy 0.668563 0.645758 200\n",
"max precision 1 0.907895 0\n",
"max absolute_MCC 0.711122 0.279686 174\n",
"max min_per_class_accuracy 0.726159 0.63573 165"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Scoring History:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"timestamp | \n",
"duration | \n",
"number_of_trees | \n",
"training_MSE | \n",
"training_logloss | \n",
"training_AUC | \n",
"training_classification_error | \n",
"validation_MSE | \n",
"validation_logloss | \n",
"validation_AUC | \n",
"validation_classification_error |
\n",
" | \n",
"2015-10-22 17:22:58 | \n",
" 0.074 sec | \n",
"1.0 | \n",
"0.3 | \n",
"8.4 | \n",
"0.6 | \n",
"0.4 | \n",
"0.3 | \n",
"8.1 | \n",
"0.6 | \n",
"0.5 |
\n",
" | \n",
"2015-10-22 17:22:58 | \n",
" 0.163 sec | \n",
"2.0 | \n",
"0.3 | \n",
"7.4 | \n",
"0.6 | \n",
"0.4 | \n",
"0.3 | \n",
"4.0 | \n",
"0.6 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 17:22:58 | \n",
" 0.245 sec | \n",
"3.0 | \n",
"0.3 | \n",
"6.5 | \n",
"0.6 | \n",
"0.4 | \n",
"0.3 | \n",
"2.6 | \n",
"0.6 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 17:22:58 | \n",
" 0.311 sec | \n",
"4.0 | \n",
"0.3 | \n",
"5.6 | \n",
"0.6 | \n",
"0.5 | \n",
"0.3 | \n",
"1.9 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 17:22:58 | \n",
" 0.391 sec | \n",
"5.0 | \n",
"0.3 | \n",
"4.8 | \n",
"0.6 | \n",
"0.4 | \n",
"0.3 | \n",
"1.4 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 17:22:58 | \n",
" 0.480 sec | \n",
"6.0 | \n",
"0.3 | \n",
"4.0 | \n",
"0.6 | \n",
"0.4 | \n",
"0.3 | \n",
"1.1 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 17:22:58 | \n",
" 0.565 sec | \n",
"7.0 | \n",
"0.3 | \n",
"3.6 | \n",
"0.6 | \n",
"0.4 | \n",
"0.2 | \n",
"1.0 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 17:22:58 | \n",
" 0.659 sec | \n",
"8.0 | \n",
"0.3 | \n",
"3.1 | \n",
"0.6 | \n",
"0.4 | \n",
"0.2 | \n",
"0.9 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 17:22:58 | \n",
" 0.751 sec | \n",
"9.0 | \n",
"0.3 | \n",
"2.7 | \n",
"0.6 | \n",
"0.4 | \n",
"0.2 | \n",
"0.8 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 17:22:58 | \n",
" 0.851 sec | \n",
"10.0 | \n",
"0.3 | \n",
"2.4 | \n",
"0.6 | \n",
"0.4 | \n",
"0.2 | \n",
"0.8 | \n",
"0.7 | \n",
"0.4 |
"
],
"text/plain": [
" timestamp duration number_of_trees training_MSE training_logloss training_AUC training_classification_error validation_MSE validation_logloss validation_AUC validation_classification_error\n",
"-- ------------------- ---------- ----------------- -------------- ------------------ -------------- ------------------------------- ---------------- -------------------- ---------------- ---------------------------------\n",
" 2015-10-22 17:22:58 0.074 sec 1 0.336005 8.38071 0.593241 0.447603 0.328738 8.12159 0.601799 0.45195\n",
" 2015-10-22 17:22:58 0.163 sec 2 0.324066 7.38312 0.593177 0.448151 0.284953 3.9759 0.623903 0.400483\n",
" 2015-10-22 17:22:58 0.245 sec 3 0.313138 6.50304 0.604313 0.449285 0.271995 2.58926 0.637956 0.414154\n",
" 2015-10-22 17:22:58 0.311 sec 4 0.303166 5.59741 0.612553 0.4509 0.262972 1.87702 0.651933 0.415963\n",
" 2015-10-22 17:22:58 0.391 sec 5 0.293768 4.79832 0.621609 0.423103 0.256723 1.38846 0.662515 0.392843\n",
" 2015-10-22 17:22:58 0.480 sec 6 0.285971 4.01094 0.629809 0.412818 0.251944 1.11198 0.67238 0.388219\n",
" 2015-10-22 17:22:58 0.565 sec 7 0.28125 3.55467 0.636323 0.399323 0.249657 0.991554 0.67852 0.382589\n",
" 2015-10-22 17:22:58 0.659 sec 8 0.277031 3.11835 0.639363 0.399567 0.246953 0.880982 0.682305 0.386811\n",
" 2015-10-22 17:22:58 0.751 sec 9 0.271668 2.73216 0.645331 0.39977 0.245542 0.831393 0.68429 0.384801\n",
" 2015-10-22 17:22:58 0.851 sec 10 0.269503 2.43383 0.646623 0.400956 0.245293 0.758758 0.685988 0.390229"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Variable Importances:\n"
]
},
{
"data": {
"text/html": [
"variable | \n",
"relative_importance | \n",
"scaled_importance | \n",
"percentage |
\n",
"Origin | \n",
"6152.2 | \n",
"1.0 | \n",
"0.3 |
\n",
"fDayofMonth | \n",
"5583.6 | \n",
"0.9 | \n",
"0.3 |
\n",
"Dest | \n",
"4203.4 | \n",
"0.7 | \n",
"0.2 |
\n",
"UniqueCarrier | \n",
"1609.3 | \n",
"0.3 | \n",
"0.1 |
\n",
"fDayOfWeek | \n",
"1556.2 | \n",
"0.3 | \n",
"0.1 |
\n",
"Distance | \n",
"1493.0 | \n",
"0.2 | \n",
"0.1 |
\n",
"fMonth | \n",
"131.7 | \n",
"0.0 | \n",
"0.0 |
"
],
"text/plain": [
"variable relative_importance scaled_importance percentage\n",
"------------- --------------------- ------------------- ------------\n",
"Origin 6152.21 1 0.296788\n",
"fDayofMonth 5583.59 0.907575 0.269357\n",
"Dest 4203.39 0.683233 0.202775\n",
"UniqueCarrier 1609.28 0.261578 0.077633\n",
"fDayOfWeek 1556.19 0.252948 0.0750719\n",
"Distance 1492.99 0.242675 0.072023\n",
"fMonth 131.683 0.0214043 0.00635252"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Baseline forest: class balancing switched off, otherwise the same settings as rf_bal.\n",
"rf_no_bal = H2ORandomForestEstimator(\n",
"    seed=12,\n",
"    ntrees=10,\n",
"    max_depth=20,\n",
"    balance_classes=False\n",
")\n",
"# Fit on the training split and track metrics on the validation split.\n",
"rf_no_bal.train(\n",
"    x=myX,\n",
"    y=myY,\n",
"    training_frame=air_train,\n",
"    validation_frame=air_valid\n",
")\n",
"rf_no_bal.show()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"drf Model Build Progress: [##################################################] 100%\n",
"Model Details\n",
"=============\n",
"H2ORandomForestEstimator : Distributed RF\n",
"Model Key: DRF_model_python_1445557087082_2744\n",
"\n",
"Model Summary:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"number_of_trees | \n",
"model_size_in_bytes | \n",
"min_depth | \n",
"max_depth | \n",
"mean_depth | \n",
"min_leaves | \n",
"max_leaves | \n",
"mean_leaves |
\n",
" | \n",
"10.0 | \n",
"299144.0 | \n",
"20.0 | \n",
"20.0 | \n",
"20.0 | \n",
"1750.0 | \n",
"2460.0 | \n",
"2168.2 |
"
],
"text/plain": [
" number_of_trees model_size_in_bytes min_depth max_depth mean_depth min_leaves max_leaves mean_leaves\n",
"-- ----------------- --------------------- ----------- ----------- ------------ ------------ ------------ -------------\n",
" 10 299144 20 20 20 1750 2460 2168.2"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"ModelMetricsBinomial: drf\n",
"** Reported on train data. **\n",
"\n",
"MSE: 0.268874582249\n",
"R^2: -0.0754992978501\n",
"LogLoss: 2.09200342169\n",
"AUC: 0.685292136376\n",
"Gini: 0.370584272753\n",
"\n",
"Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.538182890839:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"NO | \n",
"YES | \n",
"Error | \n",
"Rate |
\n",
"NO | \n",
"3925.0 | \n",
"6621.0 | \n",
"0.6278 | \n",
" (6621.0/10546.0) |
\n",
"YES | \n",
"1574.0 | \n",
"8952.0 | \n",
"0.1495 | \n",
" (1574.0/10526.0) |
\n",
"Total | \n",
"5499.0 | \n",
"15573.0 | \n",
"0.3889 | \n",
" (8195.0/21072.0) |
"
],
"text/plain": [
" NO YES Error Rate\n",
"----- ---- ----- ------- ----------------\n",
"NO 3925 6621 0.6278 (6621.0/10546.0)\n",
"YES 1574 8952 0.1495 (1574.0/10526.0)\n",
"Total 5499 15573 0.3889 (8195.0/21072.0)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Maximum Metrics: Maximum metrics at their respective thresholds\n",
"\n"
]
},
{
"data": {
"text/html": [
"metric | \n",
"threshold | \n",
"value | \n",
"idx |
\n",
"max f1 | \n",
"0.5 | \n",
"0.7 | \n",
"226.0 |
\n",
"max f2 | \n",
"0.0 | \n",
"0.8 | \n",
"399.0 |
\n",
"max f0point5 | \n",
"0.8 | \n",
"0.6 | \n",
"124.0 |
\n",
"max accuracy | \n",
"0.7 | \n",
"0.6 | \n",
"140.0 |
\n",
"max precision | \n",
"0.9 | \n",
"0.7 | \n",
"28.0 |
\n",
"max absolute_MCC | \n",
"0.7 | \n",
"0.3 | \n",
"151.0 |
\n",
"max min_per_class_accuracy | \n",
"0.7 | \n",
"0.6 | \n",
"140.0 |
"
],
"text/plain": [
"metric threshold value idx\n",
"-------------------------- ----------- -------- -----\n",
"max f1 0.538183 0.686003 226\n",
"max f2 0 0.83307 399\n",
"max f0point5 0.761166 0.646627 124\n",
"max accuracy 0.731244 0.645976 140\n",
"max precision 0.939299 0.70679 28\n",
"max absolute_MCC 0.707454 0.292387 151\n",
"max min_per_class_accuracy 0.731244 0.645069 140"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"ModelMetricsBinomial: drf\n",
"** Reported on validation data. **\n",
"\n",
"MSE: 0.249809873778\n",
"R^2: -0.00855364526058\n",
"LogLoss: 0.770654128805\n",
"AUC: 0.682375448104\n",
"Gini: 0.364750896207\n",
"\n",
"Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.56328826827:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"NO | \n",
"YES | \n",
"Error | \n",
"Rate |
\n",
"NO | \n",
"822.0 | \n",
"1426.0 | \n",
"0.6343 | \n",
" (1426.0/2248.0) |
\n",
"YES | \n",
"367.0 | \n",
"2359.0 | \n",
"0.1346 | \n",
" (367.0/2726.0) |
\n",
"Total | \n",
"1189.0 | \n",
"3785.0 | \n",
"0.3605 | \n",
" (1793.0/4974.0) |
"
],
"text/plain": [
" NO YES Error Rate\n",
"----- ---- ----- ------- ---------------\n",
"NO 822 1426 0.6343 (1426.0/2248.0)\n",
"YES 367 2359 0.1346 (367.0/2726.0)\n",
"Total 1189 3785 0.3605 (1793.0/4974.0)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Maximum Metrics: Maximum metrics at their respective thresholds\n",
"\n"
]
},
{
"data": {
"text/html": [
"metric | \n",
"threshold | \n",
"value | \n",
"idx |
\n",
"max f1 | \n",
"0.6 | \n",
"0.7 | \n",
"261.0 |
\n",
"max f2 | \n",
"0.1 | \n",
"0.9 | \n",
"399.0 |
\n",
"max f0point5 | \n",
"0.7 | \n",
"0.7 | \n",
"179.0 |
\n",
"max accuracy | \n",
"0.6 | \n",
"0.6 | \n",
"235.0 |
\n",
"max precision | \n",
"1.0 | \n",
"0.8 | \n",
"6.0 |
\n",
"max absolute_MCC | \n",
"0.7 | \n",
"0.3 | \n",
"194.0 |
\n",
"max min_per_class_accuracy | \n",
"0.7 | \n",
"0.6 | \n",
"167.0 |
"
],
"text/plain": [
"metric threshold value idx\n",
"-------------------------- ----------- -------- -----\n",
"max f1 0.563288 0.72462 261\n",
"max f2 0.119724 0.85842 399\n",
"max f0point5 0.725001 0.671988 179\n",
"max accuracy 0.616361 0.644954 235\n",
"max precision 0.984071 0.844037 6\n",
"max absolute_MCC 0.694824 0.275787 194\n",
"max min_per_class_accuracy 0.743618 0.632117 167"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Scoring History:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"timestamp | \n",
"duration | \n",
"number_of_trees | \n",
"training_MSE | \n",
"training_logloss | \n",
"training_AUC | \n",
"training_classification_error | \n",
"validation_MSE | \n",
"validation_logloss | \n",
"validation_AUC | \n",
"validation_classification_error |
\n",
" | \n",
"2015-10-22 17:22:59 | \n",
" 0.093 sec | \n",
"1.0 | \n",
"0.3 | \n",
"7.3 | \n",
"0.6 | \n",
"0.4 | \n",
"0.3 | \n",
"7.9 | \n",
"0.6 | \n",
"0.5 |
\n",
" | \n",
"2015-10-22 17:22:59 | \n",
" 0.152 sec | \n",
"2.0 | \n",
"0.3 | \n",
"6.8 | \n",
"0.6 | \n",
"0.4 | \n",
"0.3 | \n",
"3.7 | \n",
"0.6 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 17:22:59 | \n",
" 0.210 sec | \n",
"3.0 | \n",
"0.3 | \n",
"5.9 | \n",
"0.6 | \n",
"0.4 | \n",
"0.3 | \n",
"2.2 | \n",
"0.6 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 17:22:59 | \n",
" 0.287 sec | \n",
"4.0 | \n",
"0.3 | \n",
"5.2 | \n",
"0.6 | \n",
"0.4 | \n",
"0.3 | \n",
"1.6 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 17:22:59 | \n",
" 0.377 sec | \n",
"5.0 | \n",
"0.3 | \n",
"4.3 | \n",
"0.7 | \n",
"0.4 | \n",
"0.3 | \n",
"1.3 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 17:22:59 | \n",
" 0.469 sec | \n",
"6.0 | \n",
"0.3 | \n",
"3.7 | \n",
"0.7 | \n",
"0.4 | \n",
"0.3 | \n",
"1.0 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 17:22:59 | \n",
" 0.571 sec | \n",
"7.0 | \n",
"0.3 | \n",
"3.2 | \n",
"0.7 | \n",
"0.4 | \n",
"0.3 | \n",
"0.9 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 17:22:59 | \n",
" 0.678 sec | \n",
"8.0 | \n",
"0.3 | \n",
"2.8 | \n",
"0.7 | \n",
"0.4 | \n",
"0.3 | \n",
"0.9 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 17:22:59 | \n",
" 0.784 sec | \n",
"9.0 | \n",
"0.3 | \n",
"2.4 | \n",
"0.7 | \n",
"0.4 | \n",
"0.2 | \n",
"0.8 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 17:22:59 | \n",
" 0.894 sec | \n",
"10.0 | \n",
"0.3 | \n",
"2.1 | \n",
"0.7 | \n",
"0.4 | \n",
"0.2 | \n",
"0.8 | \n",
"0.7 | \n",
"0.4 |
"
],
"text/plain": [
" timestamp duration number_of_trees training_MSE training_logloss training_AUC training_classification_error validation_MSE validation_logloss validation_AUC validation_classification_error\n",
"-- ------------------- ---------- ----------------- -------------- ------------------ -------------- ------------------------------- ---------------- -------------------- ---------------- ---------------------------------\n",
" 2015-10-22 17:22:59 0.093 sec 1 0.316234 7.2617 0.637354 0.391226 0.329137 7.87255 0.591265 0.45195\n",
" 2015-10-22 17:22:59 0.152 sec 2 0.31632 6.78219 0.637055 0.404918 0.287988 3.72898 0.623305 0.397467\n",
" 2015-10-22 17:22:59 0.210 sec 3 0.308719 5.88527 0.641048 0.41827 0.26868 2.16698 0.648628 0.405308\n",
" 2015-10-22 17:22:59 0.287 sec 4 0.305607 5.19153 0.641915 0.421194 0.263474 1.63783 0.656156 0.40953\n",
" 2015-10-22 17:22:59 0.377 sec 5 0.293851 4.31934 0.655025 0.410047 0.258099 1.26669 0.664182 0.385203\n",
" 2015-10-22 17:22:59 0.469 sec 6 0.286361 3.66732 0.662667 0.412603 0.254859 0.98441 0.670485 0.385806\n",
" 2015-10-22 17:22:59 0.571 sec 7 0.281092 3.23939 0.670316 0.406857 0.253256 0.94761 0.675664 0.382589\n",
" 2015-10-22 17:22:59 0.678 sec 8 0.276215 2.80048 0.676027 0.408681 0.252291 0.878195 0.678797 0.38842\n",
" 2015-10-22 17:22:59 0.784 sec 9 0.271224 2.40468 0.681395 0.390547 0.249776 0.825156 0.681276 0.372336\n",
" 2015-10-22 17:22:59 0.894 sec 10 0.268875 2.092 0.685292 0.388905 0.24981 0.770654 0.682375 0.360474"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Variable Importances:\n"
]
},
{
"data": {
"text/html": [
"variable | \n",
"relative_importance | \n",
"scaled_importance | \n",
"percentage |
\n",
"Origin | \n",
"6811.1 | \n",
"1.0 | \n",
"0.3 |
\n",
"fDayofMonth | \n",
"6129.0 | \n",
"0.9 | \n",
"0.3 |
\n",
"Dest | \n",
"4860.0 | \n",
"0.7 | \n",
"0.2 |
\n",
"UniqueCarrier | \n",
"1824.5 | \n",
"0.3 | \n",
"0.1 |
\n",
"fDayOfWeek | \n",
"1634.1 | \n",
"0.2 | \n",
"0.1 |
\n",
"Distance | \n",
"1591.5 | \n",
"0.2 | \n",
"0.1 |
\n",
"fMonth | \n",
"129.6 | \n",
"0.0 | \n",
"0.0 |
"
],
"text/plain": [
"variable relative_importance scaled_importance percentage\n",
"------------- --------------------- ------------------- ------------\n",
"Origin 6811.08 1 0.296394\n",
"fDayofMonth 6128.96 0.899851 0.266711\n",
"Dest 4860.05 0.71355 0.211492\n",
"UniqueCarrier 1824.52 0.267874 0.0793964\n",
"fDayOfWeek 1634.07 0.239913 0.0711088\n",
"Distance 1591.55 0.23367 0.0692584\n",
"fMonth 129.597 0.0190274 0.00563962"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Same forest as rf_no_bal, but with class balancing switched on.\n",
"rf_bal = H2ORandomForestEstimator(\n",
"    seed=12,\n",
"    ntrees=10,\n",
"    max_depth=20,\n",
"    balance_classes=True\n",
")\n",
"# Fit on the training split and track metrics on the validation split.\n",
"rf_bal.train(\n",
"    x=myX,\n",
"    y=myY,\n",
"    training_frame=air_train,\n",
"    validation_frame=air_valid\n",
")\n",
"rf_bal.show()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Parse Progress: [##################################################] 100%\n",
"Imported /Users/ludirehak/h2o-3/smalldata/airlines/AirlinesTest.csv.zip. Parsed 2,691 rows and 12 cols\n"
]
}
],
"source": [
"air_test = h2o.import_file(path=_locate(\"smalldata/airlines/AirlinesTest.csv.zip\"))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def model(model_object, test):\n",
"    \"\"\"Score ``model_object`` on ``test`` and print its evaluation metrics.\n",
"\n",
"    Calls ``predict`` and ``model_performance`` on ``model_object`` with the\n",
"    ``test`` frame, shows the performance report, and then prints the\n",
"    confusion matrix, precision, accuracy and AUC, in that order.\n",
"    Nothing is returned.\n",
"    \"\"\"\n",
"    # Predict on the test frame and look at the head of the predictions.\n",
"    predictions = model_object.predict(test)\n",
"    predictions.head()\n",
"    # Compute the performance metrics for the same test frame.\n",
"    performance = model_object.model_performance(test)\n",
"    performance.show()\n",
"    # Print the individual metrics one after another.\n",
"    for metric in (performance.confusion_matrix, performance.precision,\n",
"                   performance.accuracy, performance.auc):\n",
"        print(metric())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"WITHOUT CLASS BALANCING\n",
"\n",
"H2OFrame with 2691 rows and 3 columns: \n"
]
},
{
"data": {
"text/html": [
"predict | \n",
"YES | \n",
"YES | \n",
"YES | \n",
"YES | \n",
"YES | \n",
"YES | \n",
"NO | \n",
"YES | \n",
"YES | \n",
"YES |
\n",
"NO | \n",
"0.1 | \n",
"0.0 | \n",
"0.225 | \n",
"0.175 | \n",
"0.5 | \n",
"0.4 | \n",
"0.6 | \n",
"0.3 | \n",
"0.3 | \n",
"0.4 |
\n",
"YES | \n",
"0.9 | \n",
"1.0 | \n",
"0.775 | \n",
"0.825 | \n",
"0.5 | \n",
"0.6 | \n",
"0.4 | \n",
"0.7 | \n",
"0.7 | \n",
"0.6 |
"
],
"text/plain": [
"predict YES YES YES YES YES YES NO YES YES YES\n",
"--------- ----- --------- ----- ----- -------- -------- -------- -------- -------- -------\n",
"NO 0.14 0.0242857 0.225 0.175 0.453293 0.388391 0.598466 0.271046 0.307406 0.42179\n",
"YES 0.86 0.975714 0.775 0.825 0.546707 0.611609 0.401534 0.728954 0.692594 0.57821"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"ModelMetricsBinomial: drf\n",
"** Reported on test data. **\n",
"\n",
"MSE: 0.242134967995\n",
"R^2: 0.0225448334417\n",
"LogLoss: 0.818660036508\n",
"AUC: 0.705312795104\n",
"Gini: 0.410625590208\n",
"\n",
"Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.51742125228:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"NO | \n",
"YES | \n",
"Error | \n",
"Rate |
\n",
"NO | \n",
"377.0 | \n",
"840.0 | \n",
"0.6902 | \n",
" (840.0/1217.0) |
\n",
"YES | \n",
"143.0 | \n",
"1331.0 | \n",
"0.097 | \n",
" (143.0/1474.0) |
\n",
"Total | \n",
"520.0 | \n",
"2171.0 | \n",
"0.3653 | \n",
" (983.0/2691.0) |
"
],
"text/plain": [
" NO YES Error Rate\n",
"----- ---- ----- ------- --------------\n",
"NO 377 840 0.6902 (840.0/1217.0)\n",
"YES 143 1331 0.097 (143.0/1474.0)\n",
"Total 520 2171 0.3653 (983.0/2691.0)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Maximum Metrics: Maximum metrics at their respective thresholds\n",
"\n"
]
},
{
"data": {
"text/html": [
"metric | \n",
"threshold | \n",
"value | \n",
"idx |
\n",
"max f1 | \n",
"0.5 | \n",
"0.7 | \n",
"276.0 |
\n",
"max f2 | \n",
"0.2 | \n",
"0.9 | \n",
"381.0 |
\n",
"max f0point5 | \n",
"0.7 | \n",
"0.7 | \n",
"174.0 |
\n",
"max accuracy | \n",
"0.7 | \n",
"0.7 | \n",
"186.0 |
\n",
"max precision | \n",
"1.0 | \n",
"0.9 | \n",
"7.0 |
\n",
"max absolute_MCC | \n",
"0.7 | \n",
"0.3 | \n",
"174.0 |
\n",
"max min_per_class_accuracy | \n",
"0.7 | \n",
"0.7 | \n",
"162.0 |
"
],
"text/plain": [
"metric threshold value idx\n",
"-------------------------- ----------- -------- -----\n",
"max f1 0.517421 0.730316 276\n",
"max f2 0.247669 0.859932 381\n",
"max f0point5 0.716854 0.693575 174\n",
"max accuracy 0.693919 0.66518 186\n",
"max precision 0.98545 0.85567 7\n",
"max absolute_MCC 0.716854 0.322241 174\n",
"max min_per_class_accuracy 0.737149 0.654003 162"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.51742125228:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"NO | \n",
"YES | \n",
"Error | \n",
"Rate |
\n",
"NO | \n",
"377.0 | \n",
"840.0 | \n",
"0.6902 | \n",
" (840.0/1217.0) |
\n",
"YES | \n",
"143.0 | \n",
"1331.0 | \n",
"0.097 | \n",
" (143.0/1474.0) |
\n",
"Total | \n",
"520.0 | \n",
"2171.0 | \n",
"0.3653 | \n",
" (983.0/2691.0) |
"
],
"text/plain": [
" NO YES Error Rate\n",
"----- ---- ----- ------- --------------\n",
"NO 377 840 0.6902 (840.0/1217.0)\n",
"YES 143 1331 0.097 (143.0/1474.0)\n",
"Total 520 2171 0.3653 (983.0/2691.0)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"[[0.985450211376883, 0.8556701030927835]]\n",
"[[0.6939187561627477, 0.6651802303976218]]\n",
"0.705312795104\n"
]
}
],
"source": [
"print(\"\\n\\nWITHOUT CLASS BALANCING\\n\")\n",
"model(rf_no_bal, air_test)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"WITH CLASS BALANCING\n",
"\n",
"H2OFrame with 2691 rows and 3 columns: \n"
]
},
{
"data": {
"text/html": [
"predict | \n",
"YES | \n",
"YES | \n",
"YES | \n",
"YES | \n",
"NO | \n",
"NO | \n",
"NO | \n",
"YES | \n",
"YES | \n",
"NO |
\n",
"NO | \n",
"0.0 | \n",
"0.3 | \n",
"0.1 | \n",
"0.0 | \n",
"0.4 | \n",
"0.5 | \n",
"0.7 | \n",
"0.1 | \n",
"0.3 | \n",
"0.5 |
\n",
"YES | \n",
"1.0 | \n",
"0.7 | \n",
"0.9 | \n",
"1.0 | \n",
"0.6 | \n",
"0.5 | \n",
"0.3 | \n",
"0.9 | \n",
"0.7 | \n",
"0.5 |
"
],
"text/plain": [
"predict YES YES YES YES NO NO NO YES YES NO\n",
"--------- --------- -------- --------- --------- -------- -------- -------- --------- -------- --------\n",
"NO 0.0116536 0.255432 0.0877282 0.0487275 0.447038 0.491235 0.669041 0.0670805 0.338563 0.484582\n",
"YES 0.988346 0.744568 0.912272 0.951272 0.552962 0.508765 0.330959 0.932919 0.661437 0.515418"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"ModelMetricsBinomial: drf\n",
"** Reported on test data. **\n",
"\n",
"MSE: 0.24831550935\n",
"R^2: -0.00240489657592\n",
"LogLoss: 0.758488823047\n",
"AUC: 0.693547371085\n",
"Gini: 0.38709474217\n",
"\n",
"Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.475092852495:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"NO | \n",
"YES | \n",
"Error | \n",
"Rate |
\n",
"NO | \n",
"269.0 | \n",
"948.0 | \n",
"0.779 | \n",
" (948.0/1217.0) |
\n",
"YES | \n",
"85.0 | \n",
"1389.0 | \n",
"0.0577 | \n",
" (85.0/1474.0) |
\n",
"Total | \n",
"354.0 | \n",
"2337.0 | \n",
"0.3839 | \n",
" (1033.0/2691.0) |
"
],
"text/plain": [
" NO YES Error Rate\n",
"----- ---- ----- ------- ---------------\n",
"NO 269 948 0.779 (948.0/1217.0)\n",
"YES 85 1389 0.0577 (85.0/1474.0)\n",
"Total 354 2337 0.3839 (1033.0/2691.0)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Maximum Metrics: Maximum metrics at their respective thresholds\n",
"\n"
]
},
{
"data": {
"text/html": [
"metric | \n",
"threshold | \n",
"value | \n",
"idx |
\n",
"max f1 | \n",
"0.5 | \n",
"0.7 | \n",
"307.0 |
\n",
"max f2 | \n",
"0.3 | \n",
"0.9 | \n",
"379.0 |
\n",
"max f0point5 | \n",
"0.7 | \n",
"0.7 | \n",
"184.0 |
\n",
"max accuracy | \n",
"0.7 | \n",
"0.7 | \n",
"210.0 |
\n",
"max precision | \n",
"1.0 | \n",
"0.85 | \n",
"1.0 |
\n",
"max absolute_MCC | \n",
"0.7 | \n",
"0.3 | \n",
"210.0 |
\n",
"max min_per_class_accuracy | \n",
"0.7 | \n",
"0.6 | \n",
"164.0 |
"
],
"text/plain": [
"metric threshold value idx\n",
"-------------------------- ----------- -------- -----\n",
"max f1 0.475093 0.728943 307\n",
"max f2 0.256539 0.859284 379\n",
"max f0point5 0.7144 0.6802 184\n",
"max accuracy 0.667305 0.654032 210\n",
"max precision 0.996238 0.85 1\n",
"max absolute_MCC 0.667305 0.29437 210\n",
"max min_per_class_accuracy 0.749175 0.637634 164"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.475092852495:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"NO | \n",
"YES | \n",
"Error | \n",
"Rate |
\n",
"NO | \n",
"269.0 | \n",
"948.0 | \n",
"0.779 | \n",
" (948.0/1217.0) |
\n",
"YES | \n",
"85.0 | \n",
"1389.0 | \n",
"0.0577 | \n",
" (85.0/1474.0) |
\n",
"Total | \n",
"354.0 | \n",
"2337.0 | \n",
"0.3839 | \n",
" (1033.0/2691.0) |
"
],
"text/plain": [
" NO YES Error Rate\n",
"----- ---- ----- ------- ---------------\n",
"NO 269 948 0.779 (948.0/1217.0)\n",
"YES 85 1389 0.0577 (85.0/1474.0)\n",
"Total 354 2337 0.3839 (1033.0/2691.0)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"[[0.9962384300103982, 0.85]]\n",
"[[0.6673053431289202, 0.6540319583797845]]\n",
"0.693547371085\n"
]
}
],
"source": [
"print(\"\\n\\nWITH CLASS BALANCING\\n\")\n",
"model(rf_bal, air_test)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 0
}