{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import h2o\n",
"from h2o.estimators.gbm import H2OGradientBoostingEstimator\n",
"from h2o.estimators.glm import H2OGeneralizedLinearEstimator"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: Version mismatch. H2O is version 3.5.0.99999, but the python package is version UNKNOWN.\n"
]
},
{
"data": {
"text/html": [
"
H2O cluster uptime: | \n",
"46 minutes 47 seconds 756 milliseconds |
\n",
"H2O cluster version: | \n",
"3.5.0.99999 |
\n",
"H2O cluster name: | \n",
"ludirehak |
\n",
"H2O cluster total nodes: | \n",
"1 |
\n",
"H2O cluster total memory: | \n",
"4.44 GB |
\n",
"H2O cluster total cores: | \n",
"8 |
\n",
"H2O cluster allowed cores: | \n",
"8 |
\n",
"H2O cluster healthy: | \n",
"True |
\n",
"H2O Connection ip: | \n",
"127.0.0.1 |
\n",
"H2O Connection port: | \n",
"54321 |
"
],
"text/plain": [
"-------------------------- --------------------------------------\n",
"H2O cluster uptime: 46 minutes 47 seconds 756 milliseconds\n",
"H2O cluster version: 3.5.0.99999\n",
"H2O cluster name: ludirehak\n",
"H2O cluster total nodes: 1\n",
"H2O cluster total memory: 4.44 GB\n",
"H2O cluster total cores: 8\n",
"H2O cluster allowed cores: 8\n",
"H2O cluster healthy: True\n",
"H2O Connection ip: 127.0.0.1\n",
"H2O Connection port: 54321\n",
"-------------------------- --------------------------------------"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"h2o.init()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Parse Progress: [##################################################] 100%\n",
"Imported /Users/ludirehak/h2o-3/smalldata/airlines/AirlinesTrain.csv.zip. Parsed 24,421 rows and 12 cols\n"
]
}
],
"source": [
"from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.\n",
"\n",
"#uploading data file to h2o\n",
"air = h2o.import_file(path=_locate(\"smalldata/airlines/AirlinesTrain.csv.zip\"))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Constructing validation and train sets by sampling (20/80)\n",
"# creating a column as tall as air.nrow\n",
"r = air[0].runif()\n",
"air_train = air[r < 0.8]\n",
"air_valid = air[r >= 0.8]\n",
"\n",
"myX = [\"Origin\", \"Dest\", \"Distance\", \"UniqueCarrier\", \"fMonth\", \"fDayofMonth\", \"fDayOfWeek\"]\n",
"myY = \"IsDepDelayed\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"gbm Model Build Progress: [##################################################] 100%\n",
"Model Details\n",
"=============\n",
"H2OGradientBoostingEstimator : Gradient Boosting Machine\n",
"Model Key: GBM_model_python_1445544453075_131\n",
"\n",
"Model Summary:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"number_of_trees | \n",
"model_size_in_bytes | \n",
"min_depth | \n",
"max_depth | \n",
"mean_depth | \n",
"min_leaves | \n",
"max_leaves | \n",
"mean_leaves |
\n",
" | \n",
"100.0 | \n",
"21616.0 | \n",
"3.0 | \n",
"3.0 | \n",
"3.0 | \n",
"8.0 | \n",
"8.0 | \n",
"8.0 |
"
],
"text/plain": [
" number_of_trees model_size_in_bytes min_depth max_depth mean_depth min_leaves max_leaves mean_leaves\n",
"-- ----------------- --------------------- ----------- ----------- ------------ ------------ ------------ -------------\n",
" 100 21616 3 3 3 8 8 8"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"ModelMetricsBinomial: gbm\n",
"** Reported on train data. **\n",
"\n",
"MSE: 0.225577653051\n",
"R^2: 0.0898968077725\n",
"LogLoss: 0.643152070892\n",
"AUC: 0.698999790699\n",
"Gini: 0.397999581398\n",
"\n",
"Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.449028022489:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"NO | \n",
"YES | \n",
"Error | \n",
"Rate |
\n",
"NO | \n",
"2837.0 | \n",
"6004.0 | \n",
"0.6791 | \n",
" (6004.0/8841.0) |
\n",
"YES | \n",
"1198.0 | \n",
"9446.0 | \n",
"0.1126 | \n",
" (1198.0/10644.0) |
\n",
"Total | \n",
"4035.0 | \n",
"15450.0 | \n",
"0.3696 | \n",
" (7202.0/19485.0) |
"
],
"text/plain": [
" NO YES Error Rate\n",
"----- ---- ----- ------- ----------------\n",
"NO 2837 6004 0.6791 (6004.0/8841.0)\n",
"YES 1198 9446 0.1126 (1198.0/10644.0)\n",
"Total 4035 15450 0.3696 (7202.0/19485.0)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Maximum Metrics: Maximum metrics at their respective thresholds\n",
"\n"
]
},
{
"data": {
"text/html": [
"metric | \n",
"threshold | \n",
"value | \n",
"idx |
\n",
"max f1 | \n",
"0.4 | \n",
"0.7 | \n",
"327.0 |
\n",
"max f2 | \n",
"0.4 | \n",
"0.9 | \n",
"396.0 |
\n",
"max f0point5 | \n",
"0.5 | \n",
"0.7 | \n",
"217.0 |
\n",
"max accuracy | \n",
"0.5 | \n",
"0.7 | \n",
"217.0 |
\n",
"max precision | \n",
"0.7 | \n",
"1.0 | \n",
"0.0 |
\n",
"max absolute_MCC | \n",
"0.5 | \n",
"0.3 | \n",
"217.0 |
\n",
"max min_per_class_accuracy | \n",
"0.5 | \n",
"0.6 | \n",
"199.0 |
"
],
"text/plain": [
"metric threshold value idx\n",
"-------------------------- ----------- -------- -----\n",
"max f1 0.449028 0.723998 327\n",
"max f2 0.391058 0.858331 396\n",
"max f0point5 0.536867 0.679513 217\n",
"max accuracy 0.536867 0.652091 217\n",
"max precision 0.70175 0.954545 0\n",
"max absolute_MCC 0.536867 0.2952 217\n",
"max min_per_class_accuracy 0.549643 0.645058 199"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"ModelMetricsBinomial: gbm\n",
"** Reported on validation data. **\n",
"\n",
"MSE: 0.226773773291\n",
"R^2: 0.0840250526986\n",
"LogLoss: 0.64567275652\n",
"AUC: 0.689332681253\n",
"Gini: 0.378665362506\n",
"\n",
"Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.447676388566:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"NO | \n",
"YES | \n",
"Error | \n",
"Rate |
\n",
"NO | \n",
"709.0 | \n",
"1516.0 | \n",
"0.6813 | \n",
" (1516.0/2225.0) |
\n",
"YES | \n",
"271.0 | \n",
"2440.0 | \n",
"0.1 | \n",
" (271.0/2711.0) |
\n",
"Total | \n",
"980.0 | \n",
"3956.0 | \n",
"0.362 | \n",
" (1787.0/4936.0) |
"
],
"text/plain": [
" NO YES Error Rate\n",
"----- ---- ----- ------- ---------------\n",
"NO 709 1516 0.6813 (1516.0/2225.0)\n",
"YES 271 2440 0.1 (271.0/2711.0)\n",
"Total 980 3956 0.362 (1787.0/4936.0)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Maximum Metrics: Maximum metrics at their respective thresholds\n",
"\n"
]
},
{
"data": {
"text/html": [
"metric | \n",
"threshold | \n",
"value | \n",
"idx |
\n",
"max f1 | \n",
"0.4 | \n",
"0.7 | \n",
"332.0 |
\n",
"max f2 | \n",
"0.4 | \n",
"0.9 | \n",
"390.0 |
\n",
"max f0point5 | \n",
"0.5 | \n",
"0.7 | \n",
"223.0 |
\n",
"max accuracy | \n",
"0.5 | \n",
"0.6 | \n",
"278.0 |
\n",
"max precision | \n",
"0.7 | \n",
"0.9 | \n",
"3.0 |
\n",
"max absolute_MCC | \n",
"0.5 | \n",
"0.3 | \n",
"223.0 |
\n",
"max min_per_class_accuracy | \n",
"0.5 | \n",
"0.6 | \n",
"205.0 |
"
],
"text/plain": [
"metric threshold value idx\n",
"-------------------------- ----------- -------- -----\n",
"max f1 0.447676 0.731963 332\n",
"max f2 0.397016 0.860572 390\n",
"max f0point5 0.536953 0.676097 223\n",
"max accuracy 0.488859 0.647488 278\n",
"max precision 0.698812 0.857143 3\n",
"max absolute_MCC 0.536953 0.282515 223\n",
"max min_per_class_accuracy 0.549863 0.637303 205"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Scoring History:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"timestamp | \n",
"duration | \n",
"number_of_trees | \n",
"training_MSE | \n",
"training_logloss | \n",
"training_AUC | \n",
"training_classification_error | \n",
"validation_MSE | \n",
"validation_logloss | \n",
"validation_AUC | \n",
"validation_classification_error |
\n",
" | \n",
"2015-10-22 13:54:21 | \n",
" 0.064 sec | \n",
"1.0 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 13:54:21 | \n",
" 0.105 sec | \n",
"2.0 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 13:54:21 | \n",
" 0.127 sec | \n",
"3.0 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 13:54:21 | \n",
" 0.148 sec | \n",
"4.0 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 13:54:21 | \n",
" 0.165 sec | \n",
"5.0 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 |
\n",
"--- | \n",
"--- | \n",
"--- | \n",
"--- | \n",
"--- | \n",
"--- | \n",
"--- | \n",
"--- | \n",
"--- | \n",
"--- | \n",
"--- | \n",
"--- |
\n",
" | \n",
"2015-10-22 13:54:25 | \n",
" 3.670 sec | \n",
"74.0 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 13:54:25 | \n",
" 3.759 sec | \n",
"75.0 | \n",
"0.2 | \n",
"0.6 | \n",
"0.7 | \n",
"0.4 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 13:54:25 | \n",
" 3.848 sec | \n",
"76.0 | \n",
"0.2 | \n",
"0.6 | \n",
"0.7 | \n",
"0.4 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 13:54:25 | \n",
" 3.942 sec | \n",
"77.0 | \n",
"0.2 | \n",
"0.6 | \n",
"0.7 | \n",
"0.4 | \n",
"0.2 | \n",
"0.7 | \n",
"0.7 | \n",
"0.4 |
\n",
" | \n",
"2015-10-22 13:54:25 | \n",
" 4.253 sec | \n",
"100.0 | \n",
"0.2 | \n",
"0.6 | \n",
"0.7 | \n",
"0.4 | \n",
"0.2 | \n",
"0.6 | \n",
"0.7 | \n",
"0.4 |
"
],
"text/plain": [
" timestamp duration number_of_trees training_MSE training_logloss training_AUC training_classification_error validation_MSE validation_logloss validation_AUC validation_classification_error\n",
"--- ------------------- ---------- ----------------- -------------- ------------------ -------------- ------------------------------- ---------------- -------------------- ---------------- ---------------------------------\n",
" 2015-10-22 13:54:21 0.064 sec 1.0 0.24744203641 0.688017967213 0.661146601782 0.394559917886 0.247187639079 0.687507942596 0.653644453102 0.387358184765\n",
" 2015-10-22 13:54:21 0.105 sec 2.0 0.247032960495 0.687192893145 0.661146601782 0.394559917886 0.246798272619 0.686722630143 0.653644453102 0.387358184765\n",
" 2015-10-22 13:54:21 0.127 sec 3.0 0.246641112878 0.686402635464 0.661191244068 0.394559917886 0.246413866245 0.685947386299 0.653625719603 0.387358184765\n",
" 2015-10-22 13:54:21 0.148 sec 4.0 0.246256837916 0.685627693478 0.66128086869 0.394559917886 0.246037095293 0.685187587881 0.653522768248 0.387358184765\n",
" 2015-10-22 13:54:21 0.165 sec 5.0 0.245880433609 0.684868650345 0.661264865052 0.394559917886 0.245668101129 0.684443502432 0.653487622213 0.387358184765\n",
"--- --- --- --- --- --- --- --- --- --- --- ---\n",
" 2015-10-22 13:54:25 3.670 sec 74.0 0.228843104496 0.650073267981 0.695174745911 0.383525789069 0.229526045213 0.651499695631 0.68725898234 0.360818476499\n",
" 2015-10-22 13:54:25 3.759 sec 75.0 0.228712827672 0.64980226979 0.695338065904 0.384860148832 0.229406690649 0.651250869305 0.687490332768 0.360818476499\n",
" 2015-10-22 13:54:25 3.848 sec 76.0 0.228574240144 0.649512720177 0.695468114059 0.385578650244 0.229293238024 0.651013099537 0.687698639335 0.360615883306\n",
" 2015-10-22 13:54:25 3.942 sec 77.0 0.228432084788 0.649211690345 0.69570165453 0.385322042597 0.229175189133 0.650762798143 0.687752021519 0.361021069692\n",
" 2015-10-22 13:54:25 4.253 sec 100.0 0.225577653051 0.643152070892 0.698999790699 0.369617654606 0.226773773291 0.64567275652 0.689332681253 0.362034035656"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Variable Importances:\n"
]
},
{
"data": {
"text/html": [
"variable | \n",
"relative_importance | \n",
"scaled_importance | \n",
"percentage |
\n",
"Origin | \n",
"16932.9 | \n",
"1.0 | \n",
"0.7 |
\n",
"Dest | \n",
"4282.9 | \n",
"0.3 | \n",
"0.2 |
\n",
"UniqueCarrier | \n",
"1623.9 | \n",
"0.1 | \n",
"0.1 |
\n",
"fDayofMonth | \n",
"1457.2 | \n",
"0.1 | \n",
"0.1 |
\n",
"fDayOfWeek | \n",
"95.7 | \n",
"0.0 | \n",
"0.0 |
\n",
"fMonth | \n",
"67.4 | \n",
"0.0 | \n",
"0.0 |
\n",
"Distance | \n",
"0.0 | \n",
"0.0 | \n",
"0.0 |
"
],
"text/plain": [
"variable relative_importance scaled_importance percentage\n",
"------------- --------------------- ------------------- ------------\n",
"Origin 16932.9 1 0.692269\n",
"Dest 4282.91 0.252935 0.175099\n",
"UniqueCarrier 1623.88 0.0959011 0.0663893\n",
"fDayofMonth 1457.19 0.0860569 0.0595745\n",
"fDayOfWeek 95.7408 0.00565414 0.00391419\n",
"fMonth 67.379 0.00397918 0.00275466\n",
"Distance 0 0 0"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"#gbm\n",
"gbm = H2OGradientBoostingEstimator(distribution=\"bernoulli\", \n",
" ntrees=100, \n",
" max_depth=3, \n",
" learn_rate=0.01)\n",
"gbm.train(x =myX, \n",
" y =myY, \n",
" training_frame =air_train,\n",
" validation_frame=air_valid)\n",
"gbm.show()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"glm Model Build Progress: [##################################################] 100%\n",
"\n",
"Coefficients: glm coefficients\n",
"\n"
]
},
{
"data": {
"text/html": [
"names | \n",
"coefficients | \n",
"standardized_coefficients |
\n",
"Intercept | \n",
"0.1 | \n",
"0.2 |
\n",
"Origin.ABE | \n",
"-0.0 | \n",
"-0.0 |
\n",
"Origin.ABQ | \n",
"-0.0 | \n",
"-0.0 |
\n",
"Origin.ACY | \n",
"-0.0 | \n",
"-0.0 |
\n",
"Origin.ALB | \n",
"0.0 | \n",
"0.0 |
\n",
"--- | \n",
"--- | \n",
"--- |
\n",
"fDayOfWeek.f6 | \n",
"-0.1 | \n",
"-0.1 |
\n",
"fDayOfWeek.f7 | \n",
"0.0 | \n",
"0.0 |
\n",
"fMonth.f1 | \n",
"-0.1 | \n",
"-0.1 |
\n",
"fMonth.f10 | \n",
"0.1 | \n",
"0.1 |
\n",
"Distance | \n",
"0.0 | \n",
"0.1 |
"
],
"text/plain": [
"names coefficients standardized_coefficients\n",
"------------- ----------------- ---------------------------\n",
"Intercept 0.0663095803818 0.215820066602\n",
"Origin.ABE -0.00583359882932 -0.00583359882932\n",
"Origin.ABQ -0.0368111369703 -0.0368111369703\n",
"Origin.ACY -0.013059173517 -0.013059173517\n",
"Origin.ALB 0.00723780457571 0.00723780457571\n",
"--- --- ---\n",
"fDayOfWeek.f6 -0.0870834614368 -0.0870834614368\n",
"fDayOfWeek.f7 0.022439958678 0.022439958678\n",
"fMonth.f1 -0.0942343897579 -0.0942343897579\n",
"fMonth.f10 0.100615936561 0.100615936561\n",
"Distance 0.000198390708934 0.12506622695"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"#glm\n",
"glm = H2OGeneralizedLinearEstimator(family = \"binomial\", solver=\"L_BFGS\")\n",
"glm.train(x =myX, \n",
" y =myY, \n",
" training_frame =air_train,\n",
" validation_frame=air_valid)\n",
"glm.pprint_coef()\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Parse Progress: [##################################################] 100%\n",
"Imported /Users/ludirehak/h2o-3/smalldata/airlines/AirlinesTest.csv.zip. Parsed 2,691 rows and 12 cols\n"
]
}
],
"source": [
"#uploading test file to h2o\n",
"air_test = h2o.import_file(path=_locate(\"smalldata/airlines/AirlinesTest.csv.zip\"))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GBM predictions: \n",
"H2OFrame with 2691 rows and 3 columns: \n"
]
},
{
"data": {
"text/html": [
"predict | \n",
"YES | \n",
"YES | \n",
"YES | \n",
"YES | \n",
"YES | \n",
"YES | \n",
"NO | \n",
"NO | \n",
"NO | \n",
"NO |
\n",
"NO | \n",
"0.5 | \n",
"0.5 | \n",
"0.5 | \n",
"0.4 | \n",
"0.5 | \n",
"0.5 | \n",
"0.6 | \n",
"0.6 | \n",
"0.6 | \n",
"0.6 |
\n",
"YES | \n",
"0.5 | \n",
"0.5 | \n",
"0.5 | \n",
"0.6 | \n",
"0.5 | \n",
"0.5 | \n",
"0.4 | \n",
"0.4 | \n",
"0.4 | \n",
"0.4 |
"
],
"text/plain": [
"predict YES YES YES YES YES YES NO NO NO NO\n",
"--------- -------- -------- -------- -------- -------- -------- ------- -------- -------- --------\n",
"NO 0.491774 0.492421 0.495248 0.395015 0.543697 0.542794 0.55786 0.560893 0.566712 0.566712\n",
"YES 0.508226 0.507579 0.504752 0.604985 0.456303 0.457206 0.44214 0.439107 0.433288 0.433288"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"GBM performance: \n",
"\n",
"ModelMetricsBinomial: gbm\n",
"** Reported on test data. **\n",
"\n",
"MSE: 0.226299117103\n",
"R^2: 0.086471305524\n",
"LogLoss: 0.644721964315\n",
"AUC: 0.693439503015\n",
"Gini: 0.386879006031\n",
"\n",
"Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.446332927983:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"NO | \n",
"YES | \n",
"Error | \n",
"Rate |
\n",
"NO | \n",
"345.0 | \n",
"872.0 | \n",
"0.7165 | \n",
" (872.0/1217.0) |
\n",
"YES | \n",
"136.0 | \n",
"1338.0 | \n",
"0.0923 | \n",
" (136.0/1474.0) |
\n",
"Total | \n",
"481.0 | \n",
"2210.0 | \n",
"0.3746 | \n",
" (1008.0/2691.0) |
"
],
"text/plain": [
" NO YES Error Rate\n",
"----- ---- ----- ------- ---------------\n",
"NO 345 872 0.7165 (872.0/1217.0)\n",
"YES 136 1338 0.0923 (136.0/1474.0)\n",
"Total 481 2210 0.3746 (1008.0/2691.0)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Maximum Metrics: Maximum metrics at their respective thresholds\n",
"\n"
]
},
{
"data": {
"text/html": [
"metric | \n",
"threshold | \n",
"value | \n",
"idx |
\n",
"max f1 | \n",
"0.4 | \n",
"0.7 | \n",
"335.0 |
\n",
"max f2 | \n",
"0.4 | \n",
"0.9 | \n",
"391.0 |
\n",
"max f0point5 | \n",
"0.5 | \n",
"0.7 | \n",
"220.0 |
\n",
"max accuracy | \n",
"0.5 | \n",
"0.7 | \n",
"225.0 |
\n",
"max precision | \n",
"0.7 | \n",
"1.0 | \n",
"0.0 |
\n",
"max absolute_MCC | \n",
"0.5 | \n",
"0.3 | \n",
"225.0 |
\n",
"max min_per_class_accuracy | \n",
"0.6 | \n",
"0.6 | \n",
"203.0 |
"
],
"text/plain": [
"metric threshold value idx\n",
"-------------------------- ----------- -------- -----\n",
"max f1 0.446333 0.726384 335\n",
"max f2 0.397676 0.859137 391\n",
"max f0point5 0.538884 0.685789 220\n",
"max accuracy 0.535158 0.659606 225\n",
"max precision 0.70175 1 0\n",
"max absolute_MCC 0.535158 0.308497 225\n",
"max min_per_class_accuracy 0.553579 0.639756 203"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"GLM predictions: \n",
"H2OFrame with 2691 rows and 3 columns: \n"
]
},
{
"data": {
"text/html": [
"predict | \n",
"YES | \n",
"YES | \n",
"YES | \n",
"YES | \n",
"YES | \n",
"YES | \n",
"YES | \n",
"YES | \n",
"YES | \n",
"YES |
\n",
"p0 | \n",
"0.4 | \n",
"0.4 | \n",
"0.4 | \n",
"0.5 | \n",
"0.5 | \n",
"0.5 | \n",
"0.5 | \n",
"0.4 | \n",
"0.5 | \n",
"0.5 |
\n",
"p1 | \n",
"0.6 | \n",
"0.6 | \n",
"0.6 | \n",
"0.5 | \n",
"0.5 | \n",
"0.5 | \n",
"0.5 | \n",
"0.6 | \n",
"0.5 | \n",
"0.5 |
"
],
"text/plain": [
"predict YES YES YES YES YES YES YES YES YES YES\n",
"--------- -------- -------- -------- -------- -------- -------- -------- -------- -------- --------\n",
"p0 0.408076 0.434986 0.413786 0.462743 0.503313 0.506724 0.465098 0.438877 0.489413 0.487458\n",
"p1 0.591924 0.565014 0.586214 0.537257 0.496687 0.493276 0.534902 0.561123 0.510587 0.512542"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"GLM performance: \n",
"\n",
"ModelMetricsBinomialGLM: glm\n",
"** Reported on test data. **\n",
"\n",
"MSE: 0.232028999965\n",
"R^2: 0.0633408025091\n",
"LogLoss: 0.656433714264\n",
"Null degrees of freedom: 2690\n",
"Residual degrees of freedom: 2438\n",
"Null deviance: 3705.96023003\n",
"Residual deviance: 3532.92625017\n",
"AIC: 4038.92625017\n",
"AUC: 0.656781919193\n",
"Gini: 0.313563838386\n",
"\n",
"Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.459695105317:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"NO | \n",
"YES | \n",
"Error | \n",
"Rate |
\n",
"NO | \n",
"280.0 | \n",
"937.0 | \n",
"0.7699 | \n",
" (937.0/1217.0) |
\n",
"YES | \n",
"106.0 | \n",
"1368.0 | \n",
"0.0719 | \n",
" (106.0/1474.0) |
\n",
"Total | \n",
"386.0 | \n",
"2305.0 | \n",
"0.3876 | \n",
" (1043.0/2691.0) |
"
],
"text/plain": [
" NO YES Error Rate\n",
"----- ---- ----- ------- ---------------\n",
"NO 280 937 0.7699 (937.0/1217.0)\n",
"YES 106 1368 0.0719 (106.0/1474.0)\n",
"Total 386 2305 0.3876 (1043.0/2691.0)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Maximum Metrics: Maximum metrics at their respective thresholds\n",
"\n"
]
},
{
"data": {
"text/html": [
"metric | \n",
"threshold | \n",
"value | \n",
"idx |
\n",
"max f1 | \n",
"0.5 | \n",
"0.7 | \n",
"309.0 |
\n",
"max f2 | \n",
"0.4 | \n",
"0.9 | \n",
"391.0 |
\n",
"max f0point5 | \n",
"0.5 | \n",
"0.7 | \n",
"256.0 |
\n",
"max accuracy | \n",
"0.5 | \n",
"0.6 | \n",
"256.0 |
\n",
"max precision | \n",
"0.8 | \n",
"1.0 | \n",
"0.0 |
\n",
"max absolute_MCC | \n",
"0.5 | \n",
"0.2 | \n",
"257.0 |
\n",
"max min_per_class_accuracy | \n",
"0.6 | \n",
"0.6 | \n",
"192.0 |
"
],
"text/plain": [
"metric threshold value idx\n",
"-------------------------- ----------- -------- -----\n",
"max f1 0.459695 0.724001 309\n",
"max f2 0.352918 0.859494 391\n",
"max f0point5 0.512906 0.655381 256\n",
"max accuracy 0.512906 0.631735 256\n",
"max precision 0.766709 1 0\n",
"max absolute_MCC 0.512335 0.249485 257\n",
"max min_per_class_accuracy 0.557716 0.604478 192"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# predicting & performance on test file\n",
"gbm_pred = gbm.predict(air_test)\n",
"print(\"GBM predictions: \")\n",
"gbm_pred.head()\n",
"\n",
"gbm_perf = gbm.model_performance(air_test)\n",
"print(\"GBM performance: \")\n",
"gbm_perf.show()\n",
"\n",
"glm_pred = glm.predict(air_test)\n",
"print(\"GLM predictions: \")\n",
"glm_pred.head()\n",
"\n",
"glm_perf = glm.model_performance(air_test)\n",
"print(\"GLM performance: \")\n",
"glm_perf.show()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.446332927983:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"NO | \n",
"YES | \n",
"Error | \n",
"Rate |
\n",
"NO | \n",
"345.0 | \n",
"872.0 | \n",
"0.7165 | \n",
" (872.0/1217.0) |
\n",
"YES | \n",
"136.0 | \n",
"1338.0 | \n",
"0.0923 | \n",
" (136.0/1474.0) |
\n",
"Total | \n",
"481.0 | \n",
"2210.0 | \n",
"0.3746 | \n",
" (1008.0/2691.0) |
"
],
"text/plain": [
" NO YES Error Rate\n",
"----- ---- ----- ------- ---------------\n",
"NO 345 872 0.7165 (872.0/1217.0)\n",
"YES 136 1338 0.0923 (136.0/1474.0)\n",
"Total 481 2210 0.3746 (1008.0/2691.0)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\n",
"Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.459695105317:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"NO | \n",
"YES | \n",
"Error | \n",
"Rate |
\n",
"NO | \n",
"280.0 | \n",
"937.0 | \n",
"0.7699 | \n",
" (937.0/1217.0) |
\n",
"YES | \n",
"106.0 | \n",
"1368.0 | \n",
"0.0719 | \n",
" (106.0/1474.0) |
\n",
"Total | \n",
"386.0 | \n",
"2305.0 | \n",
"0.3876 | \n",
" (1043.0/2691.0) |
"
],
"text/plain": [
" NO YES Error Rate\n",
"----- ---- ----- ------- ---------------\n",
"NO 280 937 0.7699 (937.0/1217.0)\n",
"YES 106 1368 0.0719 (106.0/1474.0)\n",
"Total 386 2305 0.3876 (1043.0/2691.0)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# Building confusion matrix for test set\n",
"gbm_CM = gbm_perf.confusion_matrix()\n",
"print(gbm_CM)\n",
"print\n",
"\n",
"glm_CM = glm_perf.confusion_matrix()\n",
"print(glm_CM)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GBM Precision: [[0.7017496139979671, 1.0]]\n",
"GBM Accuracy: [[0.5351575413437655, 0.6596060943887031]]\n",
"GBM AUC: 0.693439503015\n",
"\n",
"GLM Precision: [[0.7667089295101112, 1.0]]\n",
"GLM Accuracy: [[0.512905531794376, 0.63173541434411]]\n",
"GLM AUC: 0.656781919193\n"
]
}
],
"source": [
"# ROC for test set\n",
"print('GBM Precision: {0}'.format(gbm_perf.precision()))\n",
"print('GBM Accuracy: {0}'.format(gbm_perf.accuracy()))\n",
"print('GBM AUC: {0}'.format(gbm_perf.auc()))\n",
"print\n",
"print('GLM Precision: {0}'.format(glm_perf.precision()))\n",
"print('GLM Accuracy: {0}'.format(glm_perf.accuracy()))\n",
"print('GLM AUC: {0}'.format(glm_perf.auc()))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}