{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import h2o\n", "from h2o.estimators.gbm import H2OGradientBoostingEstimator\n", "from h2o.estimators.glm import H2OGeneralizedLinearEstimator\n", "# _locate is a private h2o function, used to find files within the h2o git project directory.\n", "from h2o.utils.shared_utils import _locate" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Warning: Version mismatch. H2O is version 3.5.0.99999, but the python package is version UNKNOWN.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
H2O cluster uptime: 46 minutes 47 seconds 756 milliseconds
H2O cluster version: 3.5.0.99999
H2O cluster name: ludirehak
H2O cluster total nodes: 1
H2O cluster total memory: 4.44 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321
" ], "text/plain": [ "-------------------------- --------------------------------------\n", "H2O cluster uptime: 46 minutes 47 seconds 756 milliseconds\n", "H2O cluster version: 3.5.0.99999\n", "H2O cluster name: ludirehak\n", "H2O cluster total nodes: 1\n", "H2O cluster total memory: 4.44 GB\n", "H2O cluster total cores: 8\n", "H2O cluster allowed cores: 8\n", "H2O cluster healthy: True\n", "H2O Connection ip: 127.0.0.1\n", "H2O Connection port: 54321\n", "-------------------------- --------------------------------------" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "h2o.init()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Parse Progress: [##################################################] 100%\n", "Imported /Users/ludirehak/h2o-3/smalldata/airlines/AirlinesTrain.csv.zip. Parsed 24,421 rows and 12 cols\n" ] } ], "source": [ "# Upload the training data file to H2O; _locate resolves the path inside the h2o git project.\n", "air = h2o.import_file(path=_locate(\"smalldata/airlines/AirlinesTrain.csv.zip\"))" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Construct train and validation sets by random sampling (80% train / 20% validation).\n", "# runif() creates a column of uniform random numbers as tall as air.nrow; the fixed seed\n", "# keeps the split, and every metric computed from it, reproducible across re-runs.\n", "SPLIT_SEED = 1234\n", "r = air[0].runif(seed=SPLIT_SEED)\n", "air_train = air[r < 0.8]\n", "air_valid = air[r >= 0.8]\n", "\n", "# Predictor columns and the response column used by both models.\n", "myX = [\"Origin\", \"Dest\", \"Distance\", \"UniqueCarrier\", \"fMonth\", \"fDayofMonth\", \"fDayOfWeek\"]\n", "myY = \"IsDepDelayed\"" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "gbm Model Build Progress: [##################################################] 100%\n", "Model Details\n", "=============\n", "H2OGradientBoostingEstimator : Gradient Boosting Machine\n", "Model Key: GBM_model_python_1445544453075_131\n", "\n", "Model Summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
number_of_treesmodel_size_in_bytesmin_depthmax_depthmean_depthmin_leavesmax_leavesmean_leaves
100.021616.03.03.03.08.08.08.0
" ], "text/plain": [ " number_of_trees model_size_in_bytes min_depth max_depth mean_depth min_leaves max_leaves mean_leaves\n", "-- ----------------- --------------------- ----------- ----------- ------------ ------------ ------------ -------------\n", " 100 21616 3 3 3 8 8 8" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "ModelMetricsBinomial: gbm\n", "** Reported on train data. **\n", "\n", "MSE: 0.225577653051\n", "R^2: 0.0898968077725\n", "LogLoss: 0.643152070892\n", "AUC: 0.698999790699\n", "Gini: 0.397999581398\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.449028022489:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
NOYESErrorRate
NO2837.06004.00.6791 (6004.0/8841.0)
YES1198.09446.00.1126 (1198.0/10644.0)
Total4035.015450.00.3696 (7202.0/19485.0)
" ], "text/plain": [ " NO YES Error Rate\n", "----- ---- ----- ------- ----------------\n", "NO 2837 6004 0.6791 (6004.0/8841.0)\n", "YES 1198 9446 0.1126 (1198.0/10644.0)\n", "Total 4035 15450 0.3696 (7202.0/19485.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Maximum Metrics: Maximum metrics at their respective thresholds\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
metricthresholdvalueidx
max f10.40.7327.0
max f20.40.9396.0
max f0point50.50.7217.0
max accuracy0.50.7217.0
max precision0.71.00.0
max absolute_MCC0.50.3217.0
max min_per_class_accuracy0.50.6199.0
" ], "text/plain": [ "metric threshold value idx\n", "-------------------------- ----------- -------- -----\n", "max f1 0.449028 0.723998 327\n", "max f2 0.391058 0.858331 396\n", "max f0point5 0.536867 0.679513 217\n", "max accuracy 0.536867 0.652091 217\n", "max precision 0.70175 0.954545 0\n", "max absolute_MCC 0.536867 0.2952 217\n", "max min_per_class_accuracy 0.549643 0.645058 199" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "ModelMetricsBinomial: gbm\n", "** Reported on validation data. **\n", "\n", "MSE: 0.226773773291\n", "R^2: 0.0840250526986\n", "LogLoss: 0.64567275652\n", "AUC: 0.689332681253\n", "Gini: 0.378665362506\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.447676388566:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
NOYESErrorRate
NO709.01516.00.6813 (1516.0/2225.0)
YES271.02440.00.1 (271.0/2711.0)
Total980.03956.00.362 (1787.0/4936.0)
" ], "text/plain": [ " NO YES Error Rate\n", "----- ---- ----- ------- ---------------\n", "NO 709 1516 0.6813 (1516.0/2225.0)\n", "YES 271 2440 0.1 (271.0/2711.0)\n", "Total 980 3956 0.362 (1787.0/4936.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Maximum Metrics: Maximum metrics at their respective thresholds\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
metricthresholdvalueidx
max f10.40.7332.0
max f20.40.9390.0
max f0point50.50.7223.0
max accuracy0.50.6278.0
max precision0.70.93.0
max absolute_MCC0.50.3223.0
max min_per_class_accuracy0.50.6205.0
" ], "text/plain": [ "metric threshold value idx\n", "-------------------------- ----------- -------- -----\n", "max f1 0.447676 0.731963 332\n", "max f2 0.397016 0.860572 390\n", "max f0point5 0.536953 0.676097 223\n", "max accuracy 0.488859 0.647488 278\n", "max precision 0.698812 0.857143 3\n", "max absolute_MCC 0.536953 0.282515 223\n", "max min_per_class_accuracy 0.549863 0.637303 205" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Scoring History:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
timestampdurationnumber_of_treestraining_MSEtraining_loglosstraining_AUCtraining_classification_errorvalidation_MSEvalidation_loglossvalidation_AUCvalidation_classification_error
2015-10-22 13:54:21 0.064 sec1.00.20.70.70.40.20.70.70.4
2015-10-22 13:54:21 0.105 sec2.00.20.70.70.40.20.70.70.4
2015-10-22 13:54:21 0.127 sec3.00.20.70.70.40.20.70.70.4
2015-10-22 13:54:21 0.148 sec4.00.20.70.70.40.20.70.70.4
2015-10-22 13:54:21 0.165 sec5.00.20.70.70.40.20.70.70.4
------------------------------------
2015-10-22 13:54:25 3.670 sec74.00.20.70.70.40.20.70.70.4
2015-10-22 13:54:25 3.759 sec75.00.20.60.70.40.20.70.70.4
2015-10-22 13:54:25 3.848 sec76.00.20.60.70.40.20.70.70.4
2015-10-22 13:54:25 3.942 sec77.00.20.60.70.40.20.70.70.4
2015-10-22 13:54:25 4.253 sec100.00.20.60.70.40.20.60.70.4
" ], "text/plain": [ " timestamp duration number_of_trees training_MSE training_logloss training_AUC training_classification_error validation_MSE validation_logloss validation_AUC validation_classification_error\n", "--- ------------------- ---------- ----------------- -------------- ------------------ -------------- ------------------------------- ---------------- -------------------- ---------------- ---------------------------------\n", " 2015-10-22 13:54:21 0.064 sec 1.0 0.24744203641 0.688017967213 0.661146601782 0.394559917886 0.247187639079 0.687507942596 0.653644453102 0.387358184765\n", " 2015-10-22 13:54:21 0.105 sec 2.0 0.247032960495 0.687192893145 0.661146601782 0.394559917886 0.246798272619 0.686722630143 0.653644453102 0.387358184765\n", " 2015-10-22 13:54:21 0.127 sec 3.0 0.246641112878 0.686402635464 0.661191244068 0.394559917886 0.246413866245 0.685947386299 0.653625719603 0.387358184765\n", " 2015-10-22 13:54:21 0.148 sec 4.0 0.246256837916 0.685627693478 0.66128086869 0.394559917886 0.246037095293 0.685187587881 0.653522768248 0.387358184765\n", " 2015-10-22 13:54:21 0.165 sec 5.0 0.245880433609 0.684868650345 0.661264865052 0.394559917886 0.245668101129 0.684443502432 0.653487622213 0.387358184765\n", "--- --- --- --- --- --- --- --- --- --- --- ---\n", " 2015-10-22 13:54:25 3.670 sec 74.0 0.228843104496 0.650073267981 0.695174745911 0.383525789069 0.229526045213 0.651499695631 0.68725898234 0.360818476499\n", " 2015-10-22 13:54:25 3.759 sec 75.0 0.228712827672 0.64980226979 0.695338065904 0.384860148832 0.229406690649 0.651250869305 0.687490332768 0.360818476499\n", " 2015-10-22 13:54:25 3.848 sec 76.0 0.228574240144 0.649512720177 0.695468114059 0.385578650244 0.229293238024 0.651013099537 0.687698639335 0.360615883306\n", " 2015-10-22 13:54:25 3.942 sec 77.0 0.228432084788 0.649211690345 0.69570165453 0.385322042597 0.229175189133 0.650762798143 0.687752021519 0.361021069692\n", " 2015-10-22 13:54:25 4.253 sec 100.0 0.225577653051 
0.643152070892 0.698999790699 0.369617654606 0.226773773291 0.64567275652 0.689332681253 0.362034035656" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Variable Importances:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
variablerelative_importancescaled_importancepercentage
Origin16932.91.00.7
Dest4282.90.30.2
UniqueCarrier1623.90.10.1
fDayofMonth1457.20.10.1
fDayOfWeek95.70.00.0
fMonth67.40.00.0
Distance0.00.00.0
" ], "text/plain": [ "variable relative_importance scaled_importance percentage\n", "------------- --------------------- ------------------- ------------\n", "Origin 16932.9 1 0.692269\n", "Dest 4282.91 0.252935 0.175099\n", "UniqueCarrier 1623.88 0.0959011 0.0663893\n", "fDayofMonth 1457.19 0.0860569 0.0595745\n", "fDayOfWeek 95.7408 0.00565414 0.00391419\n", "fMonth 67.379 0.00397918 0.00275466\n", "Distance 0 0 0" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Gradient boosting machine: bernoulli distribution, 100 trees of max depth 3, learn rate 0.01.\n", "gbm = H2OGradientBoostingEstimator(\n", "    distribution=\"bernoulli\",\n", "    ntrees=100,\n", "    max_depth=3,\n", "    learn_rate=0.01,\n", ")\n", "# Fit on the training split and score against the validation split.\n", "gbm.train(\n", "    x=myX,\n", "    y=myY,\n", "    training_frame=air_train,\n", "    validation_frame=air_valid,\n", ")\n", "# Print the model summary, train/validation metrics, scoring history and variable importances.\n", "gbm.show()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "glm Model Build Progress: [##################################################] 100%\n", "\n", "Coefficients: glm coefficients\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
namescoefficientsstandardized_coefficients
Intercept0.10.2
Origin.ABE-0.0-0.0
Origin.ABQ-0.0-0.0
Origin.ACY-0.0-0.0
Origin.ALB0.00.0
---------
fDayOfWeek.f6-0.1-0.1
fDayOfWeek.f70.00.0
fMonth.f1-0.1-0.1
fMonth.f100.10.1
Distance0.00.1
" ], "text/plain": [ "names coefficients standardized_coefficients\n", "------------- ----------------- ---------------------------\n", "Intercept 0.0663095803818 0.215820066602\n", "Origin.ABE -0.00583359882932 -0.00583359882932\n", "Origin.ABQ -0.0368111369703 -0.0368111369703\n", "Origin.ACY -0.013059173517 -0.013059173517\n", "Origin.ALB 0.00723780457571 0.00723780457571\n", "--- --- ---\n", "fDayOfWeek.f6 -0.0870834614368 -0.0870834614368\n", "fDayOfWeek.f7 0.022439958678 0.022439958678\n", "fMonth.f1 -0.0942343897579 -0.0942343897579\n", "fMonth.f10 0.100615936561 0.100615936561\n", "Distance 0.000198390708934 0.12506622695" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# Generalized linear model: binomial family, fitted with the L_BFGS solver.\n", "glm = H2OGeneralizedLinearEstimator(family=\"binomial\", solver=\"L_BFGS\")\n", "# Same predictors, response and train/validation frames as the GBM above.\n", "glm.train(\n", "    x=myX,\n", "    y=myY,\n", "    training_frame=air_train,\n", "    validation_frame=air_valid,\n", ")\n", "# Print the table of fitted coefficients.\n", "glm.pprint_coef()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Parse Progress: [##################################################] 100%\n", "Imported /Users/ludirehak/h2o-3/smalldata/airlines/AirlinesTest.csv.zip. Parsed 2,691 rows and 12 cols\n" ] } ], "source": [ "# Upload the held-out test file to H2O.\n", "air_test = h2o.import_file(path=_locate(\"smalldata/airlines/AirlinesTest.csv.zip\"))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GBM predictions: \n", "H2OFrame with 2691 rows and 3 columns: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
predictYESYESYESYESYESYESNONONONO
NO0.50.50.50.40.50.50.60.60.60.6
YES0.50.50.50.60.50.50.40.40.40.4
" ], "text/plain": [ "predict YES YES YES YES YES YES NO NO NO NO\n", "--------- -------- -------- -------- -------- -------- -------- ------- -------- -------- --------\n", "NO 0.491774 0.492421 0.495248 0.395015 0.543697 0.542794 0.55786 0.560893 0.566712 0.566712\n", "YES 0.508226 0.507579 0.504752 0.604985 0.456303 0.457206 0.44214 0.439107 0.433288 0.433288" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "GBM performance: \n", "\n", "ModelMetricsBinomial: gbm\n", "** Reported on test data. **\n", "\n", "MSE: 0.226299117103\n", "R^2: 0.086471305524\n", "LogLoss: 0.644721964315\n", "AUC: 0.693439503015\n", "Gini: 0.386879006031\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.446332927983:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
NOYESErrorRate
NO345.0872.00.7165 (872.0/1217.0)
YES136.01338.00.0923 (136.0/1474.0)
Total481.02210.00.3746 (1008.0/2691.0)
" ], "text/plain": [ " NO YES Error Rate\n", "----- ---- ----- ------- ---------------\n", "NO 345 872 0.7165 (872.0/1217.0)\n", "YES 136 1338 0.0923 (136.0/1474.0)\n", "Total 481 2210 0.3746 (1008.0/2691.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Maximum Metrics: Maximum metrics at their respective thresholds\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
metricthresholdvalueidx
max f10.40.7335.0
max f20.40.9391.0
max f0point50.50.7220.0
max accuracy0.50.7225.0
max precision0.71.00.0
max absolute_MCC0.50.3225.0
max min_per_class_accuracy0.60.6203.0
" ], "text/plain": [ "metric threshold value idx\n", "-------------------------- ----------- -------- -----\n", "max f1 0.446333 0.726384 335\n", "max f2 0.397676 0.859137 391\n", "max f0point5 0.538884 0.685789 220\n", "max accuracy 0.535158 0.659606 225\n", "max precision 0.70175 1 0\n", "max absolute_MCC 0.535158 0.308497 225\n", "max min_per_class_accuracy 0.553579 0.639756 203" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "GLM predictions: \n", "H2OFrame with 2691 rows and 3 columns: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
predictYESYESYESYESYESYESYESYESYESYES
p00.40.40.40.50.50.50.50.40.50.5
p10.60.60.60.50.50.50.50.60.50.5
" ], "text/plain": [ "predict YES YES YES YES YES YES YES YES YES YES\n", "--------- -------- -------- -------- -------- -------- -------- -------- -------- -------- --------\n", "p0 0.408076 0.434986 0.413786 0.462743 0.503313 0.506724 0.465098 0.438877 0.489413 0.487458\n", "p1 0.591924 0.565014 0.586214 0.537257 0.496687 0.493276 0.534902 0.561123 0.510587 0.512542" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "GLM performance: \n", "\n", "ModelMetricsBinomialGLM: glm\n", "** Reported on test data. **\n", "\n", "MSE: 0.232028999965\n", "R^2: 0.0633408025091\n", "LogLoss: 0.656433714264\n", "Null degrees of freedom: 2690\n", "Residual degrees of freedom: 2438\n", "Null deviance: 3705.96023003\n", "Residual deviance: 3532.92625017\n", "AIC: 4038.92625017\n", "AUC: 0.656781919193\n", "Gini: 0.313563838386\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.459695105317:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
NOYESErrorRate
NO280.0937.00.7699 (937.0/1217.0)
YES106.01368.00.0719 (106.0/1474.0)
Total386.02305.00.3876 (1043.0/2691.0)
" ], "text/plain": [ " NO YES Error Rate\n", "----- ---- ----- ------- ---------------\n", "NO 280 937 0.7699 (937.0/1217.0)\n", "YES 106 1368 0.0719 (106.0/1474.0)\n", "Total 386 2305 0.3876 (1043.0/2691.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Maximum Metrics: Maximum metrics at their respective thresholds\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
metricthresholdvalueidx
max f10.50.7309.0
max f20.40.9391.0
max f0point50.50.7256.0
max accuracy0.50.6256.0
max precision0.81.00.0
max absolute_MCC0.50.2257.0
max min_per_class_accuracy0.60.6192.0
" ], "text/plain": [ "metric threshold value idx\n", "-------------------------- ----------- -------- -----\n", "max f1 0.459695 0.724001 309\n", "max f2 0.352918 0.859494 391\n", "max f0point5 0.512906 0.655381 256\n", "max accuracy 0.512906 0.631735 256\n", "max precision 0.766709 1 0\n", "max absolute_MCC 0.512335 0.249485 257\n", "max min_per_class_accuracy 0.557716 0.604478 192" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Predictions and performance on the test file, using one shared routine for both models.\n", "def score_on_test(label, model):\n", "    \"\"\"Score `model` on air_test, printing its predictions and performance.\n", "\n", "    label -- short model name used in the printed headings (e.g. \"GBM\").\n", "    model -- a trained estimator offering predict() and model_performance().\n", "    Returns the tuple (predictions, performance).\n", "    \"\"\"\n", "    predictions = model.predict(air_test)\n", "    print(\"{0} predictions: \".format(label))\n", "    predictions.head()\n", "\n", "    performance = model.model_performance(air_test)\n", "    print(\"{0} performance: \".format(label))\n", "    performance.show()\n", "    return predictions, performance\n", "\n", "gbm_pred, gbm_perf = score_on_test(\"GBM\", gbm)\n", "glm_pred, glm_perf = score_on_test(\"GLM\", glm)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.446332927983:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
NOYESErrorRate
NO345.0872.00.7165 (872.0/1217.0)
YES136.01338.00.0923 (136.0/1474.0)
Total481.02210.00.3746 (1008.0/2691.0)
" ], "text/plain": [ " NO YES Error Rate\n", "----- ---- ----- ------- ---------------\n", "NO 345 872 0.7165 (872.0/1217.0)\n", "YES 136 1338 0.0923 (136.0/1474.0)\n", "Total 481 2210 0.3746 (1008.0/2691.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.459695105317:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
NOYESErrorRate
NO280.0937.00.7699 (937.0/1217.0)
YES106.01368.00.0719 (106.0/1474.0)
Total386.02305.00.3876 (1043.0/2691.0)
" ], "text/plain": [ " NO YES Error Rate\n", "----- ---- ----- ------- ---------------\n", "NO 280 937 0.7699 (937.0/1217.0)\n", "YES 106 1368 0.0719 (106.0/1474.0)\n", "Total 386 2305 0.3876 (1043.0/2691.0)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# Build the confusion matrices for the test set from the performance objects.\n", "gbm_CM = gbm_perf.confusion_matrix()\n", "print(gbm_CM)\n", "# print('') emits a blank line on Python 2 and 3 alike; a bare `print` is a no-op on Python 3.\n", "print('')\n", "\n", "glm_CM = glm_perf.confusion_matrix()\n", "print(glm_CM)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GBM Precision: [[0.7017496139979671, 1.0]]\n", "GBM Accuracy: [[0.5351575413437655, 0.6596060943887031]]\n", "GBM AUC: 0.693439503015\n", "\n", "GLM Precision: [[0.7667089295101112, 1.0]]\n", "GLM Accuracy: [[0.512905531794376, 0.63173541434411]]\n", "GLM AUC: 0.656781919193\n" ] } ], "source": [ "# Precision, accuracy and AUC on the test set.\n", "# precision() and accuracy() print as [[threshold, value]]; the pairs shown match the\n", "# 'max precision' / 'max accuracy' rows of the performance tables above.\n", "print('GBM Precision: {0}'.format(gbm_perf.precision()))\n", "print('GBM Accuracy: {0}'.format(gbm_perf.accuracy()))\n", "print('GBM AUC: {0}'.format(gbm_perf.auc()))\n", "# Blank separator line; print('') behaves the same on Python 2 and 3.\n", "print('')\n", "print('GLM Precision: {0}'.format(glm_perf.precision()))\n", "print('GLM Accuracy: {0}'.format(glm_perf.accuracy()))\n", "print('GLM AUC: {0}'.format(glm_perf.auc()))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }