{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "import h2o\n", "import time\n", "from h2o.estimators.glm import H2OGeneralizedLinearEstimator\n", "from h2o.estimators.gbm import H2OGradientBoostingEstimator\n", "from h2o.estimators.random_forest import H2ORandomForestEstimator\n", "from h2o.estimators.deeplearning import H2ODeepLearningEstimator" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Connecting to H2O server at http://localhost:54321... successful!\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:90: DeprecationWarning: DisplayFormatter._ipython_display_formatter_default is deprecated: use @default decorator instead.\n", " def _ipython_display_formatter_default(self):\n", "/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:96: DeprecationWarning: DisplayFormatter._formatters_default is deprecated: use @default decorator instead.\n", " def _formatters_default(self):\n", "/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:675: DeprecationWarning: PlainTextFormatter._deferred_printers_default is deprecated: use @default decorator instead.\n", " def _deferred_printers_default(self):\n", "/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:667: DeprecationWarning: PlainTextFormatter._singleton_printers_default is deprecated: use @default decorator instead.\n", " def _singleton_printers_default(self):\n", "/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:670: DeprecationWarning: PlainTextFormatter._type_printers_default is deprecated: use @default decorator instead.\n", " def _type_printers_default(self):\n", 
"/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:667: DeprecationWarning: PlainTextFormatter._singleton_printers_default is deprecated: use @default decorator instead.\n", " def _singleton_printers_default(self):\n", "/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:670: DeprecationWarning: PlainTextFormatter._type_printers_default is deprecated: use @default decorator instead.\n", " def _type_printers_default(self):\n", "/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:675: DeprecationWarning: PlainTextFormatter._deferred_printers_default is deprecated: use @default decorator instead.\n", " def _deferred_printers_default(self):\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
H2O cluster uptime:2 mins 28 secs
H2O cluster version:3.9.1.99999
H2O cluster name:H2O_from_python_navdeepgill_t343ab
H2O cluster total nodes:1
H2O cluster free memory:3.244 Gb
H2O cluster total cores:8
H2O cluster allowed cores:8
H2O cluster is healthy:True
H2O cluster is locked:True
H2O connection url:http://localhost:54321
H2O connection proxy:None
Python version:2.7.11 final
" ], "text/plain": [ "-------------------------- ----------------------------------\n", "H2O cluster uptime: 2 mins 28 secs\n", "H2O cluster version: 3.9.1.99999\n", "H2O cluster name: H2O_from_python_navdeepgill_t343ab\n", "H2O cluster total nodes: 1\n", "H2O cluster free memory: 3.244 Gb\n", "H2O cluster total cores: 8\n", "H2O cluster allowed cores: 8\n", "H2O cluster is healthy: True\n", "H2O cluster is locked: True\n", "H2O connection url: http://localhost:54321\n", "H2O connection proxy:\n", "Python version: 2.7.11 final\n", "-------------------------- ----------------------------------" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Explore a typical Data Science workflow with H2O and Python\n", "#\n", "# Goal: assist the manager of CitiBike of NYC to load-balance the bicycles\n", "# across the CitiBike network of stations, by predicting the number of bike\n", "# trips taken from the station every day. Use 10 million rows of historical\n", "# data, and eventually add weather data.\n", "\n", "\n", "# Connect to a cluster\n", "h2o.init()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from h2o.utils.shared_utils import _locate # private function. 
used to find files within h2o git project directory.\n",
 "\n",
 "# Set this to True if you want to fetch the data directly from S3.\n",
 "# This is useful if your cluster is running in EC2.\n",
 "data_source_is_s3 = False\n",
 "\n",
 "# Public HTTPS endpoint of the h2o-public-test-data bucket. The previous\n",
 "# s3n://h2o-public-test-data/ prefix made the import fail on clusters without\n",
 "# AWS credentials (H2OResponseError: AWS Access Key ID and Secret Access Key\n",
 "# must be specified ... of a s3n URL).\n",
 "S3_PUBLIC_URL = \"https://s3.amazonaws.com/h2o-public-test-data/\"\n",
 "\n",
 "def mylocate(s):\n",
 "    \"\"\"Return the location of data file `s` to hand to h2o.import_file.\n",
 "\n",
 "    `s` is the file's path relative to the data root, e.g.\n",
 "    'bigdata/laptop/citibike-nyc/2013-10.csv'. When data_source_is_s3 is True\n",
 "    the file's public HTTPS URL is returned; otherwise the file is looked up\n",
 "    inside the local h2o git project directory via _locate.\n",
 "    \"\"\"\n",
 "    if data_source_is_s3:\n",
 "        return S3_PUBLIC_URL + s\n",
 "    return _locate(s)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [
 "# Pick either the big or the small demo.\n",
 "# Big data is 10M rows\n",
 "CITIBIKE_DIR = \"bigdata/laptop/citibike-nyc/\"\n",
 "# One CSV per month, July 2013 through August 2014.\n",
 "BIG_TEST_MONTHS = [\"2013-07\", \"2013-08\", \"2013-09\", \"2013-10\", \"2013-11\", \"2013-12\",\n",
 "                   \"2014-01\", \"2014-02\", \"2014-03\", \"2014-04\", \"2014-05\", \"2014-06\",\n",
 "                   \"2014-07\", \"2014-08\"]\n",
 "\n",
 "small_test = [mylocate(CITIBIKE_DIR + \"2013-10.csv\")]\n",
 "big_test = [mylocate(CITIBIKE_DIR + month + \".csv\") for month in BIG_TEST_MONTHS]\n",
 "\n",
 "# ----------\n",
 "\n",
 "# 1- Load data - 1 row per bicycle trip. Has columns showing the start and end\n",
 "# station, trip duration and trip start time and day. The larger dataset\n",
 "# totals about 10 million rows\n",
 "print(\"Import and Parse bike data\")\n",
 "data = h2o.import_file(path=small_test)"
 ] },
 { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows:1,037,712 Cols:16\n", "\n", "Chunk compression summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers172.2135415 1.3 KB0.0022872
C11-Byte Integers486.25 1016.6 KB1.7506603
C1N1-Byte Integers (w/o NAs)486.25 1016.6 KB1.7506603
C1S1-Byte Fractions7910.286459 1.6 MB2.8878725
C22-Byte Integers24331.640625 10.0 MB17.696283
C2S2-Byte Fractions496.3802085 2.0 MB3.5701983
C44-Byte Integers324.166667 2.6 MB4.6726856
C4S4-Byte Fractions395.078125 3.2 MB5.6373096
C864-bit Integers607.8125 9.9 MB17.432673
C8D64-bit Reals15319.921875 25.3 MB44.59937
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ --------- -----------------\n", "C0L Constant Integers 17 2.21354 1.3 KB 0.00228718\n", "C1 1-Byte Integers 48 6.25 1016.6 KB 1.75066\n", "C1N 1-Byte Integers (w/o NAs) 48 6.25 1016.6 KB 1.75066\n", "C1S 1-Byte Fractions 79 10.2865 1.6 MB 2.88787\n", "C2 2-Byte Integers 243 31.6406 10.0 MB 17.6963\n", "C2S 2-Byte Fractions 49 6.38021 2.0 MB 3.5702\n", "C4 4-Byte Integers 32 4.16667 2.6 MB 4.67269\n", "C4S 4-Byte Fractions 39 5.07812 3.2 MB 5.63731\n", "C8 64-bit Integers 60 7.8125 9.9 MB 17.4327\n", "C8D 64-bit Reals 153 19.9219 25.3 MB 44.5994" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.61:54321 56.7 MB1037712.048.0768.0
mean 56.7 MB1037712.048.0768.0
min 56.7 MB1037712.048.0768.0
max 56.7 MB1037712.048.0768.0
stddev 0 B0.00.00.0
total 56.7 MB1037712.048.0768.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------- ---------------- ----------------------------- ------------------\n", "172.16.2.61:54321 56.7 MB 1.03771e+06 48 768\n", "mean 56.7 MB 1.03771e+06 48 768\n", "min 56.7 MB 1.03771e+06 48 768\n", "max 56.7 MB 1.03771e+06 48 768\n", "stddev 0 B 0 0 0\n", "total 56.7 MB 1.03771e+06 48 768" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
tripduration starttime stoptime start station id start station name start station latitude start station longitude end station id end station name end station latitude end station longitude bikeid usertype birth year gender Days
type int time time int enum real real int enum real real int enum int int int
mins 60.0 1.380610868e+12 1.380611083e+12 72.0 0.0 40.680342423 -74.01713445 72.0 0.0 40.680342423 -74.01713445 14529.0 0.0 1899.0 0.0 15979.0
mean 825.614754383 1.38191371692e+121.38191454253e+12443.714212614 NaN 40.7345188586 -73.9911328848 443.207421712 NaN 40.7342847885 -73.9912702982 17644.07164510.9060953328091975.778394861.12375591686 15993.8523906
maxs 1259480.0 1.383289197e+12 1.38341851e+12 3002.0 329.0 40.770513 -73.9500479759 3002.0 329.0 40.770513 -73.9500479759 20757.0 1.0 1997.0 2.0 16010.0
sigma 2000.3732323 778871729.132 778847387.503 354.434325075 NaN 0.0195734073053 0.0123161234106 357.398217058 NaN 0.0195578458116 0.0123855811965 1717.681121340.29169618212311.13149062380.5443805932919.02215033588
zeros 0 0 0 0 5239 0 0 0 5449 0 0 0 97446 0 97498 0
missing0 0 0 0 0 0 0 0 0 0 0 0 0 97445 0 0
0 326.0 1.380610868e+12 1.380611194e+12 239.0 Willoughby St & Fleet St40.69196566 -73.9813018 366.0 Clinton Ave & Myrtle Ave 40.693261 -73.968896 16052.0 Subscriber 1982.0 1.0 15979.0
1 729.0 1.380610881e+12 1.38061161e+12 322.0 Clinton St & Tillary St 40.696192 -73.991218 398.0 Atlantic Ave & Furman St 40.69165183 -73.9999786 19412.0 Customer nan 0.0 15979.0
2 520.0 1.380610884e+12 1.380611404e+12 174.0 E 25 St & 1 Ave 40.7381765 -73.97738662 403.0 E 2 St & 2 Ave 40.72502876 -73.99069656 19645.0 Subscriber 1984.0 1.0 15979.0
3 281.0 1.380610885e+12 1.380611166e+12 430.0 York St & Jay St 40.7014851 -73.98656928 323.0 Lawrence St & Willoughby St 40.69236178 -73.98631746 16992.0 Subscriber 1985.0 1.0 15979.0
4 196.0 1.380610887e+12 1.380611083e+12 403.0 E 2 St & 2 Ave 40.72502876 -73.99069656 401.0 Allen St & Rivington St 40.72019576 -73.98997825 15690.0 Subscriber 1986.0 1.0 15979.0
5 1948.0 1.380610908e+12 1.380612856e+12 369.0 Washington Pl & 6 Ave 40.73224119 -74.00026394 307.0 Canal St & Rutgers St 40.71427487 -73.98990025 19846.0 Subscriber 1977.0 1.0 15979.0
6 1327.0 1.380610908e+12 1.380612235e+12 254.0 W 11 St & 6 Ave 40.73532427 -73.99800419 539.0 Metropolitan Ave & Bedford Ave40.71534825 -73.96024116 14563.0 Subscriber 1986.0 2.0 15979.0
7 1146.0 1.380610917e+12 1.380612063e+12 490.0 8 Ave & W 33 St 40.751551 -73.993934 438.0 St Marks Pl & 1 Ave 40.72779126 -73.98564945 16793.0 Subscriber 1959.0 1.0 15979.0
8 380.0 1.380610918e+12 1.380611298e+12 468.0 Broadway & W 55 St 40.7652654 -73.98192338 385.0 E 55 St & 2 Ave 40.75797322 -73.96603308 16600.0 Customer nan 0.0 15979.0
9 682.0 1.380610925e+12 1.380611607e+12 300.0 Shevchenko Pl & E 6 St 40.728145 -73.990214 519.0 Pershing Square N 40.75188406 -73.97770164 15204.0 Subscriber 1992.0 1.0 15979.0
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# ----------\n", "\n", "# 2- light data munging: group the bike starts per-day, converting the 10M rows\n", "# of trips to about 140,000 station&day combos - predicting the number of trip\n", "# starts per-station-per-day.\n", "\n", "# Convert start time (milliseconds since the Epoch) to: Day since the Epoch.\n", "# Note: despite its name, secsPerDay is the number of milliseconds per day.\n", "startime = data[\"starttime\"]\n", "secsPerDay=1000*60*60*24\n", "data[\"Days\"] = (startime/secsPerDay).floor()\n", "data.describe()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Daysstart station name bikes
159791 Ave & E 15 St 97
159791 Ave & E 18 St 75
159791 Ave & E 30 St 113
1597910 Ave & W 28 St 74
1597911 Ave & W 27 St 139
1597911 Ave & W 41 St 60
1597912 Ave & W 40 St 90
159792 Ave & E 31 St 88
159792 Ave & E 58 St 55
159793 Ave & Schermerhorn St 8
" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Rows:10,450 Cols:3\n", "\n", "Chunk compression summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers11.0416667 80 B0.1364815
C1N1-Byte Integers (w/o NAs)11.0416667 412 B0.7028798
C1S1-Byte Fractions3132.291664 12.4 KB21.714207
C22-Byte Integers6365.625 44.3 KB77.446434
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ ------- -----------------\n", "C0L Constant Integers 1 1.04167 80 B 0.136482\n", "C1N 1-Byte Integers (w/o NAs) 1 1.04167 412 B 0.70288\n", "C1S 1-Byte Fractions 31 32.2917 12.4 KB 21.7142\n", "C2 2-Byte Integers 63 65.625 44.3 KB 77.4464" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.61:54321 57.2 KB10450.032.096.0
mean 57.2 KB10450.032.096.0
min 57.2 KB10450.032.096.0
max 57.2 KB10450.032.096.0
stddev 0 B0.00.00.0
total 57.2 KB10450.032.096.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------- ---------------- ----------------------------- ------------------\n", "172.16.2.61:54321 57.2 KB 10450 32 96\n", "mean 57.2 KB 10450 32 96\n", "min 57.2 KB 10450 32 96\n", "max 57.2 KB 10450 32 96\n", "stddev 0 B 0 0 0\n", "total 57.2 KB 10450 32 96" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Days start station name bikes
type int enum int
mins 15979.0 0.0 1.0
mean 15994.4415311NaN 99.3025837321
maxs 16010.0 329.0 553.0
sigma 9.23370172444NaN 72.9721964301
zeros 0 32 0
missing0 0 0
0 15979.0 1 Ave & E 15 St 97.0
1 15979.0 1 Ave & E 18 St 75.0
2 15979.0 1 Ave & E 30 St 113.0
3 15979.0 10 Ave & W 28 St 74.0
4 15979.0 11 Ave & W 27 St 139.0
5 15979.0 11 Ave & W 41 St 60.0
6 15979.0 12 Ave & W 40 St 90.0
7 15979.0 2 Ave & E 31 St 88.0
8 15979.0 2 Ave & E 58 St 55.0
9 15979.0 3 Ave & Schermerhorn St8.0
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[10450, 3]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Now do a monster Group-By. Count bike starts per-station per-day. Presumably\n", "# on the full dataset this ends up with about 340 stations times 400 days\n", "# (140,000 rows); the data loaded for the run recorded here gives 10,450 rows.\n", "# This is what we want to predict.\n", "grouped = data.group_by([\"Days\",\"start station name\"])\n", "bpd = grouped.count().get_frame() # Compute bikes-per-day\n", "bpd.set_name(2,\"bikes\")\n", "bpd.show()\n", "bpd.describe()\n", "bpd.dim" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quantiles of bikes-per-day\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Probs bikesQuantiles
0.01 4.49
0.1 19
0.25 43
0.333 57
0.5 87
0.667 118
0.75 137
0.9 192
0.99 334.51
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Quantiles: the data is fairly unbalanced; some station/day combos are wildly\n", "# more popular than others.\n", "print(\"Quantiles of bikes-per-day\")\n", "bpd[\"bikes\"].quantile().show()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Bikes-Per-Day\n", "Rows:10,450 Cols:5\n", "\n", "Chunk compression summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers3320.625 2.6 KB3.6613781
CBSBits63.7500002 666 B0.9236658
C1N1-Byte Integers (w/o NAs)2716.875 10.4 KB14.803617
C1S1-Byte Fractions3119.375 12.4 KB17.65228
C22-Byte Integers6339.375 44.3 KB62.959057
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ ------- -----------------\n", "C0L Constant Integers 33 20.625 2.6 KB 3.66138\n", "CBS Bits 6 3.75 666 B 0.923666\n", "C1N 1-Byte Integers (w/o NAs) 27 16.875 10.4 KB 14.8036\n", "C1S 1-Byte Fractions 31 19.375 12.4 KB 17.6523\n", "C2 2-Byte Integers 63 39.375 44.3 KB 62.9591" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.61:54321 70.4 KB10450.032.0160.0
mean 70.4 KB10450.032.0160.0
min 70.4 KB10450.032.0160.0
max 70.4 KB10450.032.0160.0
stddev 0 B0.00.00.0
total 70.4 KB10450.032.0160.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------- ---------------- ----------------------------- ------------------\n", "172.16.2.61:54321 70.4 KB 10450 32 160\n", "mean 70.4 KB 10450 32 160\n", "min 70.4 KB 10450 32 160\n", "max 70.4 KB 10450 32 160\n", "stddev 0 B 0 0 0\n", "total 70.4 KB 10450 32 160" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Days start station name bikes Month DayOfWeek
type int enum int enum enum
mins 15979.0 0.0 1.0 0.0 0.0
mean 15994.4415311NaN 99.30258373210.968612440191NaN
maxs 16010.0 329.0 553.0 1.0 6.0
sigma 9.23370172444NaN 72.97219643010.174371128617NaN
zeros 0 32 0 328 1635
missing0 0 0 0 0
0 15979.0 1 Ave & E 15 St 97.0 9 Mon
1 15979.0 1 Ave & E 18 St 75.0 9 Mon
2 15979.0 1 Ave & E 30 St 113.0 9 Mon
3 15979.0 10 Ave & W 28 St 74.0 9 Mon
4 15979.0 11 Ave & W 27 St 139.0 9 Mon
5 15979.0 11 Ave & W 41 St 60.0 9 Mon
6 15979.0 12 Ave & W 40 St 90.0 9 Mon
7 15979.0 2 Ave & E 31 St 88.0 9 Mon
8 15979.0 2 Ave & E 58 St 55.0 9 Mon
9 15979.0 3 Ave & Schermerhorn St8.0 9 Mon
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# A little feature engineering\n", "# Add in month-of-year (seasonality; fewer bike rides in winter than summer)\n", "secs = bpd[\"Days\"]*secsPerDay\n", "bpd[\"Month\"] = secs.month().asfactor()\n", "# Add in day-of-week (work-week; more bike rides on Sunday than Monday)\n", "bpd[\"DayOfWeek\"] = secs.dayOfWeek()\n", "print(\"Bikes-Per-Day\")\n", "bpd.describe()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# ----------\n", "# 3- Fit a model on train; using test as validation\n", "\n",
"def _timed_train(model, x, train, test):\n",
"    \"\"\"Train `model` on `train` (validating on `test`) to predict 'bikes' from\n",
"    the columns `x`, and return the wall-clock seconds spent in train().\"\"\"\n",
"    start = time.time()\n",
"    model.train(x=x, y=\"bikes\", training_frame=train, validation_frame=test)\n",
"    return time.time() - start\n",
"\n",
"def _mse_row(label, model, elapsed, train, test, hold):\n",
"    \"\"\"Return one results-table row: label, MSE on train/test/holdout, and the\n",
"    training time rounded to three decimals.\"\"\"\n",
"    return [label,\n",
"            model.model_performance(train).mse(),\n",
"            model.model_performance(test ).mse(),\n",
"            model.model_performance(hold ).mse(),\n",
"            round(elapsed, 3)]\n",
"\n",
"# Function for doing classic test/train/holdout split\n",
"def split_fit_predict(data, seed=None):\n",
"    \"\"\"Split `data` 60/30/10 into train/test/holdout, fit GBM, DRF, GLM and DL\n",
"    models predicting the 'bikes' column, and display a table of their MSEs.\n",
"\n",
"    data -- H2OFrame with a 'Days' column (used to draw the random split) and a\n",
"            'bikes' response column; the remaining columns are the predictors.\n",
"    seed -- optional seed passed to runif() so the split can be reproduced; the\n",
"            default (None) keeps the previous behaviour of an unseeded split.\n",
"\n",
"    The fitted models are stored in the globals gbm0, drf0, glm0 and dl0.\n",
"    \"\"\"\n",
"    global gbm0,drf0,glm0,dl0\n",
"    # Classic Test/Train split\n",
"    r = data['Days'].runif(seed=seed)   # Random UNIForm numbers, one per row\n",
"    train = data[ r  < 0.6]\n",
"    test  = data[(0.6 <= r) & (r < 0.9)]\n",
"    hold  = data[ 0.9 <= r ]\n",
"    print(\"Training data has\",train.ncol,\"columns and\",train.nrow,\"rows, test has\",test.nrow,\"rows, holdout has\",hold.nrow)\n",
"\n",
"    # Build fresh predictor lists rather than calling remove() on the list\n",
"    # returned by data.names.\n",
"    bike_names_x = [name for name in data.names if name != \"bikes\"]\n",
"    # GLM is fit without the WC1 column, and DL reuses the same reduced list.\n",
"    # NOTE(review): confirm that DL is really meant to skip WC1 as well.\n",
"    bike_names_x_no_wc1 = [name for name in bike_names_x if name != \"WC1\"]\n",
"\n",
"    # Run GBM\n",
"    gbm0 = H2OGradientBoostingEstimator(ntrees=500, # 500 works well\n",
"                                        max_depth=6,\n",
"                                        learn_rate=0.1)\n",
"    gbm_elapsed = _timed_train(gbm0, bike_names_x, train, test)\n",
"\n",
"    # Run DRF\n",
"    drf0 = H2ORandomForestEstimator(ntrees=250, max_depth=30)\n",
"    drf_elapsed = _timed_train(drf0, bike_names_x, train, test)\n",
"\n",
"    # Run GLM\n",
"    glm0 = H2OGeneralizedLinearEstimator(Lambda=[1e-5], family=\"poisson\")\n",
"    glm_elapsed = _timed_train(glm0, bike_names_x_no_wc1, train, test)\n",
"\n",
"    # Run DL\n",
"    dl0 = H2ODeepLearningEstimator(hidden=[50,50,50,50], epochs=50)\n",
"    dl_elapsed = _timed_train(dl0, bike_names_x_no_wc1, train, test)\n",
"\n",
"    # ----------\n",
"    # 4- Score on holdout set & report\n",
"    # make a pretty HTML table printout of the results\n",
"    header = [\"Model\", \"mse TRAIN\", \"mse TEST\", \"mse HOLDOUT\", \"Model Training Time (s)\"]\n",
"    table  = [\n",
"        _mse_row(\"GBM\", gbm0, gbm_elapsed, train, test, hold),\n",
"        _mse_row(\"DRF\", drf0, drf_elapsed, train, test, hold),\n",
"        _mse_row(\"GLM\", glm0, glm_elapsed, train, test, hold),\n",
"        _mse_row(\"DL \", dl0,  dl_elapsed,  train, test, hold),\n",
"    ]\n",
"    h2o.display.H2ODisplay(table,header)\n",
"    # --------------" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data has 5 columns and 6172 rows, test has 3238 rows, holdout has 1040\n", "\n", "gbm Model Build Progress: [##################################################] 100%\n", "\n", "drf Model Build Progress: [##################################################] 100%\n", "\n", "glm Model Build Progress: [##################################################] 100%\n", "\n", "deeplearning Model Build Progress: [##################################################] 100%\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
ModelR2 TRAINR2 TESTR2 HOLDOUTModel Training Time (s)
GBM0.99769810.92748210.91832675.612
DRF0.82942740.76944960.76110635.607
GLM0.85972080.84654290.84479660.14
DL 0.95469430.91158800.89780016.845
" ], "text/plain": [ "Model R2 TRAIN R2 TEST R2 HOLDOUT Model Training Time (s)\n", "------- ---------- --------- ------------ -------------------------\n", "GBM 0.997698 0.927482 0.918327 5.612\n", "DRF 0.829427 0.76945 0.761106 5.607\n", "GLM 0.859721 0.846543 0.844797 0.14\n", "DL 0.954694 0.911588 0.8978 6.845" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Split the data (into test & train), fit some models and predict on the holdout data\n", "split_fit_predict(bpd)\n", "# The table recorded with this cell reports R^2 (split_fit_predict as defined\n", "# above reports MSE instead): holdout r^2 is about 0.92 for GBM and 0.84 for\n", "# GLM. The split is random, so the exact numbers change from run to run. This\n", "# means given just the station, the month, and the day-of-week we can predict\n", "# about 90% of the variance of the bike-trip-starts." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Parse Progress: [##################################################] 100%\n", "Rows:17,520 Cols:50\n", "\n", "Chunk compression summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers1076.294118 8.4 KB0.7889721
C0DConstant Reals43625.647058 34.1 KB3.2148771
CXISparse Integers171.0 1.5 KB0.1399135
C11-Byte Integers34620.352942 197.4 KB18.634672
C1N1-Byte Integers (w/o NAs)21412.588236 122.3 KB11.544063
C1S1-Byte Fractions21412.588236 125.3 KB11.822968
C2S2-Byte Fractions19611.529412 214.5 KB20.242111
C4S4-Byte Fractions17010.0 356.1 KB33.612423
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ -------- -----------------\n", "C0L Constant Integers 107 6.29412 8.4 KB 0.788972\n", "C0D Constant Reals 436 25.6471 34.1 KB 3.21488\n", "CXI Sparse Integers 17 1 1.5 KB 0.139914\n", "C1 1-Byte Integers 346 20.3529 197.4 KB 18.6347\n", "C1N 1-Byte Integers (w/o NAs) 214 12.5882 122.3 KB 11.5441\n", "C1S 1-Byte Fractions 214 12.5882 125.3 KB 11.823\n", "C2S 2-Byte Fractions 196 11.5294 214.5 KB 20.2421\n", "C4S 4-Byte Fractions 170 10 356.1 KB 33.6124" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.61:54321 1.0 MB17520.034.01700.0
mean 1.0 MB17520.034.01700.0
min 1.0 MB17520.034.01700.0
max 1.0 MB17520.034.01700.0
stddev 0 B0.00.00.0
total 1.0 MB17520.034.01700.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------ ---------------- ----------------------------- ------------------\n", "172.16.2.61:54321 1.0 MB 17520 34 1700\n", "mean 1.0 MB 17520 34 1700\n", "min 1.0 MB 17520 34 1700\n", "max 1.0 MB 17520 34 1700\n", "stddev 0 B 0 0 0\n", "total 1.0 MB 17520 34 1700" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Year Local Month Local Day Local Hour Local Year UTC Month UTC Day UTC Hour UTC Cavok Reported Cloud Ceiling (m) Cloud Cover Fraction Cloud Cover Fraction 1 Cloud Cover Fraction 2 Cloud Cover Fraction 3 Cloud Cover Fraction 4 Cloud Cover Fraction 5 Cloud Cover Fraction 6 Cloud Height (m) 1 Cloud Height (m) 2 Cloud Height (m) 3 Cloud Height (m) 4 Cloud Height (m) 5 Cloud Height (m) 6 Dew Point (C) Humidity Fraction Precipitation One Hour (mm) Pressure Altimeter (mbar) Pressure Sea Level (mbar) Pressure Station (mbar) Snow Depth (cm) Temperature (C) Visibility (km) Weather Code 1 Weather Code 1/ Description Weather Code 2 Weather Code 2/ Description Weather Code 3 Weather Code 3/ Description Weather Code 4 Weather Code 4/ Description Weather Code 5 Weather Code 5/ Description Weather Code 6 Weather Code 6/ Description Weather Code Most Severe / Icon Code Weather Code Most Severe Weather Code Most Severe / Description Wind Direction (degrees) Wind Gust (m/s) Wind Speed (m/s)
type int int int int int int int int int real real real real real int int int real real real int int int real real real real int int int real real int enum int enum int enum int enum int enum int enum int int enum int real real
mins 2013.0 1.0 1.0 0.0 2013.0 1.0 1.0 0.0 0.0 61.0 0.0 0.0 0.25 0.5 NaN NaN NaN 60.96 213.36 365.76 NaN NaN NaN -26.7 0.1251 0.0 983.2949 NaN NaN NaN -15.6 0.001 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 3.0 0.0 0.0 1.0 0.0 10.0 7.2 0.0
mean 2013.5 6.5260273972615.720547945211.5 2013.50057078 6.5251141552515.721347032 11.50011415530.0 1306.31195846 0.416742490522 0.361207349081 0.872445384073 0.963045685279 0.0 0.0 0.0 1293.9822682 1643.73900166 2084.89386376 0.0 0.0 0.0 4.31304646766 0.596736389159 1.37993010753 1017.82581441 0.0 0.0 0.0 12.5789090701 14.3914429682 4.84251968504 NaN 3.65867689358 NaN 2.84660766962 NaN 2.01149425287 NaN 4.125 NaN 3.0 0.0 1.37848173516 4.84251968504 NaN 194.69525682 9.42216948073 2.41032887849
maxs 2014.0 12.0 31.0 23.0 2015.0 12.0 31.0 23.0 0.0 3657.6 1.0 1.0 1.0 1.0 NaN NaN NaN 3657.5999 3657.5999 3657.5999 NaN NaN NaN 24.4 1.0 26.924 1042.2113 NaN NaN NaN 36.1 16.0934 60.0 11.0 60.0 10.0 36.0 7.0 27.0 4.0 27.0 2.0 3.0 0.0 16.0 60.0 11.0 360.0 20.58 10.8
sigma 0.5000142700173.447949723858.796498048526.922384111880.5005844117163.447824054588.795614888686.922301652030.0 995.339856966 0.462720830993 0.42770569708 0.197155690367 0.0861015598104 -0.0 -0.0 -0.0 962.743095854 916.73861349 887.215847511 -0.0 -0.0 -0.0 10.9731282097 0.185792011866 2.56215129179 7.46451697179 -0.0 -0.0 -0.0 10.0396739531 3.69893623033 5.70486576983 NaN 6.13386253912 NaN 5.80553286364 NaN 3.12340844261 NaN 6.15223536611 NaN 0.0 0.0 4.07386062702 5.70486576983 NaN 106.350000031 1.81511871115 1.61469790524
zeros 0 0 0 730 0 0 0 730 17455 0 8758 8758 0 0 -17520 -17520 -17520 0 0 0 -17520 -17520 -17520 268 0 501 0 -17520 -17520 -17520 269 0 0 17 0 30 0 13 -5044 -5024 -11241 -11229 -17030 -17028 14980 0 17 0 0 2768
missing0 0 0 0 0 0 0 0 65 10780 375 375 14682 16535 17520 17520 17520 9103 14683 16535 17520 17520 17520 67 67 15660 360 17520 17520 17520 67 412 14980 14980 16477 16477 17181 17181 17433 17433 17504 17504 17518 17518 0 14980 14980 9382 14381 1283
0 2013.0 1.0 1.0 0.0 2013.0 1.0 1.0 5.0 0.0 2895.6 1.0 0.9 1.0 nan nan nan nan 2895.5999 3352.8 nan nan nan nan -5.0 0.5447 nan 1013.0917 nan nan nan 3.3 16.0934 nan nan nan nan nan nan 0.0 nan nan nan 2.57
1 2013.0 1.0 1.0 1.0 2013.0 1.0 1.0 6.0 0.0 3048.0 1.0 1.0 nan nan nan nan nan 3048.0 nan nan nan nan nan -4.4 0.5463 nan 1012.0759 nan nan nan 3.9 16.0934 nan nan nan nan nan nan 0.0 nan 260.0 9.77 4.63
2 2013.0 1.0 1.0 2.0 2013.0 1.0 1.0 7.0 0.0 1828.8 1.0 1.0 nan nan nan nan nan 1828.7999 nan nan nan nan nan -3.3 0.619 nan 1012.4145 nan nan nan 3.3 16.0934 nan nan nan nan nan nan 0.0 nan nan 7.72 1.54
3 2013.0 1.0 1.0 3.0 2013.0 1.0 1.0 8.0 0.0 1463.0 1.0 1.0 nan nan nan nan nan 1463.04 nan nan nan nan nan -2.8 0.6159 nan 1012.4145 nan nan nan 3.9 16.0934 nan nan nan nan nan nan 0.0 nan nan nan 3.09
4 2013.0 1.0 1.0 4.0 2013.0 1.0 1.0 9.0 0.0 1402.1 1.0 1.0 nan nan nan nan nan 1402.08 nan nan nan nan nan -2.8 0.6159 nan 1012.7531 nan nan nan 3.9 16.0934 nan nan nan nan nan nan 0.0 nan 260.0 nan 4.12
5 2013.0 1.0 1.0 5.0 2013.0 1.0 1.0 10.0 0.0 1524.0 1.0 1.0 nan nan nan nan nan 1524.0 nan nan nan nan nan -2.8 0.6159 nan 1012.4145 nan nan nan 3.9 16.0934 nan nan nan nan nan nan 0.0 nan nan nan 3.09
6 2013.0 1.0 1.0 6.0 2013.0 1.0 1.0 11.0 0.0 1524.0 1.0 1.0 nan nan nan nan nan 1524.0 nan nan nan nan nan -3.3 0.5934 nan 1012.0759 nan nan nan 3.9 16.0934 nan nan nan nan nan nan 0.0 nan nan 9.26 3.09
7 2013.0 1.0 1.0 7.0 2013.0 1.0 1.0 12.0 0.0 1524.0 1.0 1.0 nan nan nan nan nan 1524.0 nan nan nan nan nan -3.3 0.5934 nan 1012.4145 nan nan nan 3.9 16.0934 nan nan nan nan nan nan 0.0 nan 260.0 9.26 4.63
8 2013.0 1.0 1.0 8.0 2013.0 1.0 1.0 13.0 0.0 1524.0 1.0 1.0 nan nan nan nan nan 1524.0 nan nan nan nan nan -2.8 0.6425 nan 1012.4145 nan nan nan 3.3 16.0934 nan nan nan nan nan nan 0.0 nan 260.0 nan 3.09
9 2013.0 1.0 1.0 9.0 2013.0 1.0 1.0 14.0 0.0 1524.0 1.0 0.9 1.0 nan nan nan nan 1524.0 3657.5999 nan nan nan nan -2.8 0.6159 nan 1012.4145 nan nan nan 3.9 16.0934 nan nan nan nan nan nan 0.0 nan nan 9.26 3.09
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# ----------\n", "# 5- Now lets add some weather\n", "# Load weather data\n", "wthr1 = h2o.import_file(path=[mylocate(\"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv\")])\n", "# Peek at the data\n", "wthr1.describe()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows:17,520 Cols:9\n", "\n", "Chunk compression summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers4615.0326805 3.6 KB1.780005
C11-Byte Integers3411.111112 19.4 KB9.592678
C1N1-Byte Integers (w/o NAs)9029.411766 51.5 KB25.494701
C1S1-Byte Fractions4213.725491 24.0 KB11.894592
C2S2-Byte Fractions9430.718956 103.4 KB51.238026
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ -------- -----------------\n", "C0L Constant Integers 46 15.0327 3.6 KB 1.78001\n", "C1 1-Byte Integers 34 11.1111 19.4 KB 9.59268\n", "C1N 1-Byte Integers (w/o NAs) 90 29.4118 51.5 KB 25.4947\n", "C1S 1-Byte Fractions 42 13.7255 24.0 KB 11.8946\n", "C2S 2-Byte Fractions 94 30.719 103.4 KB 51.238" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.61:54321 201.9 KB17520.034.0306.0
mean 201.9 KB17520.034.0306.0
min 201.9 KB17520.034.0306.0
max 201.9 KB17520.034.0306.0
stddev 0 B0.00.00.0
total 201.9 KB17520.034.0306.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- -------- ---------------- ----------------------------- ------------------\n", "172.16.2.61:54321 201.9 KB 17520 34 306\n", "mean 201.9 KB 17520 34 306\n", "min 201.9 KB 17520 34 306\n", "max 201.9 KB 17520 34 306\n", "stddev 0 B 0 0 0\n", "total 201.9 KB 17520 34 306" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Year Local Month Local Day Local Hour Local Dew Point (C) Humidity Fraction Rain (mm) Temperature (C) WC1
type int int int int real real real real enum
mins 2013.0 1.0 1.0 0.0 -26.7 0.1251 0.0 -15.6 0.0
mean 2013.5 6.5260273972615.720547945211.5 4.31304646766 0.596736389159 1.3799301075312.5789090701 NaN
maxs 2014.0 12.0 31.0 23.0 24.4 1.0 26.924 36.1 11.0
sigma 0.5000142700173.447949723858.796498048526.9223841118810.9731282097 0.185792011866 2.5621512917910.0396739531 NaN
zeros 0 0 0 730 268 0 501 269 17
missing0 0 0 0 67 67 15660 67 14980
0 2013.0 1.0 1.0 0.0 -5.0 0.5447 nan 3.3
1 2013.0 1.0 1.0 1.0 -4.4 0.5463 nan 3.9
2 2013.0 1.0 1.0 2.0 -3.3 0.619 nan 3.3
3 2013.0 1.0 1.0 3.0 -2.8 0.6159 nan 3.9
4 2013.0 1.0 1.0 4.0 -2.8 0.6159 nan 3.9
5 2013.0 1.0 1.0 5.0 -2.8 0.6159 nan 3.9
6 2013.0 1.0 1.0 6.0 -3.3 0.5934 nan 3.9
7 2013.0 1.0 1.0 7.0 -3.3 0.5934 nan 3.9
8 2013.0 1.0 1.0 8.0 -2.8 0.6425 nan 3.3
9 2013.0 1.0 1.0 9.0 -2.8 0.6159 nan 3.9
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Lots of columns in there! Lets plan on converting to time-since-epoch to do\n", "# a 'join' with the bike data, plus gather weather info that might affect\n", "# cyclists - rain, snow, temperature. Alas, drop the \"snow\" column since it's\n", "# all NA's. Also add in dew point and humidity just in case. Slice out just\n", "# the columns of interest and drop the rest.\n", "wthr2 = wthr1[[\"Year Local\",\"Month Local\",\"Day Local\",\"Hour Local\",\"Dew Point (C)\",\"Humidity Fraction\",\"Precipitation One Hour (mm)\",\"Temperature (C)\",\"Weather Code 1/ Description\"]]\n", "\n", "wthr2.set_name(wthr2.names.index(\"Precipitation One Hour (mm)\"), \"Rain (mm)\")\n", "wthr2.set_name(wthr2.names.index(\"Weather Code 1/ Description\"), \"WC1\")\n", "wthr2.describe()\n", "# Much better! " ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Filter down to the weather at Noon\n", "wthr3 = wthr2[ wthr2[\"Hour Local\"]==12 ]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows:730 Cols:11\n", "\n", "Chunk compression summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers8021.390373 6.3 KB12.498779
C0DConstant Reals133.4759357 1.0 KB2.0310516
C11-Byte Integers308.021391 2.6 KB5.2455816
C1N1-Byte Integers (w/o NAs)5614.973262 4.9 KB9.801778
C1S1-Byte Fractions349.090909 3.5 KB7.0032225
C2S2-Byte Fractions349.090909 4.2 KB8.4288645
CUDUnique Reals256.6844916 3.6 KB7.2297626
C8D64-bit Reals10227.272728 23.9 KB47.76096
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ ------- -----------------\n", "C0L Constant Integers 80 21.3904 6.3 KB 12.4988\n", "C0D Constant Reals 13 3.47594 1.0 KB 2.03105\n", "C1 1-Byte Integers 30 8.02139 2.6 KB 5.24558\n", "C1N 1-Byte Integers (w/o NAs) 56 14.9733 4.9 KB 9.80178\n", "C1S 1-Byte Fractions 34 9.09091 3.5 KB 7.00322\n", "C2S 2-Byte Fractions 34 9.09091 4.2 KB 8.42886\n", "CUD Unique Reals 25 6.68449 3.6 KB 7.22976\n", "C8D 64-bit Reals 102 27.2727 23.9 KB 47.761" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.61:54321 50.0 KB730.034.0374.0
mean 50.0 KB730.034.0374.0
min 50.0 KB730.034.0374.0
max 50.0 KB730.034.0374.0
stddev 0 B0.00.00.0
total 50.0 KB730.034.0374.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------- ---------------- ----------------------------- ------------------\n", "172.16.2.61:54321 50.0 KB 730 34 374\n", "mean 50.0 KB 730 34 374\n", "min 50.0 KB 730 34 374\n", "max 50.0 KB 730 34 374\n", "stddev 0 B 0 0 0\n", "total 50.0 KB 730 34 374" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Year Local Month Local Day Local Hour Local Dew Point (C) Humidity Fraction Rain (mm) Temperature (C) WC1 msec Days
type int int int int real real real real enum int int
mins 2013.0 1.0 1.0 12.0 -26.7 0.1723 0.0 -13.9 0.0 1.3570704e+12 15706.0
mean 2013.5 6.5260273972615.720547945212.0 4.23012379642 0.539728198074 1.5312571428614.0687757909 NaN 1.3885608526e+1216070.5
maxs 2014.0 12.0 31.0 12.0 23.3 1.0 12.446 34.4 10.0 1.420056e+12 16435.0
sigma 0.5003428180043.450215293078.802278027010.0 11.1062964725 0.179945027923 2.3606424861510.3989855149 NaN 18219740080.4 210.877136425
zeros 0 0 0 0 14 0 -174 7 -83 0 0
missing0 0 0 0 3 3 660 3 620 0 0
0 2013.0 1.0 1.0 12.0 -3.3 0.5934 nan 3.9 1.3570704e+12 15706.0
1 2013.0 1.0 2.0 12.0 -11.7 0.4806 nan -2.2 1.3571568e+12 15707.0
2 2013.0 1.0 3.0 12.0 -10.6 0.5248 nan -2.2 1.3572432e+12 15708.0
3 2013.0 1.0 4.0 12.0 -7.2 0.4976 nan 2.2 1.3573296e+12 15709.0
4 2013.0 1.0 5.0 12.0 -7.2 0.426 nan 4.4 1.357416e+12 15710.0
5 2013.0 1.0 6.0 12.0 -1.7 0.6451 nan 4.4 haze 1.3575024e+12 15711.0
6 2013.0 1.0 7.0 12.0 -6.1 0.4119 nan 6.1 1.3575888e+12 15712.0
7 2013.0 1.0 8.0 12.0 -1.7 0.5314 nan 7.2 1.3576752e+12 15713.0
8 2013.0 1.0 9.0 12.0 0.6 0.56 nan 8.9 haze 1.3577616e+12 15714.0
9 2013.0 1.0 10.0 12.0 -6.1 0.3952 nan 6.7 1.357848e+12 15715.0
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Let's now get Days since the epoch... we'll convert year/month/day into Epoch\n", "# time (in milliseconds), and then back to Epoch days. Need zero-based month\n", "# and days, but have 1-based. NOTE: despite its name, secsPerDay holds the\n", "# number of milliseconds per day (1000*60*60*24).\n", "wthr3[\"msec\"] = h2o.H2OFrame.mktime(year=wthr3[\"Year Local\"], month=wthr3[\"Month Local\"]-1, day=wthr3[\"Day Local\"]-1, hour=wthr3[\"Hour Local\"])\n", "secsPerDay=1000*60*60*24\n", "wthr3[\"Days\"] = (wthr3[\"msec\"]/secsPerDay).floor()\n", "wthr3.describe()\n", "# msec looks sane (numbers like 1.3e12 are in the correct range for msec since\n", "# 1970). Epoch Days matches closely with the epoch day numbers from the\n", "# CitiBike dataset. " ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Let's drop off the extra time columns to make an easy-to-handle dataset.\n", "wthr4 = wthr3.drop(\"Year Local\").drop(\"Month Local\").drop(\"Day Local\").drop(\"Hour Local\").drop(\"msec\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Also, most rain numbers are missing - let's assume those are zero rain days\n", "rain = wthr4[\"Rain (mm)\"]\n", "rain[ rain.isna() ] = 0\n", "wthr4[\"Rain (mm)\"] = rain" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Merge Daily Weather with Bikes-Per-Day\n", "Rows:10,450 Cols:10\n", "\n", "Chunk compression summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers6620.625 5.2 KB3.6253278
C0DConstant Reals3310.3125 2.6 KB1.8126639
CBSBits61.8750001 666 B0.4572857
C11-Byte Integers41.25 1.5 KB1.0821055
C1N1-Byte Integers (w/o NAs)288.75 10.8 KB7.599456
C1S1-Byte Fractions319.6875 12.4 KB8.739238
C22-Byte Integers6319.6875 44.3 KB31.169582
CUDUnique Reals8927.812498 64.7 KB45.514343
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ ------- -----------------\n", "C0L Constant Integers 66 20.625 5.2 KB 3.62533\n", "C0D Constant Reals 33 10.3125 2.6 KB 1.81266\n", "CBS Bits 6 1.875 666 B 0.457286\n", "C1 1-Byte Integers 4 1.25 1.5 KB 1.08211\n", "C1N 1-Byte Integers (w/o NAs) 28 8.75 10.8 KB 7.59946\n", "C1S 1-Byte Fractions 31 9.6875 12.4 KB 8.73924\n", "C2 2-Byte Integers 63 19.6875 44.3 KB 31.1696\n", "CUD Unique Reals 89 27.8125 64.7 KB 45.5143" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.61:54321 142.2 KB10450.032.0320.0
mean 142.2 KB10450.032.0320.0
min 142.2 KB10450.032.0320.0
max 142.2 KB10450.032.0320.0
stddev 0 B0.00.00.0
total 142.2 KB10450.032.0320.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- -------- ---------------- ----------------------------- ------------------\n", "172.16.2.61:54321 142.2 KB 10450 32 320\n", "mean 142.2 KB 10450 32 320\n", "min 142.2 KB 10450 32 320\n", "max 142.2 KB 10450 32 320\n", "stddev 0 B 0 0 0\n", "total 142.2 KB 10450 32 320" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Days start station name bikes Month DayOfWeek Humidity Fraction Rain (mm) Temperature (C) WC1 Dew Point (C)
type int enum int enum enum real int real enum real
mins 15979.0 0.0 1.0 0.0 0.0 0.3485 0.0 9.4 2.0 -2.2
mean 15994.4415311NaN 99.30258373210.968612440191NaN 0.562374191388 0.0 16.9630717703 NaN 7.77999043062
maxs 16010.0 329.0 553.0 1.0 6.0 0.8718 0.0 26.1 8.0 19.4
sigma 9.23370172444NaN 72.97219643010.174371128617NaN 0.149631413472 0.0 4.29746634617 NaN 6.49151146664
zeros 0 32 0 328 1635 0 10450 0 -84940
missing0 0 0 0 0 0 0 0 9134 0
0 15979.0 1 Ave & E 15 St 97.0 9 Mon 0.4315 0.0 23.9 10.6
1 15979.0 1 Ave & E 18 St 75.0 9 Mon 0.4315 0.0 23.9 10.6
2 15979.0 1 Ave & E 30 St 113.0 9 Mon 0.4315 0.0 23.9 10.6
3 15979.0 10 Ave & W 28 St 74.0 9 Mon 0.4315 0.0 23.9 10.6
4 15979.0 11 Ave & W 27 St 139.0 9 Mon 0.4315 0.0 23.9 10.6
5 15979.0 11 Ave & W 41 St 60.0 9 Mon 0.4315 0.0 23.9 10.6
6 15979.0 12 Ave & W 40 St 90.0 9 Mon 0.4315 0.0 23.9 10.6
7 15979.0 2 Ave & E 31 St 88.0 9 Mon 0.4315 0.0 23.9 10.6
8 15979.0 2 Ave & E 58 St 55.0 9 Mon 0.4315 0.0 23.9 10.6
9 15979.0 3 Ave & Schermerhorn St8.0 9 Mon 0.4315 0.0 23.9 10.6
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Daysstart station name bikes MonthDayOfWeek Humidity Fraction Rain (mm) Temperature (C)WC1 Dew Point (C)
159791 Ave & E 15 St 97 9Mon 0.4315 0 23.9 10.6
159791 Ave & E 18 St 75 9Mon 0.4315 0 23.9 10.6
159791 Ave & E 30 St 113 9Mon 0.4315 0 23.9 10.6
1597910 Ave & W 28 St 74 9Mon 0.4315 0 23.9 10.6
1597911 Ave & W 27 St 139 9Mon 0.4315 0 23.9 10.6
1597911 Ave & W 41 St 60 9Mon 0.4315 0 23.9 10.6
1597912 Ave & W 40 St 90 9Mon 0.4315 0 23.9 10.6
159792 Ave & E 31 St 88 9Mon 0.4315 0 23.9 10.6
159792 Ave & E 58 St 55 9Mon 0.4315 0 23.9 10.6
159793 Ave & Schermerhorn St 8 9Mon 0.4315 0 23.9 10.6
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# ----------\n", "# 6 - Join the weather data-per-day to the bike-starts-per-day\n", "print(\"Merge Daily Weather with Bikes-Per-Day\")\n", "bpd_with_weather = bpd.merge(wthr4,all_x=True,all_y=False)\n", "bpd_with_weather.describe()\n", "bpd_with_weather.show()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data has 10 columns and 6289 rows, test has 3080 rows, holdout has 1081\n", "\n", "gbm Model Build Progress: [##################################################] 100%\n", "\n", "drf Model Build Progress: [##################################################] 100%\n", "\n", "glm Model Build Progress: [##################################################] 100%\n", "\n", "deeplearning Model Build Progress: [##################################################] 100%\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
ModelR2 TRAINR2 TESTR2 HOLDOUTModel Training Time (s)
GBM0.99544100.92559620.92300516.706
DRF0.84911250.74302260.74428956.692
GLM0.86605650.84468010.86737050.139
DL 0.96178740.91177930.92134757.972
" ], "text/plain": [ "Model R2 TRAIN R2 TEST R2 HOLDOUT Model Training Time (s)\n", "------- ---------- --------- ------------ -------------------------\n", "GBM 0.995441 0.925596 0.923005 6.706\n", "DRF 0.849112 0.743023 0.744289 6.692\n", "GLM 0.866057 0.84468 0.867371 0.139\n", "DL 0.961787 0.911779 0.921347 7.972" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# 7 - Test/Train split again, model build again, this time with weather\n", "split_fit_predict(bpd_with_weather)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.11" } }, "nbformat": 4, "nbformat_minor": 0 }