{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"import h2o\n",
"import time\n",
"from h2o.estimators.glm import H2OGeneralizedLinearEstimator\n",
"from h2o.estimators.gbm import H2OGradientBoostingEstimator\n",
"from h2o.estimators.random_forest import H2ORandomForestEstimator\n",
"from h2o.estimators.deeplearning import H2ODeepLearningEstimator"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Connecting to H2O server at http://localhost:54321... successful!\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:90: DeprecationWarning: DisplayFormatter._ipython_display_formatter_default is deprecated: use @default decorator instead.\n",
" def _ipython_display_formatter_default(self):\n",
"/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:96: DeprecationWarning: DisplayFormatter._formatters_default is deprecated: use @default decorator instead.\n",
" def _formatters_default(self):\n",
"/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:675: DeprecationWarning: PlainTextFormatter._deferred_printers_default is deprecated: use @default decorator instead.\n",
" def _deferred_printers_default(self):\n",
"/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:667: DeprecationWarning: PlainTextFormatter._singleton_printers_default is deprecated: use @default decorator instead.\n",
" def _singleton_printers_default(self):\n",
"/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:670: DeprecationWarning: PlainTextFormatter._type_printers_default is deprecated: use @default decorator instead.\n",
" def _type_printers_default(self):\n",
"/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:667: DeprecationWarning: PlainTextFormatter._singleton_printers_default is deprecated: use @default decorator instead.\n",
" def _singleton_printers_default(self):\n",
"/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:670: DeprecationWarning: PlainTextFormatter._type_printers_default is deprecated: use @default decorator instead.\n",
" def _type_printers_default(self):\n",
"/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:675: DeprecationWarning: PlainTextFormatter._deferred_printers_default is deprecated: use @default decorator instead.\n",
" def _deferred_printers_default(self):\n"
]
},
{
"data": {
"text/html": [
"
| H2O cluster uptime: | \n",
"2 mins 28 secs |
\n",
"| H2O cluster version: | \n",
"3.9.1.99999 |
\n",
"| H2O cluster name: | \n",
"H2O_from_python_navdeepgill_t343ab |
\n",
"| H2O cluster total nodes: | \n",
"1 |
\n",
"| H2O cluster free memory: | \n",
"3.244 Gb |
\n",
"| H2O cluster total cores: | \n",
"8 |
\n",
"| H2O cluster allowed cores: | \n",
"8 |
\n",
"| H2O cluster is healthy: | \n",
"True |
\n",
"| H2O cluster is locked: | \n",
"True |
\n",
"| H2O connection url: | \n",
"http://localhost:54321 |
\n",
"| H2O connection proxy: | \n",
"None |
\n",
"| Python version: | \n",
"2.7.11 final |
"
],
"text/plain": [
"-------------------------- ----------------------------------\n",
"H2O cluster uptime: 2 mins 28 secs\n",
"H2O cluster version: 3.9.1.99999\n",
"H2O cluster name: H2O_from_python_navdeepgill_t343ab\n",
"H2O cluster total nodes: 1\n",
"H2O cluster free memory: 3.244 Gb\n",
"H2O cluster total cores: 8\n",
"H2O cluster allowed cores: 8\n",
"H2O cluster is healthy: True\n",
"H2O cluster is locked: True\n",
"H2O connection url: http://localhost:54321\n",
"H2O connection proxy:\n",
"Python version: 2.7.11 final\n",
"-------------------------- ----------------------------------"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Explore a typical Data Science workflow with H2O and Python\n",
"#\n",
"# Goal: assist the manager of CitiBike of NYC to load-balance the bicycles\n",
"# across the CitiBike network of stations, by predicting the number of bike\n",
"# trips taken from the station every day. Use 10 million rows of historical\n",
"# data, and eventually add weather data.\n",
"\n",
"\n",
"# Connect to a cluster\n",
"h2o.init()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.\n",
"\n",
"# Set this to True if you want to fetch the data directly from S3.\n",
"# This is useful if your cluster is running in EC2.\n",
"data_source_is_s3 = False\n",
"\n",
"def mylocate(s):\n",
" if data_source_is_s3:\n",
" return \"s3n://h2o-public-test-data/\" + s\n",
" else:\n",
" return _locate(s)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Import and Parse bike data\n",
"Warning: Method get_json in class H2OConnection is deprecated.\n"
]
},
{
"ename": "H2OResponseError",
"evalue": "Server error java.lang.IllegalArgumentException:\n Error: AWS Access Key ID and Secret Access Key must be specified as the username or password (respectively) of a s3n URL, or by setting the fs.s3n.awsAccessKeyId or fs.s3n.awsSecretAccessKey properties (respectively).\n Request: GET /3/ImportFiles\n params: {'path': 's3n://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-10.csv'}\n",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mH2OResponseError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0;31m# totals about 10 million rows\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;32mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Import and Parse bike data\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 25\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh2o\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mimport_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msmall_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/h2o.pyc\u001b[0m in \u001b[0;36mimport_file\u001b[0;34m(path, destination_frame, parse, header, sep, col_names, col_types, na_strings)\u001b[0m\n\u001b[1;32m 336\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 337\u001b[0m return H2OFrame()._import_parse(path, destination_frame, header, sep, col_names,\n\u001b[0;32m--> 338\u001b[0;31m col_types, na_strings)\n\u001b[0m\u001b[1;32m 339\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 340\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/frame.pyc\u001b[0m in \u001b[0;36m_import_parse\u001b[0;34m(self, path, destination_frame, header, separator, column_names, column_types, na_strings)\u001b[0m\n\u001b[1;32m 196\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_import_parse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdestination_frame\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseparator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_types\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mna_strings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 198\u001b[0;31m \u001b[0mrawkey\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh2o\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlazy_import\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 199\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_parse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrawkey\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mdestination_frame\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseparator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumn_types\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mna_strings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 200\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/h2o.pyc\u001b[0m in \u001b[0;36mlazy_import\u001b[0;34m(path)\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[0mA\u001b[0m \u001b[0mpath\u001b[0m \u001b[0mto\u001b[0m \u001b[0ma\u001b[0m \u001b[0mdata\u001b[0m \u001b[0mfile\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mremote\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mlocal\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 214\u001b[0m \"\"\"\n\u001b[0;32m--> 215\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0m_import\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mp\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mlist\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtuple\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0m_import\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 216\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 217\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/h2o.pyc\u001b[0m in \u001b[0;36m_import\u001b[0;34m(path)\u001b[0m\n\u001b[1;32m 217\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_import\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 219\u001b[0;31m \u001b[0mj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh2oconn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_suffix\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"ImportFiles\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 220\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'fails'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"ImportFiles of \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mpath\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\" failed on \"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mj\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'fails'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 221\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mj\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'destination_frames'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/utils/backward_compatibility.pyc\u001b[0m in \u001b[0;36m\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 64\u001b[0m self._bcin = {\n\u001b[1;32m 65\u001b[0m \u001b[0;31m# Creating lambdas in a loop, need to make sure that `fun` is bound to each lambda separately.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 66\u001b[0;31m \u001b[0mname\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mfun\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mfun\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfun\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 67\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfun\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mviewitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_bc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"im\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 68\u001b[0m }\n",
"\u001b[0;32m/Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/connection.pyc\u001b[0m in \u001b[0;36m\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 665\u001b[0m \u001b[0;34m\"post\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0m_deprecated_post\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 666\u001b[0m \u001b[0;34m\"delete\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0m_deprecated_delete\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 667\u001b[0;31m \u001b[0;34m\"get_json\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0m_deprecated_get\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 668\u001b[0m \u001b[0;34m\"post_json\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m:\u001b[0m 
\u001b[0m_deprecated_post\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 669\u001b[0m }\n",
"\u001b[0;32m/Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/connection.pyc\u001b[0m in \u001b[0;36m_deprecated_get\u001b[0;34m(self, url_suffix, **kwargs)\u001b[0m\n\u001b[1;32m 1155\u001b[0m \u001b[0mrestver\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"_rest_version\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m\"_rest_version\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1156\u001b[0m \u001b[0mendpoint\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"GET /%d/%s\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrestver\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl_suffix\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1157\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mendpoint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1159\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_deprecated_post\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl_suffix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/connection.pyc\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, endpoint, data, json, filename)\u001b[0m\n\u001b[1;32m 232\u001b[0m auth=self._auth, verify=self._verify_ssl_cert, proxies=self._proxies)\n\u001b[1;32m 233\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_log_end_transaction\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstart_time\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 234\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_process_response\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresp\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 235\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 236\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexceptions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mConnectionError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexceptions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mHTTPError\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/connection.pyc\u001b[0m in \u001b[0;36m_process_response\u001b[0;34m(response)\u001b[0m\n\u001b[1;32m 585\u001b[0m \u001b[0;31m# Client errors (400 = \"Bad Request\", 404 = \"Not Found\", 412 = \"Precondition Failed\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 586\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mstatus_code\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;36m400\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m404\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m412\u001b[0m\u001b[0;34m}\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mH2OErrorV3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mH2OModelBuilderErrorV3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 587\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mH2OResponseError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 588\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 589\u001b[0m \u001b[0;31m# Server errors (notably 500 = \"Server Error\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mH2OResponseError\u001b[0m: Server error java.lang.IllegalArgumentException:\n Error: AWS Access Key ID and Secret Access Key must be specified as the username or password (respectively) of a s3n URL, or by setting the fs.s3n.awsAccessKeyId or fs.s3n.awsSecretAccessKey properties (respectively).\n Request: GET /3/ImportFiles\n params: {'path': 's3n://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-10.csv'}\n"
]
}
],
"source": [
"# Pick either the big or the small demo.\n",
"# Big data is 10M rows\n",
"small_test = [mylocate(\"bigdata/laptop/citibike-nyc/2013-10.csv\")]\n",
"big_test = [mylocate(\"bigdata/laptop/citibike-nyc/2013-07.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2013-08.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2013-09.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2013-10.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2013-11.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2013-12.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2014-01.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2014-02.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2014-03.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2014-04.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2014-05.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2014-06.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2014-07.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2014-08.csv\")]\n",
"\n",
"# ----------\n",
"\n",
"# 1- Load data - 1 row per bicycle trip. Has columns showing the start and end\n",
"# station, trip duration and trip start time and day. The larger dataset\n",
"# totals about 10 million rows\n",
"print(\"Import and Parse bike data\")\n",
"data = h2o.import_file(path=small_test)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rows:1,037,712 Cols:16\n",
"\n",
"Chunk compression summary:\n"
]
},
{
"data": {
"text/html": [
"| chunk_type | \n",
"chunk_name | \n",
"count | \n",
"count_percentage | \n",
"size | \n",
"size_percentage |
\n",
"| C0L | \n",
"Constant Integers | \n",
"17 | \n",
"2.2135415 | \n",
" 1.3 KB | \n",
"0.0022872 |
\n",
"| C1 | \n",
"1-Byte Integers | \n",
"48 | \n",
"6.25 | \n",
" 1016.6 KB | \n",
"1.7506603 |
\n",
"| C1N | \n",
"1-Byte Integers (w/o NAs) | \n",
"48 | \n",
"6.25 | \n",
" 1016.6 KB | \n",
"1.7506603 |
\n",
"| C1S | \n",
"1-Byte Fractions | \n",
"79 | \n",
"10.286459 | \n",
" 1.6 MB | \n",
"2.8878725 |
\n",
"| C2 | \n",
"2-Byte Integers | \n",
"243 | \n",
"31.640625 | \n",
" 10.0 MB | \n",
"17.696283 |
\n",
"| C2S | \n",
"2-Byte Fractions | \n",
"49 | \n",
"6.3802085 | \n",
" 2.0 MB | \n",
"3.5701983 |
\n",
"| C4 | \n",
"4-Byte Integers | \n",
"32 | \n",
"4.166667 | \n",
" 2.6 MB | \n",
"4.6726856 |
\n",
"| C4S | \n",
"4-Byte Fractions | \n",
"39 | \n",
"5.078125 | \n",
" 3.2 MB | \n",
"5.6373096 |
\n",
"| C8 | \n",
"64-bit Integers | \n",
"60 | \n",
"7.8125 | \n",
" 9.9 MB | \n",
"17.432673 |
\n",
"| C8D | \n",
"64-bit Reals | \n",
"153 | \n",
"19.921875 | \n",
" 25.3 MB | \n",
"44.59937 |
"
],
"text/plain": [
"chunk_type chunk_name count count_percentage size size_percentage\n",
"------------ ------------------------- ------- ------------------ --------- -----------------\n",
"C0L Constant Integers 17 2.21354 1.3 KB 0.00228718\n",
"C1 1-Byte Integers 48 6.25 1016.6 KB 1.75066\n",
"C1N 1-Byte Integers (w/o NAs) 48 6.25 1016.6 KB 1.75066\n",
"C1S 1-Byte Fractions 79 10.2865 1.6 MB 2.88787\n",
"C2 2-Byte Integers 243 31.6406 10.0 MB 17.6963\n",
"C2S 2-Byte Fractions 49 6.38021 2.0 MB 3.5702\n",
"C4 4-Byte Integers 32 4.16667 2.6 MB 4.67269\n",
"C4S 4-Byte Fractions 39 5.07812 3.2 MB 5.63731\n",
"C8 64-bit Integers 60 7.8125 9.9 MB 17.4327\n",
"C8D 64-bit Reals 153 19.9219 25.3 MB 44.5994"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Frame distribution summary:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"size | \n",
"number_of_rows | \n",
"number_of_chunks_per_column | \n",
"number_of_chunks |
\n",
"| 172.16.2.61:54321 | \n",
" 56.7 MB | \n",
"1037712.0 | \n",
"48.0 | \n",
"768.0 |
\n",
"| mean | \n",
" 56.7 MB | \n",
"1037712.0 | \n",
"48.0 | \n",
"768.0 |
\n",
"| min | \n",
" 56.7 MB | \n",
"1037712.0 | \n",
"48.0 | \n",
"768.0 |
\n",
"| max | \n",
" 56.7 MB | \n",
"1037712.0 | \n",
"48.0 | \n",
"768.0 |
\n",
"| stddev | \n",
" 0 B | \n",
"0.0 | \n",
"0.0 | \n",
"0.0 |
\n",
"| total | \n",
" 56.7 MB | \n",
"1037712.0 | \n",
"48.0 | \n",
"768.0 |
"
],
"text/plain": [
" size number_of_rows number_of_chunks_per_column number_of_chunks\n",
"----------------- ------- ---------------- ----------------------------- ------------------\n",
"172.16.2.61:54321 56.7 MB 1.03771e+06 48 768\n",
"mean 56.7 MB 1.03771e+06 48 768\n",
"min 56.7 MB 1.03771e+06 48 768\n",
"max 56.7 MB 1.03771e+06 48 768\n",
"stddev 0 B 0 0 0\n",
"total 56.7 MB 1.03771e+06 48 768"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"| | tripduration | starttime | stoptime | start station id | start station name | start station latitude | start station longitude | end station id | end station name | end station latitude | end station longitude | bikeid | usertype | birth year | gender | Days |
\n",
"| type | int | time | time | int | enum | real | real | int | enum | real | real | int | enum | int | int | int |
\n",
"| mins | 60.0 | 1.380610868e+12 | 1.380611083e+12 | 72.0 | 0.0 | 40.680342423 | -74.01713445 | 72.0 | 0.0 | 40.680342423 | -74.01713445 | 14529.0 | 0.0 | 1899.0 | 0.0 | 15979.0 |
\n",
"| mean | 825.614754383 | 1.38191371692e+12 | 1.38191454253e+12 | 443.714212614 | NaN | 40.7345188586 | -73.9911328848 | 443.207421712 | NaN | 40.7342847885 | -73.9912702982 | 17644.0716451 | 0.906095332809 | 1975.77839486 | 1.12375591686 | 15993.8523906 |
\n",
"| maxs | 1259480.0 | 1.383289197e+12 | 1.38341851e+12 | 3002.0 | 329.0 | 40.770513 | -73.9500479759 | 3002.0 | 329.0 | 40.770513 | -73.9500479759 | 20757.0 | 1.0 | 1997.0 | 2.0 | 16010.0 |
\n",
"| sigma | 2000.3732323 | 778871729.132 | 778847387.503 | 354.434325075 | NaN | 0.0195734073053 | 0.0123161234106 | 357.398217058 | NaN | 0.0195578458116 | 0.0123855811965 | 1717.68112134 | 0.291696182123 | 11.1314906238 | 0.544380593291 | 9.02215033588 |
\n",
"| zeros | 0 | 0 | 0 | 0 | 5239 | 0 | 0 | 0 | 5449 | 0 | 0 | 0 | 97446 | 0 | 97498 | 0 |
\n",
"| missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 97445 | 0 | 0 |
\n",
"| 0 | 326.0 | 1.380610868e+12 | 1.380611194e+12 | 239.0 | Willoughby St & Fleet St | 40.69196566 | -73.9813018 | 366.0 | Clinton Ave & Myrtle Ave | 40.693261 | -73.968896 | 16052.0 | Subscriber | 1982.0 | 1.0 | 15979.0 |
\n",
"| 1 | 729.0 | 1.380610881e+12 | 1.38061161e+12 | 322.0 | Clinton St & Tillary St | 40.696192 | -73.991218 | 398.0 | Atlantic Ave & Furman St | 40.69165183 | -73.9999786 | 19412.0 | Customer | nan | 0.0 | 15979.0 |
\n",
"| 2 | 520.0 | 1.380610884e+12 | 1.380611404e+12 | 174.0 | E 25 St & 1 Ave | 40.7381765 | -73.97738662 | 403.0 | E 2 St & 2 Ave | 40.72502876 | -73.99069656 | 19645.0 | Subscriber | 1984.0 | 1.0 | 15979.0 |
\n",
"| 3 | 281.0 | 1.380610885e+12 | 1.380611166e+12 | 430.0 | York St & Jay St | 40.7014851 | -73.98656928 | 323.0 | Lawrence St & Willoughby St | 40.69236178 | -73.98631746 | 16992.0 | Subscriber | 1985.0 | 1.0 | 15979.0 |
\n",
"| 4 | 196.0 | 1.380610887e+12 | 1.380611083e+12 | 403.0 | E 2 St & 2 Ave | 40.72502876 | -73.99069656 | 401.0 | Allen St & Rivington St | 40.72019576 | -73.98997825 | 15690.0 | Subscriber | 1986.0 | 1.0 | 15979.0 |
\n",
"| 5 | 1948.0 | 1.380610908e+12 | 1.380612856e+12 | 369.0 | Washington Pl & 6 Ave | 40.73224119 | -74.00026394 | 307.0 | Canal St & Rutgers St | 40.71427487 | -73.98990025 | 19846.0 | Subscriber | 1977.0 | 1.0 | 15979.0 |
\n",
"| 6 | 1327.0 | 1.380610908e+12 | 1.380612235e+12 | 254.0 | W 11 St & 6 Ave | 40.73532427 | -73.99800419 | 539.0 | Metropolitan Ave & Bedford Ave | 40.71534825 | -73.96024116 | 14563.0 | Subscriber | 1986.0 | 2.0 | 15979.0 |
\n",
"| 7 | 1146.0 | 1.380610917e+12 | 1.380612063e+12 | 490.0 | 8 Ave & W 33 St | 40.751551 | -73.993934 | 438.0 | St Marks Pl & 1 Ave | 40.72779126 | -73.98564945 | 16793.0 | Subscriber | 1959.0 | 1.0 | 15979.0 |
\n",
"| 8 | 380.0 | 1.380610918e+12 | 1.380611298e+12 | 468.0 | Broadway & W 55 St | 40.7652654 | -73.98192338 | 385.0 | E 55 St & 2 Ave | 40.75797322 | -73.96603308 | 16600.0 | Customer | nan | 0.0 | 15979.0 |
\n",
"| 9 | 682.0 | 1.380610925e+12 | 1.380611607e+12 | 300.0 | Shevchenko Pl & E 6 St | 40.728145 | -73.990214 | 519.0 | Pershing Square N | 40.75188406 | -73.97770164 | 15204.0 | Subscriber | 1992.0 | 1.0 | 15979.0 |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# ----------\n",
"\n",
"# 2- light data munging: group the bike starts per-day, converting the 10M rows\n",
"# of trips to about 140,000 station&day combos - predicting the number of trip\n",
"# starts per-station-per-day.\n",
"\n",
"# Convert start time to: Day since the Epoch\n",
"startime = data[\"starttime\"]\n",
"secsPerDay=1000*60*60*24\n",
"data[\"Days\"] = (startime/secsPerDay).floor()\n",
"data.describe()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"| Days | start station name | bikes |
\n",
"| 15979 | 1 Ave & E 15 St | 97 |
\n",
"| 15979 | 1 Ave & E 18 St | 75 |
\n",
"| 15979 | 1 Ave & E 30 St | 113 |
\n",
"| 15979 | 10 Ave & W 28 St | 74 |
\n",
"| 15979 | 11 Ave & W 27 St | 139 |
\n",
"| 15979 | 11 Ave & W 41 St | 60 |
\n",
"| 15979 | 12 Ave & W 40 St | 90 |
\n",
"| 15979 | 2 Ave & E 31 St | 88 |
\n",
"| 15979 | 2 Ave & E 58 St | 55 |
\n",
"| 15979 | 3 Ave & Schermerhorn St | 8 |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rows:10,450 Cols:3\n",
"\n",
"Chunk compression summary:\n"
]
},
{
"data": {
"text/html": [
"| chunk_type | \n",
"chunk_name | \n",
"count | \n",
"count_percentage | \n",
"size | \n",
"size_percentage |
\n",
"| C0L | \n",
"Constant Integers | \n",
"1 | \n",
"1.0416667 | \n",
" 80 B | \n",
"0.1364815 |
\n",
"| C1N | \n",
"1-Byte Integers (w/o NAs) | \n",
"1 | \n",
"1.0416667 | \n",
" 412 B | \n",
"0.7028798 |
\n",
"| C1S | \n",
"1-Byte Fractions | \n",
"31 | \n",
"32.291664 | \n",
" 12.4 KB | \n",
"21.714207 |
\n",
"| C2 | \n",
"2-Byte Integers | \n",
"63 | \n",
"65.625 | \n",
" 44.3 KB | \n",
"77.446434 |
"
],
"text/plain": [
"chunk_type chunk_name count count_percentage size size_percentage\n",
"------------ ------------------------- ------- ------------------ ------- -----------------\n",
"C0L Constant Integers 1 1.04167 80 B 0.136482\n",
"C1N 1-Byte Integers (w/o NAs) 1 1.04167 412 B 0.70288\n",
"C1S 1-Byte Fractions 31 32.2917 12.4 KB 21.7142\n",
"C2 2-Byte Integers 63 65.625 44.3 KB 77.4464"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Frame distribution summary:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"size | \n",
"number_of_rows | \n",
"number_of_chunks_per_column | \n",
"number_of_chunks |
\n",
"| 172.16.2.61:54321 | \n",
" 57.2 KB | \n",
"10450.0 | \n",
"32.0 | \n",
"96.0 |
\n",
"| mean | \n",
" 57.2 KB | \n",
"10450.0 | \n",
"32.0 | \n",
"96.0 |
\n",
"| min | \n",
" 57.2 KB | \n",
"10450.0 | \n",
"32.0 | \n",
"96.0 |
\n",
"| max | \n",
" 57.2 KB | \n",
"10450.0 | \n",
"32.0 | \n",
"96.0 |
\n",
"| stddev | \n",
" 0 B | \n",
"0.0 | \n",
"0.0 | \n",
"0.0 |
\n",
"| total | \n",
" 57.2 KB | \n",
"10450.0 | \n",
"32.0 | \n",
"96.0 |
"
],
"text/plain": [
" size number_of_rows number_of_chunks_per_column number_of_chunks\n",
"----------------- ------- ---------------- ----------------------------- ------------------\n",
"172.16.2.61:54321 57.2 KB 10450 32 96\n",
"mean 57.2 KB 10450 32 96\n",
"min 57.2 KB 10450 32 96\n",
"max 57.2 KB 10450 32 96\n",
"stddev 0 B 0 0 0\n",
"total 57.2 KB 10450 32 96"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"| | Days | start station name | bikes |
\n",
"| type | int | enum | int |
\n",
"| mins | 15979.0 | 0.0 | 1.0 |
\n",
"| mean | 15994.4415311 | NaN | 99.3025837321 |
\n",
"| maxs | 16010.0 | 329.0 | 553.0 |
\n",
"| sigma | 9.23370172444 | NaN | 72.9721964301 |
\n",
"| zeros | 0 | 32 | 0 |
\n",
"| missing | 0 | 0 | 0 |
\n",
"| 0 | 15979.0 | 1 Ave & E 15 St | 97.0 |
\n",
"| 1 | 15979.0 | 1 Ave & E 18 St | 75.0 |
\n",
"| 2 | 15979.0 | 1 Ave & E 30 St | 113.0 |
\n",
"| 3 | 15979.0 | 10 Ave & W 28 St | 74.0 |
\n",
"| 4 | 15979.0 | 11 Ave & W 27 St | 139.0 |
\n",
"| 5 | 15979.0 | 11 Ave & W 41 St | 60.0 |
\n",
"| 6 | 15979.0 | 12 Ave & W 40 St | 90.0 |
\n",
"| 7 | 15979.0 | 2 Ave & E 31 St | 88.0 |
\n",
"| 8 | 15979.0 | 2 Ave & E 58 St | 55.0 |
\n",
"| 9 | 15979.0 | 3 Ave & Schermerhorn St | 8.0 |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[10450, 3]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Now do a monster Group-By. Count bike starts per-station per-day. Ends up\n",
"# with about 340 stations times 400 days (140,000 rows). This is what we want\n",
"# to predict.\n",
"grouped = data.group_by([\"Days\",\"start station name\"])\n",
"bpd = grouped.count().get_frame() # Compute bikes-per-day\n",
"bpd.set_name(2,\"bikes\")\n",
"bpd.show()\n",
"bpd.describe()\n",
"bpd.dim"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quantiles of bikes-per-day\n"
]
},
{
"data": {
"text/html": [
"\n",
"| Probs | bikesQuantiles |
\n",
"| 0.01 | 4.49 |
\n",
"| 0.1 | 19 |
\n",
"| 0.25 | 43 |
\n",
"| 0.333 | 57 |
\n",
"| 0.5 | 87 |
\n",
"| 0.667 | 118 |
\n",
"| 0.75 | 137 |
\n",
"| 0.9 | 192 |
\n",
"| 0.99 | 334.51 |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Quantiles: the data is fairly unbalanced; some station/day combos are wildly\n",
"# more popular than others.\n",
"print(\"Quantiles of bikes-per-day\")\n",
"bpd[\"bikes\"].quantile().show()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Bikes-Per-Day\n",
"Rows:10,450 Cols:5\n",
"\n",
"Chunk compression summary:\n"
]
},
{
"data": {
"text/html": [
"| chunk_type | \n",
"chunk_name | \n",
"count | \n",
"count_percentage | \n",
"size | \n",
"size_percentage |
\n",
"| C0L | \n",
"Constant Integers | \n",
"33 | \n",
"20.625 | \n",
" 2.6 KB | \n",
"3.6613781 |
\n",
"| CBS | \n",
"Bits | \n",
"6 | \n",
"3.7500002 | \n",
" 666 B | \n",
"0.9236658 |
\n",
"| C1N | \n",
"1-Byte Integers (w/o NAs) | \n",
"27 | \n",
"16.875 | \n",
" 10.4 KB | \n",
"14.803617 |
\n",
"| C1S | \n",
"1-Byte Fractions | \n",
"31 | \n",
"19.375 | \n",
" 12.4 KB | \n",
"17.65228 |
\n",
"| C2 | \n",
"2-Byte Integers | \n",
"63 | \n",
"39.375 | \n",
" 44.3 KB | \n",
"62.959057 |
"
],
"text/plain": [
"chunk_type chunk_name count count_percentage size size_percentage\n",
"------------ ------------------------- ------- ------------------ ------- -----------------\n",
"C0L Constant Integers 33 20.625 2.6 KB 3.66138\n",
"CBS Bits 6 3.75 666 B 0.923666\n",
"C1N 1-Byte Integers (w/o NAs) 27 16.875 10.4 KB 14.8036\n",
"C1S 1-Byte Fractions 31 19.375 12.4 KB 17.6523\n",
"C2 2-Byte Integers 63 39.375 44.3 KB 62.9591"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Frame distribution summary:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"size | \n",
"number_of_rows | \n",
"number_of_chunks_per_column | \n",
"number_of_chunks |
\n",
"| 172.16.2.61:54321 | \n",
" 70.4 KB | \n",
"10450.0 | \n",
"32.0 | \n",
"160.0 |
\n",
"| mean | \n",
" 70.4 KB | \n",
"10450.0 | \n",
"32.0 | \n",
"160.0 |
\n",
"| min | \n",
" 70.4 KB | \n",
"10450.0 | \n",
"32.0 | \n",
"160.0 |
\n",
"| max | \n",
" 70.4 KB | \n",
"10450.0 | \n",
"32.0 | \n",
"160.0 |
\n",
"| stddev | \n",
" 0 B | \n",
"0.0 | \n",
"0.0 | \n",
"0.0 |
\n",
"| total | \n",
" 70.4 KB | \n",
"10450.0 | \n",
"32.0 | \n",
"160.0 |
"
],
"text/plain": [
" size number_of_rows number_of_chunks_per_column number_of_chunks\n",
"----------------- ------- ---------------- ----------------------------- ------------------\n",
"172.16.2.61:54321 70.4 KB 10450 32 160\n",
"mean 70.4 KB 10450 32 160\n",
"min 70.4 KB 10450 32 160\n",
"max 70.4 KB 10450 32 160\n",
"stddev 0 B 0 0 0\n",
"total 70.4 KB 10450 32 160"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"| | Days | start station name | bikes | Month | DayOfWeek |
\n",
"| type | int | enum | int | enum | enum |
\n",
"| mins | 15979.0 | 0.0 | 1.0 | 0.0 | 0.0 |
\n",
"| mean | 15994.4415311 | NaN | 99.3025837321 | 0.968612440191 | NaN |
\n",
"| maxs | 16010.0 | 329.0 | 553.0 | 1.0 | 6.0 |
\n",
"| sigma | 9.23370172444 | NaN | 72.9721964301 | 0.174371128617 | NaN |
\n",
"| zeros | 0 | 32 | 0 | 328 | 1635 |
\n",
"| missing | 0 | 0 | 0 | 0 | 0 |
\n",
"| 0 | 15979.0 | 1 Ave & E 15 St | 97.0 | 9 | Mon |
\n",
"| 1 | 15979.0 | 1 Ave & E 18 St | 75.0 | 9 | Mon |
\n",
"| 2 | 15979.0 | 1 Ave & E 30 St | 113.0 | 9 | Mon |
\n",
"| 3 | 15979.0 | 10 Ave & W 28 St | 74.0 | 9 | Mon |
\n",
"| 4 | 15979.0 | 11 Ave & W 27 St | 139.0 | 9 | Mon |
\n",
"| 5 | 15979.0 | 11 Ave & W 41 St | 60.0 | 9 | Mon |
\n",
"| 6 | 15979.0 | 12 Ave & W 40 St | 90.0 | 9 | Mon |
\n",
"| 7 | 15979.0 | 2 Ave & E 31 St | 88.0 | 9 | Mon |
\n",
"| 8 | 15979.0 | 2 Ave & E 58 St | 55.0 | 9 | Mon |
\n",
"| 9 | 15979.0 | 3 Ave & Schermerhorn St | 8.0 | 9 | Mon |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# A little feature engineering\n",
"# Add in month-of-year (seasonality; fewer bike rides in winter than summer)\n",
"secs = bpd[\"Days\"]*secsPerDay\n",
"bpd[\"Month\"] = secs.month().asfactor()\n",
"# Add in day-of-week (work-week; more bike rides on Sunday than Monday)\n",
"bpd[\"DayOfWeek\"] = secs.dayOfWeek()\n",
"print(\"Bikes-Per-Day\")\n",
"bpd.describe()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# ----------\n",
"# 3- Fit a model on train; using test as validation\n",
"\n",
"# Function for doing class test/train/holdout split\n",
"def split_fit_predict(data):\n",
" global gbm0,drf0,glm0,dl0\n",
" # Classic Test/Train split\n",
" r = data['Days'].runif() # Random UNIForm numbers, one per row\n",
" train = data[ r < 0.6]\n",
" test = data[(0.6 <= r) & (r < 0.9)]\n",
" hold = data[ 0.9 <= r ]\n",
" print(\"Training data has\",train.ncol,\"columns and\",train.nrow,\"rows, test has\",test.nrow,\"rows, holdout has\",hold.nrow)\n",
" bike_names_x = data.names\n",
" bike_names_x.remove(\"bikes\")\n",
" \n",
" # Run GBM\n",
" s = time.time()\n",
" \n",
" gbm0 = H2OGradientBoostingEstimator(ntrees=500, # 500 works well\n",
" max_depth=6,\n",
" learn_rate=0.1)\n",
" \n",
"\n",
" gbm0.train(x =bike_names_x,\n",
" y =\"bikes\",\n",
" training_frame =train,\n",
" validation_frame=test)\n",
"\n",
" gbm_elapsed = time.time() - s\n",
"\n",
" # Run DRF\n",
" s = time.time()\n",
" \n",
" drf0 = H2ORandomForestEstimator(ntrees=250, max_depth=30)\n",
"\n",
" drf0.train(x =bike_names_x,\n",
" y =\"bikes\",\n",
" training_frame =train,\n",
" validation_frame=test)\n",
" \n",
" drf_elapsed = time.time() - s \n",
" \n",
" \n",
" # Run GLM\n",
" if \"WC1\" in bike_names_x: bike_names_x.remove(\"WC1\")\n",
" s = time.time()\n",
"\n",
" glm0 = H2OGeneralizedLinearEstimator(Lambda=[1e-5], family=\"poisson\")\n",
" \n",
" glm0.train(x =bike_names_x,\n",
" y =\"bikes\",\n",
" training_frame =train,\n",
" validation_frame=test)\n",
"\n",
" glm_elapsed = time.time() - s\n",
" \n",
" # Run DL\n",
" s = time.time()\n",
"\n",
" dl0 = H2ODeepLearningEstimator(hidden=[50,50,50,50], epochs=50)\n",
" \n",
" dl0.train(x =bike_names_x,\n",
" y =\"bikes\",\n",
" training_frame =train,\n",
" validation_frame=test)\n",
" \n",
" dl_elapsed = time.time() - s\n",
" \n",
" # ----------\n",
" # 4- Score on holdout set & report\n",
" train_mse_gbm = gbm0.model_performance(train).mse()\n",
" test_mse_gbm = gbm0.model_performance(test ).mse()\n",
" hold_mse_gbm = gbm0.model_performance(hold ).mse()\n",
"# print \"GBM mse TRAIN=\",train_mse_gbm,\", mse TEST=\",test_mse_gbm,\", mse HOLDOUT=\",hold_mse_gbm\n",
" \n",
" train_mse_drf = drf0.model_performance(train).mse()\n",
" test_mse_drf = drf0.model_performance(test ).mse()\n",
" hold_mse_drf = drf0.model_performance(hold ).mse()\n",
"# print \"DRF mse TRAIN=\",train_mse_drf,\", mse TEST=\",test_mse_drf,\", mse HOLDOUT=\",hold_mse_drf\n",
" \n",
" train_mse_glm = glm0.model_performance(train).mse()\n",
" test_mse_glm = glm0.model_performance(test ).mse()\n",
" hold_mse_glm = glm0.model_performance(hold ).mse()\n",
"# print \"GLM mse TRAIN=\",train_mse_glm,\", mse TEST=\",test_mse_glm,\", mse HOLDOUT=\",hold_mse_glm\n",
" \n",
" train_mse_dl = dl0.model_performance(train).mse()\n",
" test_mse_dl = dl0.model_performance(test ).mse()\n",
" hold_mse_dl = dl0.model_performance(hold ).mse()\n",
"# print \" DL mse TRAIN=\",train_mse_dl,\", mse TEST=\",test_mse_dl,\", mse HOLDOUT=\",hold_mse_dl\n",
" \n",
" # make a pretty HTML table printout of the results\n",
"\n",
" header = [\"Model\", \"mse TRAIN\", \"mse TEST\", \"mse HOLDOUT\", \"Model Training Time (s)\"]\n",
" table = [\n",
" [\"GBM\", train_mse_gbm, test_mse_gbm, hold_mse_gbm, round(gbm_elapsed,3)],\n",
" [\"DRF\", train_mse_drf, test_mse_drf, hold_mse_drf, round(drf_elapsed,3)],\n",
" [\"GLM\", train_mse_glm, test_mse_glm, hold_mse_glm, round(glm_elapsed,3)],\n",
" [\"DL \", train_mse_dl, test_mse_dl, hold_mse_dl , round(dl_elapsed,3) ],\n",
" ]\n",
" h2o.display.H2ODisplay(table,header)\n",
" # --------------"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training data has 5 columns and 6172 rows, test has 3238 rows, holdout has 1040\n",
"\n",
"gbm Model Build Progress: [##################################################] 100%\n",
"\n",
"drf Model Build Progress: [##################################################] 100%\n",
"\n",
"glm Model Build Progress: [##################################################] 100%\n",
"\n",
"deeplearning Model Build Progress: [##################################################] 100%\n"
]
},
{
"data": {
"text/html": [
"| Model | \n",
"R2 TRAIN | \n",
"R2 TEST | \n",
"R2 HOLDOUT | \n",
"Model Training Time (s) |
\n",
"| GBM | \n",
"0.9976981 | \n",
"0.9274821 | \n",
"0.9183267 | \n",
"5.612 |
\n",
"| DRF | \n",
"0.8294274 | \n",
"0.7694496 | \n",
"0.7611063 | \n",
"5.607 |
\n",
"| GLM | \n",
"0.8597208 | \n",
"0.8465429 | \n",
"0.8447966 | \n",
"0.14 |
\n",
"| DL | \n",
"0.9546943 | \n",
"0.9115880 | \n",
"0.8978001 | \n",
"6.845 |
"
],
"text/plain": [
"Model R2 TRAIN R2 TEST R2 HOLDOUT Model Training Time (s)\n",
"------- ---------- --------- ------------ -------------------------\n",
"GBM 0.997698 0.927482 0.918327 5.612\n",
"DRF 0.829427 0.76945 0.761106 5.607\n",
"GLM 0.859721 0.846543 0.844797 0.14\n",
"DL 0.954694 0.911588 0.8978 6.845"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Split the data (into test & train), fit some models and predict on the holdout data\n",
"split_fit_predict(bpd)\n",
"# Here we see an r^2 of 0.91 for GBM, and 0.71 for GLM. This means given just\n",
"# the station, the month, and the day-of-week we can predict 90% of the\n",
"# variance of the bike-trip-starts."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Parse Progress: [##################################################] 100%\n",
"Rows:17,520 Cols:50\n",
"\n",
"Chunk compression summary:\n"
]
},
{
"data": {
"text/html": [
"| chunk_type | \n",
"chunk_name | \n",
"count | \n",
"count_percentage | \n",
"size | \n",
"size_percentage |
\n",
"| C0L | \n",
"Constant Integers | \n",
"107 | \n",
"6.294118 | \n",
" 8.4 KB | \n",
"0.7889721 |
\n",
"| C0D | \n",
"Constant Reals | \n",
"436 | \n",
"25.647058 | \n",
" 34.1 KB | \n",
"3.2148771 |
\n",
"| CXI | \n",
"Sparse Integers | \n",
"17 | \n",
"1.0 | \n",
" 1.5 KB | \n",
"0.1399135 |
\n",
"| C1 | \n",
"1-Byte Integers | \n",
"346 | \n",
"20.352942 | \n",
" 197.4 KB | \n",
"18.634672 |
\n",
"| C1N | \n",
"1-Byte Integers (w/o NAs) | \n",
"214 | \n",
"12.588236 | \n",
" 122.3 KB | \n",
"11.544063 |
\n",
"| C1S | \n",
"1-Byte Fractions | \n",
"214 | \n",
"12.588236 | \n",
" 125.3 KB | \n",
"11.822968 |
\n",
"| C2S | \n",
"2-Byte Fractions | \n",
"196 | \n",
"11.529412 | \n",
" 214.5 KB | \n",
"20.242111 |
\n",
"| C4S | \n",
"4-Byte Fractions | \n",
"170 | \n",
"10.0 | \n",
" 356.1 KB | \n",
"33.612423 |
"
],
"text/plain": [
"chunk_type chunk_name count count_percentage size size_percentage\n",
"------------ ------------------------- ------- ------------------ -------- -----------------\n",
"C0L Constant Integers 107 6.29412 8.4 KB 0.788972\n",
"C0D Constant Reals 436 25.6471 34.1 KB 3.21488\n",
"CXI Sparse Integers 17 1 1.5 KB 0.139914\n",
"C1 1-Byte Integers 346 20.3529 197.4 KB 18.6347\n",
"C1N 1-Byte Integers (w/o NAs) 214 12.5882 122.3 KB 11.5441\n",
"C1S 1-Byte Fractions 214 12.5882 125.3 KB 11.823\n",
"C2S 2-Byte Fractions 196 11.5294 214.5 KB 20.2421\n",
"C4S 4-Byte Fractions 170 10 356.1 KB 33.6124"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Frame distribution summary:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"size | \n",
"number_of_rows | \n",
"number_of_chunks_per_column | \n",
"number_of_chunks |
\n",
"| 172.16.2.61:54321 | \n",
" 1.0 MB | \n",
"17520.0 | \n",
"34.0 | \n",
"1700.0 |
\n",
"| mean | \n",
" 1.0 MB | \n",
"17520.0 | \n",
"34.0 | \n",
"1700.0 |
\n",
"| min | \n",
" 1.0 MB | \n",
"17520.0 | \n",
"34.0 | \n",
"1700.0 |
\n",
"| max | \n",
" 1.0 MB | \n",
"17520.0 | \n",
"34.0 | \n",
"1700.0 |
\n",
"| stddev | \n",
" 0 B | \n",
"0.0 | \n",
"0.0 | \n",
"0.0 |
\n",
"| total | \n",
" 1.0 MB | \n",
"17520.0 | \n",
"34.0 | \n",
"1700.0 |
"
],
"text/plain": [
" size number_of_rows number_of_chunks_per_column number_of_chunks\n",
"----------------- ------ ---------------- ----------------------------- ------------------\n",
"172.16.2.61:54321 1.0 MB 17520 34 1700\n",
"mean 1.0 MB 17520 34 1700\n",
"min 1.0 MB 17520 34 1700\n",
"max 1.0 MB 17520 34 1700\n",
"stddev 0 B 0 0 0\n",
"total 1.0 MB 17520 34 1700"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"| | Year Local | Month Local | Day Local | Hour Local | Year UTC | Month UTC | Day UTC | Hour UTC | Cavok Reported | Cloud Ceiling (m) | Cloud Cover Fraction | Cloud Cover Fraction 1 | Cloud Cover Fraction 2 | Cloud Cover Fraction 3 | Cloud Cover Fraction 4 | Cloud Cover Fraction 5 | Cloud Cover Fraction 6 | Cloud Height (m) 1 | Cloud Height (m) 2 | Cloud Height (m) 3 | Cloud Height (m) 4 | Cloud Height (m) 5 | Cloud Height (m) 6 | Dew Point (C) | Humidity Fraction | Precipitation One Hour (mm) | Pressure Altimeter (mbar) | Pressure Sea Level (mbar) | Pressure Station (mbar) | Snow Depth (cm) | Temperature (C) | Visibility (km) | Weather Code 1 | Weather Code 1/ Description | Weather Code 2 | Weather Code 2/ Description | Weather Code 3 | Weather Code 3/ Description | Weather Code 4 | Weather Code 4/ Description | Weather Code 5 | Weather Code 5/ Description | Weather Code 6 | Weather Code 6/ Description | Weather Code Most Severe / Icon Code | Weather Code Most Severe | Weather Code Most Severe / Description | Wind Direction (degrees) | Wind Gust (m/s) | Wind Speed (m/s) |
\n",
"| type | int | int | int | int | int | int | int | int | int | real | real | real | real | real | int | int | int | real | real | real | int | int | int | real | real | real | real | int | int | int | real | real | int | enum | int | enum | int | enum | int | enum | int | enum | int | enum | int | int | enum | int | real | real |
\n",
"| mins | 2013.0 | 1.0 | 1.0 | 0.0 | 2013.0 | 1.0 | 1.0 | 0.0 | 0.0 | 61.0 | 0.0 | 0.0 | 0.25 | 0.5 | NaN | NaN | NaN | 60.96 | 213.36 | 365.76 | NaN | NaN | NaN | -26.7 | 0.1251 | 0.0 | 983.2949 | NaN | NaN | NaN | -15.6 | 0.001 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 3.0 | 0.0 | 0.0 | 1.0 | 0.0 | 10.0 | 7.2 | 0.0 |
\n",
"| mean | 2013.5 | 6.52602739726 | 15.7205479452 | 11.5 | 2013.50057078 | 6.52511415525 | 15.721347032 | 11.5001141553 | 0.0 | 1306.31195846 | 0.416742490522 | 0.361207349081 | 0.872445384073 | 0.963045685279 | 0.0 | 0.0 | 0.0 | 1293.9822682 | 1643.73900166 | 2084.89386376 | 0.0 | 0.0 | 0.0 | 4.31304646766 | 0.596736389159 | 1.37993010753 | 1017.82581441 | 0.0 | 0.0 | 0.0 | 12.5789090701 | 14.3914429682 | 4.84251968504 | NaN | 3.65867689358 | NaN | 2.84660766962 | NaN | 2.01149425287 | NaN | 4.125 | NaN | 3.0 | 0.0 | 1.37848173516 | 4.84251968504 | NaN | 194.69525682 | 9.42216948073 | 2.41032887849 |
\n",
"| maxs | 2014.0 | 12.0 | 31.0 | 23.0 | 2015.0 | 12.0 | 31.0 | 23.0 | 0.0 | 3657.6 | 1.0 | 1.0 | 1.0 | 1.0 | NaN | NaN | NaN | 3657.5999 | 3657.5999 | 3657.5999 | NaN | NaN | NaN | 24.4 | 1.0 | 26.924 | 1042.2113 | NaN | NaN | NaN | 36.1 | 16.0934 | 60.0 | 11.0 | 60.0 | 10.0 | 36.0 | 7.0 | 27.0 | 4.0 | 27.0 | 2.0 | 3.0 | 0.0 | 16.0 | 60.0 | 11.0 | 360.0 | 20.58 | 10.8 |
\n",
"| sigma | 0.500014270017 | 3.44794972385 | 8.79649804852 | 6.92238411188 | 0.500584411716 | 3.44782405458 | 8.79561488868 | 6.92230165203 | 0.0 | 995.339856966 | 0.462720830993 | 0.42770569708 | 0.197155690367 | 0.0861015598104 | -0.0 | -0.0 | -0.0 | 962.743095854 | 916.73861349 | 887.215847511 | -0.0 | -0.0 | -0.0 | 10.9731282097 | 0.185792011866 | 2.56215129179 | 7.46451697179 | -0.0 | -0.0 | -0.0 | 10.0396739531 | 3.69893623033 | 5.70486576983 | NaN | 6.13386253912 | NaN | 5.80553286364 | NaN | 3.12340844261 | NaN | 6.15223536611 | NaN | 0.0 | 0.0 | 4.07386062702 | 5.70486576983 | NaN | 106.350000031 | 1.81511871115 | 1.61469790524 |
\n",
"| zeros | 0 | 0 | 0 | 730 | 0 | 0 | 0 | 730 | 17455 | 0 | 8758 | 8758 | 0 | 0 | -17520 | -17520 | -17520 | 0 | 0 | 0 | -17520 | -17520 | -17520 | 268 | 0 | 501 | 0 | -17520 | -17520 | -17520 | 269 | 0 | 0 | 17 | 0 | 30 | 0 | 13 | -5044 | -5024 | -11241 | -11229 | -17030 | -17028 | 14980 | 0 | 17 | 0 | 0 | 2768 |
\n",
"| missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 65 | 10780 | 375 | 375 | 14682 | 16535 | 17520 | 17520 | 17520 | 9103 | 14683 | 16535 | 17520 | 17520 | 17520 | 67 | 67 | 15660 | 360 | 17520 | 17520 | 17520 | 67 | 412 | 14980 | 14980 | 16477 | 16477 | 17181 | 17181 | 17433 | 17433 | 17504 | 17504 | 17518 | 17518 | 0 | 14980 | 14980 | 9382 | 14381 | 1283 |
\n",
"| 0 | 2013.0 | 1.0 | 1.0 | 0.0 | 2013.0 | 1.0 | 1.0 | 5.0 | 0.0 | 2895.6 | 1.0 | 0.9 | 1.0 | nan | nan | nan | nan | 2895.5999 | 3352.8 | nan | nan | nan | nan | -5.0 | 0.5447 | nan | 1013.0917 | nan | nan | nan | 3.3 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | nan | 2.57 |
\n",
"| 1 | 2013.0 | 1.0 | 1.0 | 1.0 | 2013.0 | 1.0 | 1.0 | 6.0 | 0.0 | 3048.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 3048.0 | nan | nan | nan | nan | nan | -4.4 | 0.5463 | nan | 1012.0759 | nan | nan | nan | 3.9 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | 260.0 | 9.77 | 4.63 |
\n",
"| 2 | 2013.0 | 1.0 | 1.0 | 2.0 | 2013.0 | 1.0 | 1.0 | 7.0 | 0.0 | 1828.8 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1828.7999 | nan | nan | nan | nan | nan | -3.3 | 0.619 | nan | 1012.4145 | nan | nan | nan | 3.3 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | 7.72 | 1.54 |
\n",
"| 3 | 2013.0 | 1.0 | 1.0 | 3.0 | 2013.0 | 1.0 | 1.0 | 8.0 | 0.0 | 1463.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1463.04 | nan | nan | nan | nan | nan | -2.8 | 0.6159 | nan | 1012.4145 | nan | nan | nan | 3.9 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | nan | 3.09 |
\n",
"| 4 | 2013.0 | 1.0 | 1.0 | 4.0 | 2013.0 | 1.0 | 1.0 | 9.0 | 0.0 | 1402.1 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1402.08 | nan | nan | nan | nan | nan | -2.8 | 0.6159 | nan | 1012.7531 | nan | nan | nan | 3.9 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | 260.0 | nan | 4.12 |
\n",
"| 5 | 2013.0 | 1.0 | 1.0 | 5.0 | 2013.0 | 1.0 | 1.0 | 10.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -2.8 | 0.6159 | nan | 1012.4145 | nan | nan | nan | 3.9 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | nan | 3.09 |
\n",
"| 6 | 2013.0 | 1.0 | 1.0 | 6.0 | 2013.0 | 1.0 | 1.0 | 11.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -3.3 | 0.5934 | nan | 1012.0759 | nan | nan | nan | 3.9 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | 9.26 | 3.09 |
\n",
"| 7 | 2013.0 | 1.0 | 1.0 | 7.0 | 2013.0 | 1.0 | 1.0 | 12.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -3.3 | 0.5934 | nan | 1012.4145 | nan | nan | nan | 3.9 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | 260.0 | 9.26 | 4.63 |
\n",
"| 8 | 2013.0 | 1.0 | 1.0 | 8.0 | 2013.0 | 1.0 | 1.0 | 13.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -2.8 | 0.6425 | nan | 1012.4145 | nan | nan | nan | 3.3 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | 260.0 | nan | 3.09 |
\n",
"| 9 | 2013.0 | 1.0 | 1.0 | 9.0 | 2013.0 | 1.0 | 1.0 | 14.0 | 0.0 | 1524.0 | 1.0 | 0.9 | 1.0 | nan | nan | nan | nan | 1524.0 | 3657.5999 | nan | nan | nan | nan | -2.8 | 0.6159 | nan | 1012.4145 | nan | nan | nan | 3.9 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | 9.26 | 3.09 |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# ----------\n",
"# 5- Now lets add some weather\n",
"# Load weather data\n",
"wthr1 = h2o.import_file(path=[mylocate(\"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv\")])\n",
"# Peek at the data\n",
"wthr1.describe()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rows:17,520 Cols:9\n",
"\n",
"Chunk compression summary:\n"
]
},
{
"data": {
"text/html": [
"| chunk_type | \n",
"chunk_name | \n",
"count | \n",
"count_percentage | \n",
"size | \n",
"size_percentage |
\n",
"| C0L | \n",
"Constant Integers | \n",
"46 | \n",
"15.0326805 | \n",
" 3.6 KB | \n",
"1.780005 |
\n",
"| C1 | \n",
"1-Byte Integers | \n",
"34 | \n",
"11.111112 | \n",
" 19.4 KB | \n",
"9.592678 |
\n",
"| C1N | \n",
"1-Byte Integers (w/o NAs) | \n",
"90 | \n",
"29.411766 | \n",
" 51.5 KB | \n",
"25.494701 |
\n",
"| C1S | \n",
"1-Byte Fractions | \n",
"42 | \n",
"13.725491 | \n",
" 24.0 KB | \n",
"11.894592 |
\n",
"| C2S | \n",
"2-Byte Fractions | \n",
"94 | \n",
"30.718956 | \n",
" 103.4 KB | \n",
"51.238026 |
"
],
"text/plain": [
"chunk_type chunk_name count count_percentage size size_percentage\n",
"------------ ------------------------- ------- ------------------ -------- -----------------\n",
"C0L Constant Integers 46 15.0327 3.6 KB 1.78001\n",
"C1 1-Byte Integers 34 11.1111 19.4 KB 9.59268\n",
"C1N 1-Byte Integers (w/o NAs) 90 29.4118 51.5 KB 25.4947\n",
"C1S 1-Byte Fractions 42 13.7255 24.0 KB 11.8946\n",
"C2S 2-Byte Fractions 94 30.719 103.4 KB 51.238"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Frame distribution summary:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"size | \n",
"number_of_rows | \n",
"number_of_chunks_per_column | \n",
"number_of_chunks |
\n",
"| 172.16.2.61:54321 | \n",
" 201.9 KB | \n",
"17520.0 | \n",
"34.0 | \n",
"306.0 |
\n",
"| mean | \n",
" 201.9 KB | \n",
"17520.0 | \n",
"34.0 | \n",
"306.0 |
\n",
"| min | \n",
" 201.9 KB | \n",
"17520.0 | \n",
"34.0 | \n",
"306.0 |
\n",
"| max | \n",
" 201.9 KB | \n",
"17520.0 | \n",
"34.0 | \n",
"306.0 |
\n",
"| stddev | \n",
" 0 B | \n",
"0.0 | \n",
"0.0 | \n",
"0.0 |
\n",
"| total | \n",
" 201.9 KB | \n",
"17520.0 | \n",
"34.0 | \n",
"306.0 |
"
],
"text/plain": [
" size number_of_rows number_of_chunks_per_column number_of_chunks\n",
"----------------- -------- ---------------- ----------------------------- ------------------\n",
"172.16.2.61:54321 201.9 KB 17520 34 306\n",
"mean 201.9 KB 17520 34 306\n",
"min 201.9 KB 17520 34 306\n",
"max 201.9 KB 17520 34 306\n",
"stddev 0 B 0 0 0\n",
"total 201.9 KB 17520 34 306"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"| | Year Local | Month Local | Day Local | Hour Local | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 |
\n",
"| type | int | int | int | int | real | real | real | real | enum |
\n",
"| mins | 2013.0 | 1.0 | 1.0 | 0.0 | -26.7 | 0.1251 | 0.0 | -15.6 | 0.0 |
\n",
"| mean | 2013.5 | 6.52602739726 | 15.7205479452 | 11.5 | 4.31304646766 | 0.596736389159 | 1.37993010753 | 12.5789090701 | NaN |
\n",
"| maxs | 2014.0 | 12.0 | 31.0 | 23.0 | 24.4 | 1.0 | 26.924 | 36.1 | 11.0 |
\n",
"| sigma | 0.500014270017 | 3.44794972385 | 8.79649804852 | 6.92238411188 | 10.9731282097 | 0.185792011866 | 2.56215129179 | 10.0396739531 | NaN |
\n",
"| zeros | 0 | 0 | 0 | 730 | 268 | 0 | 501 | 269 | 17 |
\n",
"| missing | 0 | 0 | 0 | 0 | 67 | 67 | 15660 | 67 | 14980 |
\n",
"| 0 | 2013.0 | 1.0 | 1.0 | 0.0 | -5.0 | 0.5447 | nan | 3.3 | |
\n",
"| 1 | 2013.0 | 1.0 | 1.0 | 1.0 | -4.4 | 0.5463 | nan | 3.9 | |
\n",
"| 2 | 2013.0 | 1.0 | 1.0 | 2.0 | -3.3 | 0.619 | nan | 3.3 | |
\n",
"| 3 | 2013.0 | 1.0 | 1.0 | 3.0 | -2.8 | 0.6159 | nan | 3.9 | |
\n",
"| 4 | 2013.0 | 1.0 | 1.0 | 4.0 | -2.8 | 0.6159 | nan | 3.9 | |
\n",
"| 5 | 2013.0 | 1.0 | 1.0 | 5.0 | -2.8 | 0.6159 | nan | 3.9 | |
\n",
"| 6 | 2013.0 | 1.0 | 1.0 | 6.0 | -3.3 | 0.5934 | nan | 3.9 | |
\n",
"| 7 | 2013.0 | 1.0 | 1.0 | 7.0 | -3.3 | 0.5934 | nan | 3.9 | |
\n",
"| 8 | 2013.0 | 1.0 | 1.0 | 8.0 | -2.8 | 0.6425 | nan | 3.3 | |
\n",
"| 9 | 2013.0 | 1.0 | 1.0 | 9.0 | -2.8 | 0.6159 | nan | 3.9 | |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Lots of columns in there! Lets plan on converting to time-since-epoch to do\n",
"# a 'join' with the bike data, plus gather weather info that might affect\n",
"# cyclists - rain, snow, temperature. Alas, drop the \"snow\" column since it's\n",
"# all NA's. Also add in dew point and humidity just in case. Slice out just\n",
"# the columns of interest and drop the rest.\n",
"wthr2 = wthr1[[\"Year Local\",\"Month Local\",\"Day Local\",\"Hour Local\",\"Dew Point (C)\",\"Humidity Fraction\",\"Precipitation One Hour (mm)\",\"Temperature (C)\",\"Weather Code 1/ Description\"]]\n",
"\n",
"wthr2.set_name(wthr2.names.index(\"Precipitation One Hour (mm)\"), \"Rain (mm)\")\n",
"wthr2.set_name(wthr2.names.index(\"Weather Code 1/ Description\"), \"WC1\")\n",
"wthr2.describe()\n",
"# Much better! "
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Filter down to the weather at Noon\n",
"wthr3 = wthr2[ wthr2[\"Hour Local\"]==12 ]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rows:730 Cols:11\n",
"\n",
"Chunk compression summary:\n"
]
},
{
"data": {
"text/html": [
"| chunk_type | \n",
"chunk_name | \n",
"count | \n",
"count_percentage | \n",
"size | \n",
"size_percentage |
\n",
"| C0L | \n",
"Constant Integers | \n",
"80 | \n",
"21.390373 | \n",
" 6.3 KB | \n",
"12.498779 |
\n",
"| C0D | \n",
"Constant Reals | \n",
"13 | \n",
"3.4759357 | \n",
" 1.0 KB | \n",
"2.0310516 |
\n",
"| C1 | \n",
"1-Byte Integers | \n",
"30 | \n",
"8.021391 | \n",
" 2.6 KB | \n",
"5.2455816 |
\n",
"| C1N | \n",
"1-Byte Integers (w/o NAs) | \n",
"56 | \n",
"14.973262 | \n",
" 4.9 KB | \n",
"9.801778 |
\n",
"| C1S | \n",
"1-Byte Fractions | \n",
"34 | \n",
"9.090909 | \n",
" 3.5 KB | \n",
"7.0032225 |
\n",
"| C2S | \n",
"2-Byte Fractions | \n",
"34 | \n",
"9.090909 | \n",
" 4.2 KB | \n",
"8.4288645 |
\n",
"| CUD | \n",
"Unique Reals | \n",
"25 | \n",
"6.6844916 | \n",
" 3.6 KB | \n",
"7.2297626 |
\n",
"| C8D | \n",
"64-bit Reals | \n",
"102 | \n",
"27.272728 | \n",
" 23.9 KB | \n",
"47.76096 |
"
],
"text/plain": [
"chunk_type chunk_name count count_percentage size size_percentage\n",
"------------ ------------------------- ------- ------------------ ------- -----------------\n",
"C0L Constant Integers 80 21.3904 6.3 KB 12.4988\n",
"C0D Constant Reals 13 3.47594 1.0 KB 2.03105\n",
"C1 1-Byte Integers 30 8.02139 2.6 KB 5.24558\n",
"C1N 1-Byte Integers (w/o NAs) 56 14.9733 4.9 KB 9.80178\n",
"C1S 1-Byte Fractions 34 9.09091 3.5 KB 7.00322\n",
"C2S 2-Byte Fractions 34 9.09091 4.2 KB 8.42886\n",
"CUD Unique Reals 25 6.68449 3.6 KB 7.22976\n",
"C8D 64-bit Reals 102 27.2727 23.9 KB 47.761"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Frame distribution summary:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"size | \n",
"number_of_rows | \n",
"number_of_chunks_per_column | \n",
"number_of_chunks |
\n",
"| 172.16.2.61:54321 | \n",
" 50.0 KB | \n",
"730.0 | \n",
"34.0 | \n",
"374.0 |
\n",
"| mean | \n",
" 50.0 KB | \n",
"730.0 | \n",
"34.0 | \n",
"374.0 |
\n",
"| min | \n",
" 50.0 KB | \n",
"730.0 | \n",
"34.0 | \n",
"374.0 |
\n",
"| max | \n",
" 50.0 KB | \n",
"730.0 | \n",
"34.0 | \n",
"374.0 |
\n",
"| stddev | \n",
" 0 B | \n",
"0.0 | \n",
"0.0 | \n",
"0.0 |
\n",
"| total | \n",
" 50.0 KB | \n",
"730.0 | \n",
"34.0 | \n",
"374.0 |
"
],
"text/plain": [
" size number_of_rows number_of_chunks_per_column number_of_chunks\n",
"----------------- ------- ---------------- ----------------------------- ------------------\n",
"172.16.2.61:54321 50.0 KB 730 34 374\n",
"mean 50.0 KB 730 34 374\n",
"min 50.0 KB 730 34 374\n",
"max 50.0 KB 730 34 374\n",
"stddev 0 B 0 0 0\n",
"total 50.0 KB 730 34 374"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"| | Year Local | Month Local | Day Local | Hour Local | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | msec | Days |
\n",
"| type | int | int | int | int | real | real | real | real | enum | int | int |
\n",
"| mins | 2013.0 | 1.0 | 1.0 | 12.0 | -26.7 | 0.1723 | 0.0 | -13.9 | 0.0 | 1.3570704e+12 | 15706.0 |
\n",
"| mean | 2013.5 | 6.52602739726 | 15.7205479452 | 12.0 | 4.23012379642 | 0.539728198074 | 1.53125714286 | 14.0687757909 | NaN | 1.3885608526e+12 | 16070.5 |
\n",
"| maxs | 2014.0 | 12.0 | 31.0 | 12.0 | 23.3 | 1.0 | 12.446 | 34.4 | 10.0 | 1.420056e+12 | 16435.0 |
\n",
"| sigma | 0.500342818004 | 3.45021529307 | 8.80227802701 | 0.0 | 11.1062964725 | 0.179945027923 | 2.36064248615 | 10.3989855149 | NaN | 18219740080.4 | 210.877136425 |
\n",
"| zeros | 0 | 0 | 0 | 0 | 14 | 0 | -174 | 7 | -83 | 0 | 0 |
\n",
"| missing | 0 | 0 | 0 | 0 | 3 | 3 | 660 | 3 | 620 | 0 | 0 |
\n",
"| 0 | 2013.0 | 1.0 | 1.0 | 12.0 | -3.3 | 0.5934 | nan | 3.9 | | 1.3570704e+12 | 15706.0 |
\n",
"| 1 | 2013.0 | 1.0 | 2.0 | 12.0 | -11.7 | 0.4806 | nan | -2.2 | | 1.3571568e+12 | 15707.0 |
\n",
"| 2 | 2013.0 | 1.0 | 3.0 | 12.0 | -10.6 | 0.5248 | nan | -2.2 | | 1.3572432e+12 | 15708.0 |
\n",
"| 3 | 2013.0 | 1.0 | 4.0 | 12.0 | -7.2 | 0.4976 | nan | 2.2 | | 1.3573296e+12 | 15709.0 |
\n",
"| 4 | 2013.0 | 1.0 | 5.0 | 12.0 | -7.2 | 0.426 | nan | 4.4 | | 1.357416e+12 | 15710.0 |
\n",
"| 5 | 2013.0 | 1.0 | 6.0 | 12.0 | -1.7 | 0.6451 | nan | 4.4 | haze | 1.3575024e+12 | 15711.0 |
\n",
"| 6 | 2013.0 | 1.0 | 7.0 | 12.0 | -6.1 | 0.4119 | nan | 6.1 | | 1.3575888e+12 | 15712.0 |
\n",
"| 7 | 2013.0 | 1.0 | 8.0 | 12.0 | -1.7 | 0.5314 | nan | 7.2 | | 1.3576752e+12 | 15713.0 |
\n",
"| 8 | 2013.0 | 1.0 | 9.0 | 12.0 | 0.6 | 0.56 | nan | 8.9 | haze | 1.3577616e+12 | 15714.0 |
\n",
"| 9 | 2013.0 | 1.0 | 10.0 | 12.0 | -6.1 | 0.3952 | nan | 6.7 | | 1.357848e+12 | 15715.0 |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Lets now get Days since the epoch... we'll convert year/month/day into Epoch\n",
"# time, and then back to Epoch days. Need zero-based month and days, but have\n",
"# 1-based.\n",
"wthr3[\"msec\"] = h2o.H2OFrame.mktime(year=wthr3[\"Year Local\"], month=wthr3[\"Month Local\"]-1, day=wthr3[\"Day Local\"]-1, hour=wthr3[\"Hour Local\"])\n",
"secsPerDay=1000*60*60*24\n",
"wthr3[\"Days\"] = (wthr3[\"msec\"]/secsPerDay).floor()\n",
"wthr3.describe()\n",
"# msec looks sane (numbers like 1.3e12 are in the correct range for msec since\n",
"# 1970). Epoch Days matches closely with the epoch day numbers from the\n",
"# CitiBike dataset. "
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Lets drop off the extra time columns to make a easy-to-handle dataset.\n",
"wthr4 = wthr3.drop(\"Year Local\").drop(\"Month Local\").drop(\"Day Local\").drop(\"Hour Local\").drop(\"msec\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Also, most rain numbers are missing - lets assume those are zero rain days\n",
"rain = wthr4[\"Rain (mm)\"]\n",
"rain[ rain.isna() ] = 0\n",
"wthr4[\"Rain (mm)\"] = rain"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Merge Daily Weather with Bikes-Per-Day\n",
"Rows:10,450 Cols:10\n",
"\n",
"Chunk compression summary:\n"
]
},
{
"data": {
"text/html": [
"| chunk_type | \n",
"chunk_name | \n",
"count | \n",
"count_percentage | \n",
"size | \n",
"size_percentage |
\n",
"| C0L | \n",
"Constant Integers | \n",
"66 | \n",
"20.625 | \n",
" 5.2 KB | \n",
"3.6253278 |
\n",
"| C0D | \n",
"Constant Reals | \n",
"33 | \n",
"10.3125 | \n",
" 2.6 KB | \n",
"1.8126639 |
\n",
"| CBS | \n",
"Bits | \n",
"6 | \n",
"1.8750001 | \n",
" 666 B | \n",
"0.4572857 |
\n",
"| C1 | \n",
"1-Byte Integers | \n",
"4 | \n",
"1.25 | \n",
" 1.5 KB | \n",
"1.0821055 |
\n",
"| C1N | \n",
"1-Byte Integers (w/o NAs) | \n",
"28 | \n",
"8.75 | \n",
" 10.8 KB | \n",
"7.599456 |
\n",
"| C1S | \n",
"1-Byte Fractions | \n",
"31 | \n",
"9.6875 | \n",
" 12.4 KB | \n",
"8.739238 |
\n",
"| C2 | \n",
"2-Byte Integers | \n",
"63 | \n",
"19.6875 | \n",
" 44.3 KB | \n",
"31.169582 |
\n",
"| CUD | \n",
"Unique Reals | \n",
"89 | \n",
"27.812498 | \n",
" 64.7 KB | \n",
"45.514343 |
"
],
"text/plain": [
"chunk_type chunk_name count count_percentage size size_percentage\n",
"------------ ------------------------- ------- ------------------ ------- -----------------\n",
"C0L Constant Integers 66 20.625 5.2 KB 3.62533\n",
"C0D Constant Reals 33 10.3125 2.6 KB 1.81266\n",
"CBS Bits 6 1.875 666 B 0.457286\n",
"C1 1-Byte Integers 4 1.25 1.5 KB 1.08211\n",
"C1N 1-Byte Integers (w/o NAs) 28 8.75 10.8 KB 7.59946\n",
"C1S 1-Byte Fractions 31 9.6875 12.4 KB 8.73924\n",
"C2 2-Byte Integers 63 19.6875 44.3 KB 31.1696\n",
"CUD Unique Reals 89 27.8125 64.7 KB 45.5143"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Frame distribution summary:\n"
]
},
{
"data": {
"text/html": [
" | \n",
"size | \n",
"number_of_rows | \n",
"number_of_chunks_per_column | \n",
"number_of_chunks |
\n",
"| 172.16.2.61:54321 | \n",
" 142.2 KB | \n",
"10450.0 | \n",
"32.0 | \n",
"320.0 |
\n",
"| mean | \n",
" 142.2 KB | \n",
"10450.0 | \n",
"32.0 | \n",
"320.0 |
\n",
"| min | \n",
" 142.2 KB | \n",
"10450.0 | \n",
"32.0 | \n",
"320.0 |
\n",
"| max | \n",
" 142.2 KB | \n",
"10450.0 | \n",
"32.0 | \n",
"320.0 |
\n",
"| stddev | \n",
" 0 B | \n",
"0.0 | \n",
"0.0 | \n",
"0.0 |
\n",
"| total | \n",
" 142.2 KB | \n",
"10450.0 | \n",
"32.0 | \n",
"320.0 |
"
],
"text/plain": [
" size number_of_rows number_of_chunks_per_column number_of_chunks\n",
"----------------- -------- ---------------- ----------------------------- ------------------\n",
"172.16.2.61:54321 142.2 KB 10450 32 320\n",
"mean 142.2 KB 10450 32 320\n",
"min 142.2 KB 10450 32 320\n",
"max 142.2 KB 10450 32 320\n",
"stddev 0 B 0 0 0\n",
"total 142.2 KB 10450 32 320"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"| | Days | start station name | bikes | Month | DayOfWeek | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | Dew Point (C) |
\n",
"| type | int | enum | int | enum | enum | real | int | real | enum | real |
\n",
"| mins | 15979.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.3485 | 0.0 | 9.4 | 2.0 | -2.2 |
\n",
"| mean | 15994.4415311 | NaN | 99.3025837321 | 0.968612440191 | NaN | 0.562374191388 | 0.0 | 16.9630717703 | NaN | 7.77999043062 |
\n",
"| maxs | 16010.0 | 329.0 | 553.0 | 1.0 | 6.0 | 0.8718 | 0.0 | 26.1 | 8.0 | 19.4 |
\n",
"| sigma | 9.23370172444 | NaN | 72.9721964301 | 0.174371128617 | NaN | 0.149631413472 | 0.0 | 4.29746634617 | NaN | 6.49151146664 |
\n",
"| zeros | 0 | 32 | 0 | 328 | 1635 | 0 | 10450 | 0 | -8494 | 0 |
\n",
"| missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9134 | 0 |
\n",
"| 0 | 15979.0 | 1 Ave & E 15 St | 97.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | | 10.6 |
\n",
"| 1 | 15979.0 | 1 Ave & E 18 St | 75.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | | 10.6 |
\n",
"| 2 | 15979.0 | 1 Ave & E 30 St | 113.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | | 10.6 |
\n",
"| 3 | 15979.0 | 10 Ave & W 28 St | 74.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | | 10.6 |
\n",
"| 4 | 15979.0 | 11 Ave & W 27 St | 139.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | | 10.6 |
\n",
"| 5 | 15979.0 | 11 Ave & W 41 St | 60.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | | 10.6 |
\n",
"| 6 | 15979.0 | 12 Ave & W 40 St | 90.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | | 10.6 |
\n",
"| 7 | 15979.0 | 2 Ave & E 31 St | 88.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | | 10.6 |
\n",
"| 8 | 15979.0 | 2 Ave & E 58 St | 55.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | | 10.6 |
\n",
"| 9 | 15979.0 | 3 Ave & Schermerhorn St | 8.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | | 10.6 |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"| Days | start station name | bikes | Month | DayOfWeek | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | Dew Point (C) |
\n",
"| 15979 | 1 Ave & E 15 St | 97 | 9 | Mon | 0.4315 | 0 | 23.9 | | 10.6 |
\n",
"| 15979 | 1 Ave & E 18 St | 75 | 9 | Mon | 0.4315 | 0 | 23.9 | | 10.6 |
\n",
"| 15979 | 1 Ave & E 30 St | 113 | 9 | Mon | 0.4315 | 0 | 23.9 | | 10.6 |
\n",
"| 15979 | 10 Ave & W 28 St | 74 | 9 | Mon | 0.4315 | 0 | 23.9 | | 10.6 |
\n",
"| 15979 | 11 Ave & W 27 St | 139 | 9 | Mon | 0.4315 | 0 | 23.9 | | 10.6 |
\n",
"| 15979 | 11 Ave & W 41 St | 60 | 9 | Mon | 0.4315 | 0 | 23.9 | | 10.6 |
\n",
"| 15979 | 12 Ave & W 40 St | 90 | 9 | Mon | 0.4315 | 0 | 23.9 | | 10.6 |
\n",
"| 15979 | 2 Ave & E 31 St | 88 | 9 | Mon | 0.4315 | 0 | 23.9 | | 10.6 |
\n",
"| 15979 | 2 Ave & E 58 St | 55 | 9 | Mon | 0.4315 | 0 | 23.9 | | 10.6 |
\n",
"| 15979 | 3 Ave & Schermerhorn St | 8 | 9 | Mon | 0.4315 | 0 | 23.9 | | 10.6 |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# ----------\n",
"# 6 - Join the weather data-per-day to the bike-starts-per-day\n",
"print(\"Merge Daily Weather with Bikes-Per-Day\")\n",
"bpd_with_weather = bpd.merge(wthr4,all_x=True,all_y=False)\n",
"bpd_with_weather.describe()\n",
"bpd_with_weather.show()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training data has 10 columns and 6289 rows, test has 3080 rows, holdout has 1081\n",
"\n",
"gbm Model Build Progress: [##################################################] 100%\n",
"\n",
"drf Model Build Progress: [##################################################] 100%\n",
"\n",
"glm Model Build Progress: [##################################################] 100%\n",
"\n",
"deeplearning Model Build Progress: [##################################################] 100%\n"
]
},
{
"data": {
"text/html": [
"| Model | \n",
"R2 TRAIN | \n",
"R2 TEST | \n",
"R2 HOLDOUT | \n",
"Model Training Time (s) |
\n",
"| GBM | \n",
"0.9954410 | \n",
"0.9255962 | \n",
"0.9230051 | \n",
"6.706 |
\n",
"| DRF | \n",
"0.8491125 | \n",
"0.7430226 | \n",
"0.7442895 | \n",
"6.692 |
\n",
"| GLM | \n",
"0.8660565 | \n",
"0.8446801 | \n",
"0.8673705 | \n",
"0.139 |
\n",
"| DL | \n",
"0.9617874 | \n",
"0.9117793 | \n",
"0.9213475 | \n",
"7.972 |
"
],
"text/plain": [
"Model R2 TRAIN R2 TEST R2 HOLDOUT Model Training Time (s)\n",
"------- ---------- --------- ------------ -------------------------\n",
"GBM 0.995441 0.925596 0.923005 6.706\n",
"DRF 0.849112 0.743023 0.744289 6.692\n",
"GLM 0.866057 0.84468 0.867371 0.139\n",
"DL 0.961787 0.911779 0.921347 7.972"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 7 - Split into train/test/holdout again and rebuild the models, this time with the weather columns\n",
"split_fit_predict(bpd_with_weather)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}