{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "import h2o\n", "import time\n", "from h2o.estimators.glm import H2OGeneralizedLinearEstimator\n", "from h2o.estimators.gbm import H2OGradientBoostingEstimator\n", "from h2o.estimators.random_forest import H2ORandomForestEstimator\n", "from h2o.estimators.deeplearning import H2ODeepLearningEstimator" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Checking whether there is an H2O instance running at http://localhost:54321. connected.\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td>H2O cluster uptime:</td>\n", "<td>08 secs</td></tr>\n", "<tr><td>H2O cluster version:</td>\n", "<td>3.11.0.99999</td></tr>\n", "<tr><td>H2O cluster version age:</td>\n", "<td>1 minute </td></tr>\n", "<tr><td>H2O cluster name:</td>\n", "<td>pasha</td></tr>\n", "<tr><td>H2O cluster total nodes:</td>\n", "<td>1</td></tr>\n", "<tr><td>H2O cluster free memory:</td>\n", "<td>3.556 Gb</td></tr>\n", "<tr><td>H2O cluster total cores:</td>\n", "<td>8</td></tr>\n", "<tr><td>H2O cluster allowed cores:</td>\n", "<td>8</td></tr>\n", "<tr><td>H2O cluster status:</td>\n", "<td>accepting new members, healthy</td></tr>\n", "<tr><td>H2O connection url:</td>\n", "<td>http://localhost:54321</td></tr>\n", "<tr><td>H2O connection proxy:</td>\n", "<td>None</td></tr>\n", "<tr><td>Python version:</td>\n", "<td>3.5.2 final</td></tr></table></div>" ], "text/plain": [ "-------------------------- ------------------------------\n", "H2O cluster uptime: 08 secs\n", "H2O cluster version: 3.11.0.99999\n", "H2O cluster version age: 1 minute\n", "H2O cluster name: pasha\n", "H2O cluster total nodes: 1\n", "H2O cluster free memory: 3.556 Gb\n", "H2O cluster total cores: 8\n", "H2O cluster allowed cores: 8\n", "H2O cluster status: 
accepting new members, healthy\n", "H2O connection url: http://localhost:54321\n", "H2O connection proxy:\n", "Python version: 3.5.2 final\n", "-------------------------- ------------------------------" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
"# Explore a typical Data Science workflow with H2O and Python\n",
"#\n",
"# Goal: assist the manager of CitiBike of NYC to load-balance the bicycles\n",
"# across the CitiBike network of stations, by predicting the number of bike\n",
"# trips taken from the station every day. Use 10 million rows of historical\n",
"# data, and eventually add weather data.\n",
"\n",
"\n",
"# Connect to a cluster\n",
"h2o.init()"
] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [
"from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.\n",
"\n",
"# Set this to True if you want to fetch the data directly from S3.\n",
"# This is useful if your cluster is running in EC2.\n",
"data_source_is_s3 = False\n",
"\n",
"def mylocate(s):\n",
"    \"\"\"Return the location of the test-data file `s`.\n",
"\n",
"    When data_source_is_s3 is true the relative path is prefixed with the public\n",
"    H2O S3 bucket; otherwise it is resolved with _locate(), which looks the file\n",
"    up inside the h2o git project directory.\n",
"\n",
"    :param s: path of the data file relative to the data root (a string).\n",
"    :returns: the S3 URL or the path returned by _locate().\n",
"    \"\"\"\n",
"    if data_source_is_s3:\n",
"        return \"s3n://h2o-public-test-data/\" + s\n",
"    return _locate(s)"
] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Import and Parse bike data\n", "Parse progress: |█████████████████████████████████████████████████████████| 100%\n" ] } ], "source": [
"# Pick either the big or the small demo.\n",
"# Big data is 10M rows\n",
"small_test = [mylocate(\"bigdata/laptop/citibike-nyc/2013-10.csv\")]\n",
"big_test = [mylocate(\"bigdata/laptop/citibike-nyc/2013-07.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2013-08.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2013-09.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2013-10.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2013-11.csv\"),\n",
" 
mylocate(\"bigdata/laptop/citibike-nyc/2013-12.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-01.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-02.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-03.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-04.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-05.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-06.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-07.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-08.csv\")]\n", "\n", "# ----------\n", "\n", "# 1- Load data - 1 row per bicycle trip. Has columns showing the start and end\n", "# station, trip duration and trip start time and day. The larger dataset\n", "# totals about 10 million rows\n", "print(\"Import and Parse bike data\")\n", "data = h2o.import_file(path=small_test)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows:1037712\n", "Cols:16\n", "\n", "\n" ] }, { "data": { "text/html": [ "<table>\n", "<thead>\n", "<tr><th> </th><th>tripduration </th><th>starttime </th><th>stoptime </th><th>start station id </th><th>start station name </th><th>start station latitude </th><th>start station longitude </th><th>end station id </th><th>end station name </th><th>end station latitude </th><th>end station longitude </th><th>bikeid </th><th>usertype </th><th>birth year </th><th>gender </th><th>Days </th></tr>\n", "</thead>\n", "<tbody>\n", "<tr><td>type </td><td>int </td><td>time </td><td>time </td><td>int </td><td>enum </td><td>real </td><td>real </td><td>int </td><td>enum </td><td>real </td><td>real </td><td>int </td><td>enum </td><td>int </td><td>int </td><td>int </td></tr>\n", "<tr><td>mins </td><td>60.0 </td><td>1380585668000.0 </td><td>1380585883000.0 </td><td>72.0 </td><td> </td><td>40.680342423 </td><td>-74.01713445 </td><td>72.0 </td><td> </td><td>40.680342423 </td><td>-74.01713445 
</td><td>14529.0 </td><td> </td><td>1899.0 </td><td>0.0 </td><td>15979.0 </td></tr>\n", "<tr><td>mean </td><td>825.6147543827192 </td><td>1381888516917.714 </td><td>1381889342532.4746 </td><td>443.7142126139049 </td><td> </td><td>40.73451885864454 </td><td>-73.99113288482197 </td><td>443.20742171238254</td><td> </td><td>40.73428478848875 </td><td>-73.99127029824423 </td><td>17644.071645119242</td><td> </td><td>1975.7783948601839</td><td>1.123755916863252</td><td>15993.476745956474</td></tr>\n", "<tr><td>maxs </td><td>1259480.0 </td><td>1383263997000.0 </td><td>1383393310000.0 </td><td>3002.0 </td><td> </td><td>40.770513 </td><td>-73.9500479759 </td><td>3002.0 </td><td> </td><td>40.770513 </td><td>-73.9500479759 </td><td>20757.0 </td><td> </td><td>1997.0 </td><td>2.0 </td><td>16009.0 </td></tr>\n", "<tr><td>sigma </td><td>2000.3732322961862</td><td>778871729.1323168 </td><td>778847387.5037588 </td><td>354.43432507453724</td><td> </td><td>0.01957340730530415 </td><td>0.012316123410581171 </td><td>357.39821705755827</td><td> </td><td>0.019557845811587957 </td><td>0.012385581196537298 </td><td>1717.6811213447866</td><td> </td><td>11.131490623834942</td><td>0.544380593291009</td><td>9.014533519116712 </td></tr>\n", "<tr><td>zeros </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td> </td><td>0 </td><td>0 </td><td>0 </td><td> </td><td>0 </td><td>0 </td><td>0 </td><td> </td><td>0 </td><td>97498 </td><td>0 </td></tr>\n", "<tr><td>missing</td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>97445 </td><td>0 </td><td>0 </td></tr>\n", "<tr><td>0 </td><td>326.0 </td><td>2013-10-01 00:01:08</td><td>2013-10-01 00:06:34</td><td>239.0 </td><td>Willoughby St & Fleet St</td><td>40.69196566 </td><td>-73.9813018 </td><td>366.0 </td><td>Clinton Ave & Myrtle Ave </td><td>40.693261 </td><td>-73.968896 </td><td>16052.0 </td><td>Subscriber</td><td>1982.0 </td><td>1.0 </td><td>15979.0 
</td></tr>\n", "<tr><td>1 </td><td>729.0 </td><td>2013-10-01 00:01:21</td><td>2013-10-01 00:13:30</td><td>322.0 </td><td>Clinton St & Tillary St </td><td>40.696191999999996 </td><td>-73.991218 </td><td>398.0 </td><td>Atlantic Ave & Furman St </td><td>40.69165183 </td><td>-73.99997859999999 </td><td>19412.0 </td><td>Customer </td><td>nan </td><td>0.0 </td><td>15979.0 </td></tr>\n", "<tr><td>2 </td><td>520.0 </td><td>2013-10-01 00:01:24</td><td>2013-10-01 00:10:04</td><td>174.0 </td><td>E 25 St & 1 Ave </td><td>40.7381765 </td><td>-73.97738662 </td><td>403.0 </td><td>E 2 St & 2 Ave </td><td>40.72502876 </td><td>-73.99069656 </td><td>19645.0 </td><td>Subscriber</td><td>1984.0 </td><td>1.0 </td><td>15979.0 </td></tr>\n", "<tr><td>3 </td><td>281.0 </td><td>2013-10-01 00:01:25</td><td>2013-10-01 00:06:06</td><td>430.0 </td><td>York St & Jay St </td><td>40.7014851 </td><td>-73.98656928 </td><td>323.0 </td><td>Lawrence St & Willoughby St </td><td>40.69236178 </td><td>-73.98631746 </td><td>16992.0 </td><td>Subscriber</td><td>1985.0 </td><td>1.0 </td><td>15979.0 </td></tr>\n", "<tr><td>4 </td><td>196.0 </td><td>2013-10-01 00:01:27</td><td>2013-10-01 00:04:43</td><td>403.0 </td><td>E 2 St & 2 Ave </td><td>40.72502876 </td><td>-73.99069656 </td><td>401.0 </td><td>Allen St & Rivington St </td><td>40.72019576 </td><td>-73.98997825000001 </td><td>15690.0 </td><td>Subscriber</td><td>1986.0 </td><td>1.0 </td><td>15979.0 </td></tr>\n", "<tr><td>5 </td><td>1948.0 </td><td>2013-10-01 00:01:48</td><td>2013-10-01 00:34:16</td><td>369.0 </td><td>Washington Pl & 6 Ave </td><td>40.73224119 </td><td>-74.00026394 </td><td>307.0 </td><td>Canal St & Rutgers St </td><td>40.714274870000004 </td><td>-73.98990025 </td><td>19846.0 </td><td>Subscriber</td><td>1977.0 </td><td>1.0 </td><td>15979.0 </td></tr>\n", "<tr><td>6 </td><td>1327.0 </td><td>2013-10-01 00:01:48</td><td>2013-10-01 00:23:55</td><td>254.0 </td><td>W 11 St & 6 Ave </td><td>40.73532427 </td><td>-73.99800419 </td><td>539.0 
</td><td>Metropolitan Ave & Bedford Ave</td><td>40.71534825 </td><td>-73.96024116 </td><td>14563.0 </td><td>Subscriber</td><td>1986.0 </td><td>2.0 </td><td>15979.0 </td></tr>\n", "<tr><td>7 </td><td>1146.0 </td><td>2013-10-01 00:01:57</td><td>2013-10-01 00:21:03</td><td>490.0 </td><td>8 Ave & W 33 St </td><td>40.751551 </td><td>-73.993934 </td><td>438.0 </td><td>St Marks Pl & 1 Ave </td><td>40.727791260000004 </td><td>-73.98564945 </td><td>16793.0 </td><td>Subscriber</td><td>1959.0 </td><td>1.0 </td><td>15979.0 </td></tr>\n", "<tr><td>8 </td><td>380.0 </td><td>2013-10-01 00:01:58</td><td>2013-10-01 00:08:18</td><td>468.0 </td><td>Broadway & W 55 St </td><td>40.7652654 </td><td>-73.98192338 </td><td>385.0 </td><td>E 55 St & 2 Ave </td><td>40.757973220000004 </td><td>-73.96603308 </td><td>16600.0 </td><td>Customer </td><td>nan </td><td>0.0 </td><td>15979.0 </td></tr>\n", "<tr><td>9 </td><td>682.0 </td><td>2013-10-01 00:02:05</td><td>2013-10-01 00:13:27</td><td>300.0 </td><td>Shevchenko Pl & E 6 St </td><td>40.728145 </td><td>-73.990214 </td><td>519.0 </td><td>Pershing Square N </td><td>40.75188406 </td><td>-73.97770164 </td><td>15204.0 </td><td>Subscriber</td><td>1992.0 </td><td>1.0 </td><td>15979.0 </td></tr>\n", "</tbody>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# ----------\n", "\n", "# 2- light data munging: group the bike starts per-day, converting the 10M rows\n", "# of trips to about 140,000 station&day combos - predicting the number of trip\n", "# starts per-station-per-day.\n", "\n", "# Convert start time to: Day since the Epoch.\n", "# The time column holds milliseconds since the Epoch, so despite its name\n", "# secsPerDay below is the number of milliseconds per day.\n", "startime = data[\"starttime\"]\n", "secsPerDay = 1000 * 3600 * 24\n", "data[\"Days\"] = (startime.asnumeric() / secsPerDay).floor()\n", "data.describe()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<table>\n", "<thead>\n", "<tr><th style=\"text-align: right;\"> Days</th><th>start station name </th><th 
style=\"text-align: right;\"> bikes</th></tr>\n", "</thead>\n", "<tbody>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>1 Ave & E 15 St </td><td style=\"text-align: right;\"> 173</td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>1 Ave & E 18 St </td><td style=\"text-align: right;\"> 118</td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>1 Ave & E 30 St </td><td style=\"text-align: right;\"> 152</td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>10 Ave & W 28 St </td><td style=\"text-align: right;\"> 115</td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>11 Ave & W 27 St </td><td style=\"text-align: right;\"> 210</td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>11 Ave & W 41 St </td><td style=\"text-align: right;\"> 106</td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>12 Ave & W 40 St </td><td style=\"text-align: right;\"> 144</td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>2 Ave & E 31 St </td><td style=\"text-align: right;\"> 206</td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>2 Ave & E 58 St </td><td style=\"text-align: right;\"> 105</td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>3 Ave & Schermerhorn St</td><td style=\"text-align: right;\"> 15</td></tr>\n", "</tbody>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Rows:10131\n", "Cols:3\n", "\n", "\n" ] }, { "data": { "text/html": [ "<table>\n", "<thead>\n", "<tr><th> </th><th>Days </th><th>start station name </th><th>bikes </th></tr>\n", "</thead>\n", "<tbody>\n", "<tr><td>type </td><td>int </td><td>enum </td><td>int </td></tr>\n", "<tr><td>mins </td><td>15979.0 </td><td> </td><td>1.0 </td></tr>\n", "<tr><td>mean </td><td>15993.953311617806</td><td> </td><td>102.42937518507551</td></tr>\n", "<tr><td>maxs </td><td>16009.0 </td><td> </td><td>603.0 </td></tr>\n", 
"<tr><td>sigma </td><td>8.950698111468864 </td><td> </td><td>74.05933443246006 </td></tr>\n", "<tr><td>zeros </td><td>0 </td><td> </td><td>0 </td></tr>\n", "<tr><td>missing</td><td>0 </td><td>0 </td><td>0 </td></tr>\n", "<tr><td>0 </td><td>15979.0 </td><td>1 Ave & E 15 St </td><td>173.0 </td></tr>\n", "<tr><td>1 </td><td>15979.0 </td><td>1 Ave & E 18 St </td><td>118.0 </td></tr>\n", "<tr><td>2 </td><td>15979.0 </td><td>1 Ave & E 30 St </td><td>152.0 </td></tr>\n", "<tr><td>3 </td><td>15979.0 </td><td>10 Ave & W 28 St </td><td>115.0 </td></tr>\n", "<tr><td>4 </td><td>15979.0 </td><td>11 Ave & W 27 St </td><td>210.0 </td></tr>\n", "<tr><td>5 </td><td>15979.0 </td><td>11 Ave & W 41 St </td><td>106.0 </td></tr>\n", "<tr><td>6 </td><td>15979.0 </td><td>12 Ave & W 40 St </td><td>144.0 </td></tr>\n", "<tr><td>7 </td><td>15979.0 </td><td>2 Ave & E 31 St </td><td>206.0 </td></tr>\n", "<tr><td>8 </td><td>15979.0 </td><td>2 Ave & E 58 St </td><td>105.0 </td></tr>\n", "<tr><td>9 </td><td>15979.0 </td><td>3 Ave & Schermerhorn St</td><td>15.0 </td></tr>\n", "</tbody>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[10131, 3]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Now do a monster Group-By. Count bike starts per-station per-day. Ends up\n", "# with about 340 stations times 400 days (140,000 rows). 
This is what we want\n", "# to predict.\n", "grouped = data.group_by([\"Days\",\"start station name\"])\n", "bpd = grouped.count().get_frame() # Compute bikes-per-day\n", "bpd.set_name(2,\"bikes\")\n", "bpd.show()\n", "bpd.describe()\n", "bpd.dim" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quantiles of bikes-per-day\n" ] }, { "data": { "text/html": [ "<table>\n", "<thead>\n", "<tr><th style=\"text-align: right;\"> Probs</th><th style=\"text-align: right;\"> bikesQuantiles</th></tr>\n", "</thead>\n", "<tbody>\n", "<tr><td style=\"text-align: right;\"> 0.01 </td><td style=\"text-align: right;\"> 5 </td></tr>\n", "<tr><td style=\"text-align: right;\"> 0.1 </td><td style=\"text-align: right;\"> 20 </td></tr>\n", "<tr><td style=\"text-align: right;\"> 0.25 </td><td style=\"text-align: right;\"> 45 </td></tr>\n", "<tr><td style=\"text-align: right;\"> 0.333</td><td style=\"text-align: right;\"> 60 </td></tr>\n", "<tr><td style=\"text-align: right;\"> 0.5 </td><td style=\"text-align: right;\"> 91 </td></tr>\n", "<tr><td style=\"text-align: right;\"> 0.667</td><td style=\"text-align: right;\"> 121 </td></tr>\n", "<tr><td style=\"text-align: right;\"> 0.75 </td><td style=\"text-align: right;\"> 141 </td></tr>\n", "<tr><td style=\"text-align: right;\"> 0.9 </td><td style=\"text-align: right;\"> 197 </td></tr>\n", "<tr><td style=\"text-align: right;\"> 0.99 </td><td style=\"text-align: right;\"> 340.4</td></tr>\n", "</tbody>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Quantiles: the data is fairly unbalanced; some station/day combos are wildly\n", "# more popular than others.\n", "print(\"Quantiles of bikes-per-day\")\n", "bpd[\"bikes\"].quantile().show()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ 
"Bikes-Per-Day\n", "Rows:10131\n", "Cols:5\n", "\n", "\n" ] }, { "data": { "text/html": [ "<table>\n", "<thead>\n", "<tr><th> </th><th>Days </th><th>start station name </th><th>bikes </th><th>Month </th><th>DayOfWeek </th></tr>\n", "</thead>\n", "<tbody>\n", "<tr><td>type </td><td>int </td><td>enum </td><td>int </td><td>enum </td><td>enum </td></tr>\n", "<tr><td>mins </td><td>15979.0 </td><td> </td><td>1.0 </td><td> </td><td> </td></tr>\n", "<tr><td>mean </td><td>15993.953311617806</td><td> </td><td>102.42937518507551</td><td> </td><td> </td></tr>\n", "<tr><td>maxs </td><td>16009.0 </td><td> </td><td>603.0 </td><td> </td><td> </td></tr>\n", "<tr><td>sigma </td><td>8.950698111468864 </td><td> </td><td>74.05933443246006 </td><td> </td><td> </td></tr>\n", "<tr><td>zeros </td><td>0 </td><td> </td><td>0 </td><td> </td><td> </td></tr>\n", "<tr><td>missing</td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td></tr>\n", "<tr><td>0 </td><td>15979.0 </td><td>1 Ave & E 15 St </td><td>173.0 </td><td>10 </td><td>Tue </td></tr>\n", "<tr><td>1 </td><td>15979.0 </td><td>1 Ave & E 18 St </td><td>118.0 </td><td>10 </td><td>Tue </td></tr>\n", "<tr><td>2 </td><td>15979.0 </td><td>1 Ave & E 30 St </td><td>152.0 </td><td>10 </td><td>Tue </td></tr>\n", "<tr><td>3 </td><td>15979.0 </td><td>10 Ave & W 28 St </td><td>115.0 </td><td>10 </td><td>Tue </td></tr>\n", "<tr><td>4 </td><td>15979.0 </td><td>11 Ave & W 27 St </td><td>210.0 </td><td>10 </td><td>Tue </td></tr>\n", "<tr><td>5 </td><td>15979.0 </td><td>11 Ave & W 41 St </td><td>106.0 </td><td>10 </td><td>Tue </td></tr>\n", "<tr><td>6 </td><td>15979.0 </td><td>12 Ave & W 40 St </td><td>144.0 </td><td>10 </td><td>Tue </td></tr>\n", "<tr><td>7 </td><td>15979.0 </td><td>2 Ave & E 31 St </td><td>206.0 </td><td>10 </td><td>Tue </td></tr>\n", "<tr><td>8 </td><td>15979.0 </td><td>2 Ave & E 58 St </td><td>105.0 </td><td>10 </td><td>Tue </td></tr>\n", "<tr><td>9 </td><td>15979.0 </td><td>3 Ave & Schermerhorn St</td><td>15.0 </td><td>10 
</td><td>Tue </td></tr>\n", "</tbody>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
"# A little feature engineering\n",
"# Add in month-of-year (seasonality; fewer bike rides in winter than summer)\n",
"secs = bpd[\"Days\"]*secsPerDay\n",
"bpd[\"Month\"] = secs.month().asfactor()\n",
"# Add in day-of-week (work-week; more bike rides on Sunday than Monday)\n",
"bpd[\"DayOfWeek\"] = secs.dayOfWeek()\n",
"print(\"Bikes-Per-Day\")\n",
"bpd.describe()"
] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [
"# ----------\n",
"# 3- Fit a model on train; using test as validation\n",
"\n",
"# Function for doing classic test/train/holdout split\n",
"def split_fit_predict(data, seed=None):\n",
"    \"\"\"Split `data` into train/test/holdout, fit four models and display their MSEs.\n",
"\n",
"    Rows are assigned at random: roughly 60% train, 30% test (used as the\n",
"    validation frame) and 10% holdout. GBM, DRF, GLM and DL models are trained to\n",
"    predict the \"bikes\" column and are published as the globals gbm0, drf0, glm0\n",
"    and dl0. A table with the train/test/holdout MSE and the training time of each\n",
"    model is displayed.\n",
"\n",
"    :param data: H2OFrame with a \"Days\" column and the \"bikes\" response column;\n",
"        every column other than \"bikes\" is used as a predictor.\n",
"    :param seed: optional seed passed to the random split and to each estimator so\n",
"        a run can be repeated. None (the default) keeps the unseeded behaviour.\n",
"        NOTE(review): H2O deep learning may still vary between runs unless it is\n",
"        also configured to be reproducible - confirm against the H2O docs.\n",
"    :returns: None.\n",
"    \"\"\"\n",
"    global gbm0, drf0, glm0, dl0\n",
"\n",
"    # Forward the seed only when one was supplied, so the default call behaves as before.\n",
"    seed_kw = {} if seed is None else {\"seed\": seed}\n",
"\n",
"    # Classic Test/Train split\n",
"    r = data[\"Days\"].runif(**seed_kw)   # Random UNIForm numbers, one per row\n",
"    train = data[r < 0.6]\n",
"    test = data[(0.6 <= r) & (r < 0.9)]\n",
"    hold = data[0.9 <= r]\n",
"    print(\"Training data has\",train.ncol,\"columns and\",train.nrow,\"rows, test has\",test.nrow,\"rows, holdout has\",hold.nrow)\n",
"\n",
"    # Build fresh predictor lists rather than calling .remove() on data.names.\n",
"    all_x = [name for name in data.names if name != \"bikes\"]\n",
"    # The previous code dropped WC1 from the shared list just before GLM, so DL\n",
"    # (trained after GLM) never saw WC1 either; that behaviour is kept here.\n",
"    # NOTE(review): confirm DL is meant to skip WC1 as well.\n",
"    no_wc1_x = [name for name in all_x if name != \"WC1\"]\n",
"\n",
"    gbm0 = H2OGradientBoostingEstimator(ntrees=500,   # 500 works well\n",
"                                        max_depth=6,\n",
"                                        learn_rate=0.1,\n",
"                                        **seed_kw)\n",
"    drf0 = H2ORandomForestEstimator(ntrees=250, max_depth=30, **seed_kw)\n",
"    glm0 = H2OGeneralizedLinearEstimator(Lambda=[1e-5], family=\"poisson\", **seed_kw)\n",
"    dl0 = H2ODeepLearningEstimator(hidden=[50,50,50,50], epochs=50, **seed_kw)\n",
"\n",
"    # (table label, model, predictor columns) in the order they are trained and reported\n",
"    fits = [\n",
"        (\"GBM\", gbm0, all_x),\n",
"        (\"DRF\", drf0, all_x),\n",
"        (\"GLM\", glm0, no_wc1_x),\n",
"        (\"DL \", dl0, no_wc1_x)\n",
"    ]\n",
"\n",
"    # Train every model on train, validating on test, and time each fit.\n",
"    elapsed = {}\n",
"    for label, model, x in fits:\n",
"        s = time.time()\n",
"        model.train(x=x,\n",
"                    y=\"bikes\",\n",
"                    training_frame=train,\n",
"                    validation_frame=test)\n",
"        elapsed[label] = time.time() - s\n",
"\n",
"    # ----------\n",
"    # 4- Score on holdout set & report\n",
"    # make a pretty HTML table printout of the results\n",
"    header = [\"Model\", \"mse TRAIN\", \"mse TEST\", \"mse HOLDOUT\", \"Model Training Time (s)\"]\n",
"    table = [\n",
"        [label]\n",
"        + [model.model_performance(frame).mse() for frame in (train, test, hold)]\n",
"        + [round(elapsed[label], 3)]\n",
"        for label, model, _ in fits\n",
"    ]\n",
"    h2o.display.H2ODisplay(table,header)\n",
"    # --------------"
] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data has 5 columns and 6180 rows, test has 2947 rows, holdout has 1004\n", "gbm Model Build progress: |███████████████████████████████████████████████| 100%\n", "drf Model Build progress: |███████████████████████████████████████████████| 100%\n", "glm Model Build progress: |███████████████████████████████████████████████| 100%\n", "deeplearning Model Build progress: |██████████████████████████████████████| 100%\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td><b>Model</b></td>\n", "<td><b>mse TRAIN</b></td>\n", "<td><b>mse TEST</b></td>\n", "<td><b>mse HOLDOUT</b></td>\n", "<td><b>Model Training Time (s)</b></td></tr>\n", "<tr><td>GBM</td>\n", "<td>0.8948171</td>\n", "<td>386.7584398</td>\n", "<td>428.7237120</td>\n", "<td>7.759</td></tr>\n", "<tr><td>DRF</td>\n", "<td>526.3541524</td>\n", "<td>921.4867812</td>\n", "<td>916.5091361</td>\n", "<td>8.673</td></tr>\n", "<tr><td>GLM</td>\n", "<td>689.6647078</td>\n", "<td>757.4271445</td>\n", "<td>726.9764530</td>\n", "<td>0.522</td></tr>\n", "<tr><td>DL </td>\n", "<td>307.5692122</td>\n", "<td>459.6025357</td>\n", "<td>509.2822086</td>\n", "<td>8.619</td></tr></table></div>" ], "text/plain": [ "Model mse TRAIN mse TEST mse HOLDOUT Model Training Time (s)\n", "------- ----------- ---------- ------------- -------------------------\n", "GBM 0.894817 386.758 428.724 7.759\n", "DRF 526.354 921.487 916.509 8.673\n", "GLM 689.665 757.427 726.976 0.522\n", "DL 307.569 459.603 509.282 8.619" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# 
Split the data (into test & train), fit some models and predict on the holdout data\n", "split_fit_predict(bpd)\n", "# The table reports MSE rather than r^2. In the recorded run the holdout MSE is\n", "# about 429 for GBM and 727 for GLM; against a variance of bikes of about 5,485\n", "# (sigma of about 74 in bpd.describe()) that is an r^2 of roughly 0.92 for GBM and\n", "# 0.87 for GLM. This means given just the day, the station, the month, and the\n", "# day-of-week we can predict about 90% of the variance of the bike-trip-starts." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Parse progress: |█████████████████████████████████████████████████████████| 100%\n", "Rows:17520\n", "Cols:50\n", "\n", "\n" ] }, { "data": { "text/html": [ "<table>\n", "<thead>\n", "<tr><th> </th><th>Year Local </th><th>Month Local </th><th>Day Local </th><th>Hour Local </th><th>Year UTC </th><th>Month UTC </th><th>Day UTC </th><th>Hour UTC </th><th>Cavok Reported </th><th>Cloud Ceiling (m) </th><th>Cloud Cover Fraction </th><th>Cloud Cover Fraction 1 </th><th>Cloud Cover Fraction 2 </th><th>Cloud Cover Fraction 3 </th><th>Cloud Cover Fraction 4 </th><th>Cloud Cover Fraction 5 </th><th>Cloud Cover Fraction 6 </th><th>Cloud Height (m) 1 </th><th>Cloud Height (m) 2 </th><th>Cloud Height (m) 3 </th><th>Cloud Height (m) 4 </th><th>Cloud Height (m) 5 </th><th>Cloud Height (m) 6 </th><th>Dew Point (C) </th><th>Humidity Fraction </th><th>Precipitation One Hour (mm) </th><th>Pressure Altimeter (mbar) </th><th>Pressure Sea Level (mbar) </th><th>Pressure Station (mbar) </th><th>Snow Depth (cm) </th><th>Temperature (C) </th><th>Visibility (km) </th><th>Weather Code 1 </th><th>Weather Code 1/ Description </th><th>Weather Code 2 </th><th>Weather Code 2/ Description </th><th>Weather Code 3 </th><th>Weather Code 3/ Description </th><th>Weather Code 4 </th><th>Weather Code 4/ Description </th><th>Weather Code 5 </th><th>Weather Code 5/ Description </th><th>Weather Code 6 </th><th>Weather Code 6/ Description </th><th>Weather Code Most Severe / Icon Code </th><th>Weather Code Most Severe </th><th>Weather Code Most 
Severe / Description </th><th>Wind Direction (degrees) </th><th>Wind Gust (m/s) </th><th>Wind Speed (m/s) </th></tr>\n", "</thead>\n", "<tbody>\n", "<tr><td>type </td><td>int </td><td>int </td><td>int </td><td>int </td><td>int </td><td>int </td><td>int </td><td>int </td><td>int </td><td>real </td><td>real </td><td>real </td><td>real </td><td>real </td><td>int </td><td>int </td><td>int </td><td>real </td><td>real </td><td>real </td><td>int </td><td>int </td><td>int </td><td>real </td><td>real </td><td>real </td><td>real </td><td>int </td><td>int </td><td>int </td><td>real </td><td>real </td><td>int </td><td>enum </td><td>int </td><td>enum </td><td>int </td><td>enum </td><td>int </td><td>enum </td><td>int </td><td>enum </td><td>int </td><td>enum </td><td>int </td><td>int </td><td>enum </td><td>int </td><td>real </td><td>real </td></tr>\n", "<tr><td>mins </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>0.0 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>0.0 </td><td>0.0 </td><td>61.0 </td><td>0.0 </td><td>0.0 </td><td>0.25 </td><td>0.5 </td><td>NaN </td><td>NaN </td><td>NaN </td><td>60.96 </td><td>213.36 </td><td>365.76 </td><td>NaN </td><td>NaN </td><td>NaN </td><td>-26.700000000000003</td><td>0.12510000000000002</td><td>0.0 </td><td>983.2949000000001 </td><td>NaN </td><td>NaN </td><td>NaN </td><td>-15.600000000000001</td><td>0.001 </td><td>1.0 </td><td> </td><td>1.0 </td><td> </td><td>1.0 </td><td> </td><td>1.0 </td><td> </td><td>1.0 </td><td> </td><td>3.0 </td><td> </td><td>0.0 </td><td>1.0 </td><td> </td><td>10.0 </td><td>7.2 </td><td>0.0 </td></tr>\n", "<tr><td>mean </td><td>2013.5 </td><td>6.5260273972602745</td><td>15.72054794520548</td><td>11.500000000000004</td><td>2013.5005707762557</td><td>6.525114155251141</td><td>15.72134703196347</td><td>11.500114155251142</td><td>0.0 </td><td>1306.3119584569736 </td><td>0.4167424905220181 </td><td>0.3612073490813649 </td><td>0.8724453840732911 </td><td>0.9630456852791879 </td><td>0.0 </td><td>0.0 </td><td>0.0 
</td><td>1293.9822681953192 </td><td>1643.7390016566796 </td><td>2084.8938637563456 </td><td>0.0 </td><td>0.0 </td><td>0.0 </td><td>4.313046467655992 </td><td>0.5967363891594567 </td><td>1.3799301075268817 </td><td>1017.8258144055944 </td><td>0.0 </td><td>0.0 </td><td>0.0 </td><td>12.578909070073914 </td><td>14.391442968202009</td><td>4.84251968503937 </td><td> </td><td>3.6586768935762226</td><td> </td><td>2.8466076696165192</td><td> </td><td>2.0114942528735633</td><td> </td><td>4.125 </td><td> </td><td>3.0 </td><td> </td><td>1.3784817351598173 </td><td>4.84251968503937 </td><td> </td><td>194.69525681985743 </td><td>9.422169480726348 </td><td>2.4103288784874057</td></tr>\n", "<tr><td>maxs </td><td>2014.0 </td><td>12.0 </td><td>31.0 </td><td>23.0 </td><td>2015.0 </td><td>12.0 </td><td>31.0 </td><td>23.0 </td><td>0.0 </td><td>3657.6000000000004 </td><td>1.0 </td><td>1.0 </td><td>1.0 </td><td>1.0 </td><td>NaN </td><td>NaN </td><td>NaN </td><td>3657.5999 </td><td>3657.5999 </td><td>3657.5999 </td><td>NaN </td><td>NaN </td><td>NaN </td><td>24.400000000000002 </td><td>1.0 </td><td>26.924 </td><td>1042.2113 </td><td>NaN </td><td>NaN </td><td>NaN </td><td>36.1 </td><td>16.0934 </td><td>60.0 </td><td> </td><td>60.0 </td><td> </td><td>36.0 </td><td> </td><td>27.0 </td><td> </td><td>27.0 </td><td> </td><td>3.0 </td><td> </td><td>16.0 </td><td>60.0 </td><td> </td><td>360.0 </td><td>20.580000000000002</td><td>10.8 </td></tr>\n", "<tr><td>sigma </td><td>0.500014270017262</td><td>3.447949723847773 </td><td>8.796498048523272</td><td>6.922384111875021 </td><td>0.50058441171579 </td><td>3.447824054577647</td><td>8.795614888684717</td><td>6.922301652025526 </td><td>0.0 </td><td>995.3398569657211 </td><td>0.4627208309925301 </td><td>0.42770569708047684 </td><td>0.19715569036704708 </td><td>0.08610155981044185 </td><td>-0.0 </td><td>-0.0 </td><td>-0.0 </td><td>962.7430958537232 </td><td>916.7386134899587 </td><td>887.2158475113932 </td><td>-0.0 </td><td>-0.0 </td><td>-0.0 
</td><td>10.973128209713666 </td><td>0.18579201186573496</td><td>2.5621512917896463 </td><td>7.464516971789659 </td><td>-0.0 </td><td>-0.0 </td><td>-0.0 </td><td>10.039673953091574 </td><td>3.6989362303340494</td><td>5.704865769828319</td><td> </td><td>6.133862539123368 </td><td> </td><td>5.805532863642112 </td><td> </td><td>3.1234084426128437</td><td> </td><td>6.15223536610881</td><td> </td><td>0.0 </td><td> </td><td>4.073860627017756 </td><td>5.704865769828319 </td><td> </td><td>106.3500000314393 </td><td>1.8151187111524154</td><td>1.614697905241178 </td></tr>\n", "<tr><td>zeros </td><td>0 </td><td>0 </td><td>0 </td><td>730 </td><td>0 </td><td>0 </td><td>0 </td><td>730 </td><td>17455 </td><td>0 </td><td>8758 </td><td>8758 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>268 </td><td>0 </td><td>501 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>269 </td><td>0 </td><td>0 </td><td> </td><td>0 </td><td> </td><td>0 </td><td> </td><td>0 </td><td> </td><td>0 </td><td> </td><td>0 </td><td> </td><td>14980 </td><td>0 </td><td> </td><td>0 </td><td>0 </td><td>2768 </td></tr>\n", "<tr><td>missing</td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>65 </td><td>10780 </td><td>375 </td><td>375 </td><td>14682 </td><td>16535 </td><td>17520 </td><td>17520 </td><td>17520 </td><td>9103 </td><td>14683 </td><td>16535 </td><td>17520 </td><td>17520 </td><td>17520 </td><td>67 </td><td>67 </td><td>15660 </td><td>360 </td><td>17520 </td><td>17520 </td><td>17520 </td><td>67 </td><td>412 </td><td>14980 </td><td>14980 </td><td>16477 </td><td>16477 </td><td>17181 </td><td>17181 </td><td>17433 </td><td>17433 </td><td>17504 </td><td>17504 </td><td>17518 </td><td>17518 </td><td>0 </td><td>14980 </td><td>14980 </td><td>9382 </td><td>14381 </td><td>1283 </td></tr>\n", "<tr><td>0 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>0.0 </td><td>2013.0 </td><td>1.0 
</td><td>1.0 </td><td>5.0 </td><td>0.0 </td><td>2895.6000000000004 </td><td>1.0 </td><td>0.9 </td><td>1.0 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>2895.5999 </td><td>3352.8 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>-5.0 </td><td>0.5447000000000001 </td><td>nan </td><td>1013.0917000000001 </td><td>nan </td><td>nan </td><td>nan </td><td>3.3000000000000003 </td><td>16.0934 </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>0.0 </td><td>nan </td><td> </td><td>nan </td><td>nan </td><td>2.57 </td></tr>\n", "<tr><td>1 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>1.0 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>6.0 </td><td>0.0 </td><td>3048.0 </td><td>1.0 </td><td>1.0 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>3048.0 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>-4.4 </td><td>0.5463 </td><td>nan </td><td>1012.0759 </td><td>nan </td><td>nan </td><td>nan </td><td>3.9000000000000004 </td><td>16.0934 </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>0.0 </td><td>nan </td><td> </td><td>260.0 </td><td>9.77 </td><td>4.63 </td></tr>\n", "<tr><td>2 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>2.0 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>7.0 </td><td>0.0 </td><td>1828.8000000000002 </td><td>1.0 </td><td>1.0 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>1828.7999 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>-3.3000000000000003</td><td>0.619 </td><td>nan </td><td>1012.4145000000001 </td><td>nan </td><td>nan </td><td>nan </td><td>3.3000000000000003 </td><td>16.0934 </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>0.0 
</td><td>nan </td><td> </td><td>nan </td><td>7.72 </td><td>1.54 </td></tr>\n", "<tr><td>3 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>3.0 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>8.0 </td><td>0.0 </td><td>1463.0 </td><td>1.0 </td><td>1.0 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>1463.04 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>-2.8000000000000003</td><td>0.6159 </td><td>nan </td><td>1012.4145000000001 </td><td>nan </td><td>nan </td><td>nan </td><td>3.9000000000000004 </td><td>16.0934 </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>0.0 </td><td>nan </td><td> </td><td>nan </td><td>nan </td><td>3.09 </td></tr>\n", "<tr><td>4 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>4.0 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>9.0 </td><td>0.0 </td><td>1402.1000000000001 </td><td>1.0 </td><td>1.0 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>1402.0800000000002 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>-2.8000000000000003</td><td>0.6159 </td><td>nan </td><td>1012.7531 </td><td>nan </td><td>nan </td><td>nan </td><td>3.9000000000000004 </td><td>16.0934 </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>0.0 </td><td>nan </td><td> </td><td>260.0 </td><td>nan </td><td>4.12 </td></tr>\n", "<tr><td>5 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>5.0 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>10.0 </td><td>0.0 </td><td>1524.0 </td><td>1.0 </td><td>1.0 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>1524.0 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>-2.8000000000000003</td><td>0.6159 </td><td>nan </td><td>1012.4145000000001 </td><td>nan </td><td>nan </td><td>nan 
</td><td>3.9000000000000004 </td><td>16.0934 </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>0.0 </td><td>nan </td><td> </td><td>nan </td><td>nan </td><td>3.09 </td></tr>\n", "<tr><td>6 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>6.0 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>11.0 </td><td>0.0 </td><td>1524.0 </td><td>1.0 </td><td>1.0 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>1524.0 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>-3.3000000000000003</td><td>0.5934 </td><td>nan </td><td>1012.0759 </td><td>nan </td><td>nan </td><td>nan </td><td>3.9000000000000004 </td><td>16.0934 </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>0.0 </td><td>nan </td><td> </td><td>nan </td><td>9.26 </td><td>3.09 </td></tr>\n", "<tr><td>7 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>7.0 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>12.0 </td><td>0.0 </td><td>1524.0 </td><td>1.0 </td><td>1.0 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>1524.0 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>-3.3000000000000003</td><td>0.5934 </td><td>nan </td><td>1012.4145000000001 </td><td>nan </td><td>nan </td><td>nan </td><td>3.9000000000000004 </td><td>16.0934 </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>0.0 </td><td>nan </td><td> </td><td>260.0 </td><td>9.26 </td><td>4.63 </td></tr>\n", "<tr><td>8 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>8.0 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>13.0 </td><td>0.0 </td><td>1524.0 </td><td>1.0 </td><td>1.0 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>1524.0 </td><td>nan </td><td>nan 
</td><td>nan </td><td>nan </td><td>nan </td><td>-2.8000000000000003</td><td>0.6425000000000001 </td><td>nan </td><td>1012.4145000000001 </td><td>nan </td><td>nan </td><td>nan </td><td>3.3000000000000003 </td><td>16.0934 </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>0.0 </td><td>nan </td><td> </td><td>260.0 </td><td>nan </td><td>3.09 </td></tr>\n", "<tr><td>9 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>9.0 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>14.0 </td><td>0.0 </td><td>1524.0 </td><td>1.0 </td><td>0.9 </td><td>1.0 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>1524.0 </td><td>3657.5999 </td><td>nan </td><td>nan </td><td>nan </td><td>nan </td><td>-2.8000000000000003</td><td>0.6159 </td><td>nan </td><td>1012.4145000000001 </td><td>nan </td><td>nan </td><td>nan </td><td>3.9000000000000004 </td><td>16.0934 </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>nan </td><td> </td><td>0.0 </td><td>nan </td><td> </td><td>nan </td><td>9.26 </td><td>3.09 </td></tr>\n", "</tbody>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# ----------\n", "# 5- Now let's add some weather\n", "# Load weather data\n", "wthr1 = h2o.import_file(path=[mylocate(\"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv\")])\n", "# Peek at the data\n", "wthr1.describe()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows:17520\n", "Cols:9\n", "\n", "\n" ] }, { "data": { "text/html": [ "<table>\n", "<thead>\n", "<tr><th> </th><th>Year Local </th><th>Month Local </th><th>Day Local </th><th>Hour Local </th><th>Dew Point (C) </th><th>Humidity Fraction 
</th><th>Rain (mm) </th><th>Temperature (C) </th><th>WC1 </th></tr>\n", "</thead>\n", "<tbody>\n", "<tr><td>type </td><td>int </td><td>int </td><td>int </td><td>int </td><td>real </td><td>real </td><td>real </td><td>real </td><td>enum </td></tr>\n", "<tr><td>mins </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>0.0 </td><td>-26.700000000000003</td><td>0.12510000000000002</td><td>0.0 </td><td>-15.600000000000001</td><td> </td></tr>\n", "<tr><td>mean </td><td>2013.5 </td><td>6.5260273972602745</td><td>15.72054794520548</td><td>11.500000000000004</td><td>4.313046467655992 </td><td>0.5967363891594567 </td><td>1.3799301075268817</td><td>12.578909070073914 </td><td> </td></tr>\n", "<tr><td>maxs </td><td>2014.0 </td><td>12.0 </td><td>31.0 </td><td>23.0 </td><td>24.400000000000002 </td><td>1.0 </td><td>26.924 </td><td>36.1 </td><td> </td></tr>\n", "<tr><td>sigma </td><td>0.500014270017262</td><td>3.447949723847773 </td><td>8.796498048523272</td><td>6.922384111875021 </td><td>10.973128209713666 </td><td>0.18579201186573496</td><td>2.5621512917896463</td><td>10.039673953091574 </td><td> </td></tr>\n", "<tr><td>zeros </td><td>0 </td><td>0 </td><td>0 </td><td>730 </td><td>268 </td><td>0 </td><td>501 </td><td>269 </td><td> </td></tr>\n", "<tr><td>missing</td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>67 </td><td>67 </td><td>15660 </td><td>67 </td><td>14980</td></tr>\n", "<tr><td>0 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>0.0 </td><td>-5.0 </td><td>0.5447000000000001 </td><td>nan </td><td>3.3000000000000003 </td><td> </td></tr>\n", "<tr><td>1 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>1.0 </td><td>-4.4 </td><td>0.5463 </td><td>nan </td><td>3.9000000000000004 </td><td> </td></tr>\n", "<tr><td>2 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>2.0 </td><td>-3.3000000000000003</td><td>0.619 </td><td>nan </td><td>3.3000000000000003 </td><td> </td></tr>\n", "<tr><td>3 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>3.0 
</td><td>-2.8000000000000003</td><td>0.6159 </td><td>nan </td><td>3.9000000000000004 </td><td> </td></tr>\n", "<tr><td>4 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>4.0 </td><td>-2.8000000000000003</td><td>0.6159 </td><td>nan </td><td>3.9000000000000004 </td><td> </td></tr>\n", "<tr><td>5 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>5.0 </td><td>-2.8000000000000003</td><td>0.6159 </td><td>nan </td><td>3.9000000000000004 </td><td> </td></tr>\n", "<tr><td>6 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>6.0 </td><td>-3.3000000000000003</td><td>0.5934 </td><td>nan </td><td>3.9000000000000004 </td><td> </td></tr>\n", "<tr><td>7 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>7.0 </td><td>-3.3000000000000003</td><td>0.5934 </td><td>nan </td><td>3.9000000000000004 </td><td> </td></tr>\n", "<tr><td>8 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>8.0 </td><td>-2.8000000000000003</td><td>0.6425000000000001 </td><td>nan </td><td>3.3000000000000003 </td><td> </td></tr>\n", "<tr><td>9 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>9.0 </td><td>-2.8000000000000003</td><td>0.6159 </td><td>nan </td><td>3.9000000000000004 </td><td> </td></tr>\n", "</tbody>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Lots of columns in there! Let's plan on converting to time-since-epoch to do\n", "# a 'join' with the bike data, plus gather weather info that might affect\n", "# cyclists - rain, snow, temperature. Alas, drop the \"snow\" column since it's\n", "# all NAs. Also add in dew point and humidity just in case. 
Slice out just\n", "# the columns of interest and drop the rest.\n", "wthr2 = wthr1[[\"Year Local\",\"Month Local\",\"Day Local\",\"Hour Local\",\"Dew Point (C)\",\"Humidity Fraction\",\"Precipitation One Hour (mm)\",\"Temperature (C)\",\"Weather Code 1/ Description\"]]\n", "\n", "wthr2.set_name(wthr2.names.index(\"Precipitation One Hour (mm)\"), \"Rain (mm)\")\n", "wthr2.set_name(wthr2.names.index(\"Weather Code 1/ Description\"), \"WC1\")\n", "wthr2.describe()\n", "# Much better! " ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Filter down to the weather at Noon\n", "wthr3 = wthr2[ wthr2[\"Hour Local\"]==12 ]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows:730\n", "Cols:11\n", "\n", "\n" ] }, { "data": { "text/html": [ "<table>\n", "<thead>\n", "<tr><th> </th><th>Year Local </th><th>Month Local </th><th>Day Local </th><th>Hour Local </th><th>Dew Point (C) </th><th>Humidity Fraction </th><th>Rain (mm) </th><th>Temperature (C) </th><th>WC1 </th><th>msec </th><th>Days </th></tr>\n", "</thead>\n", "<tbody>\n", "<tr><td>type </td><td>int </td><td>int </td><td>int </td><td>int </td><td>real </td><td>real </td><td>real </td><td>real </td><td>enum </td><td>int </td><td>int </td></tr>\n", "<tr><td>mins </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>12.0 </td><td>-26.700000000000003</td><td>0.1723 </td><td>0.0 </td><td>-13.9 </td><td> </td><td>1357070400000.0 </td><td>15706.0 </td></tr>\n", "<tr><td>mean </td><td>2013.5 </td><td>6.526027397260274</td><td>15.72054794520548</td><td>12.0 </td><td>4.230123796423659 </td><td>0.539728198074278 </td><td>1.5312571428571429</td><td>14.068775790921595</td><td> </td><td>1388560852602.7397</td><td>16070.5 </td></tr>\n", "<tr><td>maxs </td><td>2014.0 </td><td>12.0 </td><td>31.0 </td><td>12.0 </td><td>23.3 </td><td>1.0 </td><td>12.446 </td><td>34.4 
</td><td> </td><td>1420056000000.0 </td><td>16435.0 </td></tr>\n", "<tr><td>sigma </td><td>0.5003428180039172</td><td>3.450215293068149</td><td>8.802278027009615</td><td>0.0 </td><td>11.106296472475226 </td><td>0.17994502792324327</td><td>2.3606424861490587</td><td>10.398985514891212</td><td> </td><td>18219740080.410755</td><td>210.87713642466474</td></tr>\n", "<tr><td>zeros </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>14 </td><td>0 </td><td>15 </td><td>7 </td><td> </td><td>0 </td><td>0 </td></tr>\n", "<tr><td>missing</td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>3 </td><td>3 </td><td>660 </td><td>3 </td><td>620 </td><td>0 </td><td>0 </td></tr>\n", "<tr><td>0 </td><td>2013.0 </td><td>1.0 </td><td>1.0 </td><td>12.0 </td><td>-3.3000000000000003</td><td>0.5934 </td><td>nan </td><td>3.9000000000000004</td><td> </td><td>1357070400000.0 </td><td>15706.0 </td></tr>\n", "<tr><td>1 </td><td>2013.0 </td><td>1.0 </td><td>2.0 </td><td>12.0 </td><td>-11.700000000000001</td><td>0.4806 </td><td>nan </td><td>-2.2 </td><td> </td><td>1357156800000.0 </td><td>15707.0 </td></tr>\n", "<tr><td>2 </td><td>2013.0 </td><td>1.0 </td><td>3.0 </td><td>12.0 </td><td>-10.600000000000001</td><td>0.5248 </td><td>nan </td><td>-2.2 </td><td> </td><td>1357243200000.0 </td><td>15708.0 </td></tr>\n", "<tr><td>3 </td><td>2013.0 </td><td>1.0 </td><td>4.0 </td><td>12.0 </td><td>-7.2 </td><td>0.49760000000000004</td><td>nan </td><td>2.2 </td><td> </td><td>1357329600000.0 </td><td>15709.0 </td></tr>\n", "<tr><td>4 </td><td>2013.0 </td><td>1.0 </td><td>5.0 </td><td>12.0 </td><td>-7.2 </td><td>0.42600000000000005</td><td>nan </td><td>4.4 </td><td> </td><td>1357416000000.0 </td><td>15710.0 </td></tr>\n", "<tr><td>5 </td><td>2013.0 </td><td>1.0 </td><td>6.0 </td><td>12.0 </td><td>-1.7000000000000002</td><td>0.6451 </td><td>nan </td><td>4.4 </td><td>haze </td><td>1357502400000.0 </td><td>15711.0 </td></tr>\n", "<tr><td>6 </td><td>2013.0 </td><td>1.0 </td><td>7.0 </td><td>12.0 
</td><td>-6.1000000000000005</td><td>0.41190000000000004</td><td>nan </td><td>6.1000000000000005</td><td> </td><td>1357588800000.0 </td><td>15712.0 </td></tr>\n", "<tr><td>7 </td><td>2013.0 </td><td>1.0 </td><td>8.0 </td><td>12.0 </td><td>-1.7000000000000002</td><td>0.5314 </td><td>nan </td><td>7.2 </td><td> </td><td>1357675200000.0 </td><td>15713.0 </td></tr>\n", "<tr><td>8 </td><td>2013.0 </td><td>1.0 </td><td>9.0 </td><td>12.0 </td><td>0.6000000000000001 </td><td>0.56 </td><td>nan </td><td>8.9 </td><td>haze </td><td>1357761600000.0 </td><td>15714.0 </td></tr>\n", "<tr><td>9 </td><td>2013.0 </td><td>1.0 </td><td>10.0 </td><td>12.0 </td><td>-6.1000000000000005</td><td>0.3952 </td><td>nan </td><td>6.7 </td><td> </td><td>1357848000000.0 </td><td>15715.0 </td></tr>\n", "</tbody>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Let's now get Days since the epoch... we'll convert year/month/day into Epoch\n", "# time, and then back to Epoch days. Need zero-based month and days, but have\n", "# 1-based. Note: despite its name, secsPerDay holds milliseconds per day.\n", "wthr3[\"msec\"] = h2o.H2OFrame.mktime(year=wthr3[\"Year Local\"], month=wthr3[\"Month Local\"]-1, day=wthr3[\"Day Local\"]-1, hour=wthr3[\"Hour Local\"])\n", "secsPerDay=1000*60*60*24\n", "wthr3[\"Days\"] = (wthr3[\"msec\"]/secsPerDay).floor()\n", "wthr3.describe()\n", "# msec looks sane (numbers like 1.3e12 are in the correct range for msec since\n", "# 1970). Epoch Days matches closely with the epoch day numbers from the\n", "# CitiBike dataset. 
" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Let's drop off the extra time columns to make an easy-to-handle dataset.\n", "wthr4 = wthr3.drop(\"Year Local\").drop(\"Month Local\").drop(\"Day Local\").drop(\"Hour Local\").drop(\"msec\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Also, most rain numbers are missing - let's assume those are zero rain days\n", "rain = wthr4[\"Rain (mm)\"]\n", "rain[ rain.isna() ] = 0\n", "wthr4[\"Rain (mm)\"] = rain" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Merge Daily Weather with Bikes-Per-Day\n", "Rows:10131\n", "Cols:10\n", "\n", "\n" ] }, { "data": { "text/html": [ "<table>\n", "<thead>\n", "<tr><th> </th><th>Days </th><th>start station name </th><th>bikes </th><th>Month </th><th>DayOfWeek </th><th>Dew Point (C) </th><th>Humidity Fraction </th><th>Rain (mm) </th><th>Temperature (C) </th><th>WC1 </th></tr>\n", "</thead>\n", "<tbody>\n", "<tr><td>type </td><td>int </td><td>enum </td><td>int </td><td>enum </td><td>enum </td><td>real </td><td>real </td><td>real </td><td>real </td><td>enum </td></tr>\n", "<tr><td>mins </td><td>15979.0 </td><td> </td><td>1.0 </td><td> </td><td> </td><td>-2.2 </td><td>0.34850000000000003</td><td>0.0 </td><td>9.4 </td><td> </td></tr>\n", "<tr><td>mean </td><td>15993.953311617806</td><td> </td><td>102.42937518507551</td><td> </td><td> </td><td>7.60732405488106 </td><td>0.5564958839206396 </td><td>0.008198400947586611</td><td>16.937094067712962</td><td> </td></tr>\n", "<tr><td>maxs </td><td>16009.0 </td><td> </td><td>603.0 </td><td> </td><td> </td><td>19.400000000000002</td><td>0.8718 </td><td>0.254 </td><td>26.1 </td><td> </td></tr>\n", "<tr><td>sigma </td><td>8.950698111468864 </td><td> </td><td>74.05933443246006 </td><td> </td><td> 
</td><td>6.516386487040385 </td><td>0.14811201086649933</td><td>0.04489297266255909 </td><td>4.362687300129602 </td><td> </td></tr>\n", "<tr><td>zeros </td><td>0 </td><td> </td><td>0 </td><td> </td><td> </td><td>0 </td><td>0 </td><td>9804 </td><td>0 </td><td> </td></tr>\n", "<tr><td>missing</td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>0 </td><td>8816 </td></tr>\n", "<tr><td>0 </td><td>15979.0 </td><td>1 Ave & E 15 St </td><td>173.0 </td><td>10 </td><td>Tue </td><td>10.600000000000001</td><td>0.4315 </td><td>0.0 </td><td>23.900000000000002</td><td> </td></tr>\n", "<tr><td>1 </td><td>15979.0 </td><td>1 Ave & E 18 St </td><td>118.0 </td><td>10 </td><td>Tue </td><td>10.600000000000001</td><td>0.4315 </td><td>0.0 </td><td>23.900000000000002</td><td> </td></tr>\n", "<tr><td>2 </td><td>15979.0 </td><td>1 Ave & E 30 St </td><td>152.0 </td><td>10 </td><td>Tue </td><td>10.600000000000001</td><td>0.4315 </td><td>0.0 </td><td>23.900000000000002</td><td> </td></tr>\n", "<tr><td>3 </td><td>15979.0 </td><td>10 Ave & W 28 St </td><td>115.0 </td><td>10 </td><td>Tue </td><td>10.600000000000001</td><td>0.4315 </td><td>0.0 </td><td>23.900000000000002</td><td> </td></tr>\n", "<tr><td>4 </td><td>15979.0 </td><td>11 Ave & W 27 St </td><td>210.0 </td><td>10 </td><td>Tue </td><td>10.600000000000001</td><td>0.4315 </td><td>0.0 </td><td>23.900000000000002</td><td> </td></tr>\n", "<tr><td>5 </td><td>15979.0 </td><td>11 Ave & W 41 St </td><td>106.0 </td><td>10 </td><td>Tue </td><td>10.600000000000001</td><td>0.4315 </td><td>0.0 </td><td>23.900000000000002</td><td> </td></tr>\n", "<tr><td>6 </td><td>15979.0 </td><td>12 Ave & W 40 St </td><td>144.0 </td><td>10 </td><td>Tue </td><td>10.600000000000001</td><td>0.4315 </td><td>0.0 </td><td>23.900000000000002</td><td> </td></tr>\n", "<tr><td>7 </td><td>15979.0 </td><td>2 Ave & E 31 St </td><td>206.0 </td><td>10 </td><td>Tue </td><td>10.600000000000001</td><td>0.4315 </td><td>0.0 
</td><td>23.900000000000002</td><td> </td></tr>\n", "<tr><td>8 </td><td>15979.0 </td><td>2 Ave & E 58 St </td><td>105.0 </td><td>10 </td><td>Tue </td><td>10.600000000000001</td><td>0.4315 </td><td>0.0 </td><td>23.900000000000002</td><td> </td></tr>\n", "<tr><td>9 </td><td>15979.0 </td><td>3 Ave & Schermerhorn St</td><td>15.0 </td><td>10 </td><td>Tue </td><td>10.600000000000001</td><td>0.4315 </td><td>0.0 </td><td>23.900000000000002</td><td> </td></tr>\n", "</tbody>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "<table>\n", "<thead>\n", "<tr><th style=\"text-align: right;\"> Days</th><th>start station name </th><th style=\"text-align: right;\"> bikes</th><th style=\"text-align: right;\"> Month</th><th>DayOfWeek </th><th style=\"text-align: right;\"> Dew Point (C)</th><th style=\"text-align: right;\"> Humidity Fraction</th><th style=\"text-align: right;\"> Rain (mm)</th><th style=\"text-align: right;\"> Temperature (C)</th><th>WC1 </th></tr>\n", "</thead>\n", "<tbody>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>1 Ave & E 15 St </td><td style=\"text-align: right;\"> 173</td><td style=\"text-align: right;\"> 10</td><td>Tue </td><td style=\"text-align: right;\"> 10.6</td><td style=\"text-align: right;\"> 0.4315</td><td style=\"text-align: right;\"> 0</td><td style=\"text-align: right;\"> 23.9</td><td> </td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>1 Ave & E 18 St </td><td style=\"text-align: right;\"> 118</td><td style=\"text-align: right;\"> 10</td><td>Tue </td><td style=\"text-align: right;\"> 10.6</td><td style=\"text-align: right;\"> 0.4315</td><td style=\"text-align: right;\"> 0</td><td style=\"text-align: right;\"> 23.9</td><td> </td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>1 Ave & E 30 St </td><td style=\"text-align: right;\"> 152</td><td style=\"text-align: right;\"> 10</td><td>Tue </td><td style=\"text-align: right;\"> 10.6</td><td style=\"text-align: 
right;\"> 0.4315</td><td style=\"text-align: right;\"> 0</td><td style=\"text-align: right;\"> 23.9</td><td> </td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>10 Ave & W 28 St </td><td style=\"text-align: right;\"> 115</td><td style=\"text-align: right;\"> 10</td><td>Tue </td><td style=\"text-align: right;\"> 10.6</td><td style=\"text-align: right;\"> 0.4315</td><td style=\"text-align: right;\"> 0</td><td style=\"text-align: right;\"> 23.9</td><td> </td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>11 Ave & W 27 St </td><td style=\"text-align: right;\"> 210</td><td style=\"text-align: right;\"> 10</td><td>Tue </td><td style=\"text-align: right;\"> 10.6</td><td style=\"text-align: right;\"> 0.4315</td><td style=\"text-align: right;\"> 0</td><td style=\"text-align: right;\"> 23.9</td><td> </td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>11 Ave & W 41 St </td><td style=\"text-align: right;\"> 106</td><td style=\"text-align: right;\"> 10</td><td>Tue </td><td style=\"text-align: right;\"> 10.6</td><td style=\"text-align: right;\"> 0.4315</td><td style=\"text-align: right;\"> 0</td><td style=\"text-align: right;\"> 23.9</td><td> </td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>12 Ave & W 40 St </td><td style=\"text-align: right;\"> 144</td><td style=\"text-align: right;\"> 10</td><td>Tue </td><td style=\"text-align: right;\"> 10.6</td><td style=\"text-align: right;\"> 0.4315</td><td style=\"text-align: right;\"> 0</td><td style=\"text-align: right;\"> 23.9</td><td> </td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>2 Ave & E 31 St </td><td style=\"text-align: right;\"> 206</td><td style=\"text-align: right;\"> 10</td><td>Tue </td><td style=\"text-align: right;\"> 10.6</td><td style=\"text-align: right;\"> 0.4315</td><td style=\"text-align: right;\"> 0</td><td style=\"text-align: right;\"> 23.9</td><td> </td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>2 Ave & E 58 
St </td><td style=\"text-align: right;\"> 105</td><td style=\"text-align: right;\"> 10</td><td>Tue </td><td style=\"text-align: right;\"> 10.6</td><td style=\"text-align: right;\"> 0.4315</td><td style=\"text-align: right;\"> 0</td><td style=\"text-align: right;\"> 23.9</td><td> </td></tr>\n", "<tr><td style=\"text-align: right;\"> 15979</td><td>3 Ave & Schermerhorn St</td><td style=\"text-align: right;\"> 15</td><td style=\"text-align: right;\"> 10</td><td>Tue </td><td style=\"text-align: right;\"> 10.6</td><td style=\"text-align: right;\"> 0.4315</td><td style=\"text-align: right;\"> 0</td><td style=\"text-align: right;\"> 23.9</td><td> </td></tr>\n", "</tbody>\n", "</table>" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# ----------\n", "# 6 - Join the weather data-per-day to the bike-starts-per-day\n", "print(\"Merge Daily Weather with Bikes-Per-Day\")\n", "bpd_with_weather = bpd.merge(wthr4,all_x=True,all_y=False)\n", "bpd_with_weather.describe()\n", "bpd_with_weather.show()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data has 10 columns and 6066 rows, test has 3044 rows, holdout has 1021\n", "gbm Model Build progress: |███████████████████████████████████████████████| 100%\n", "drf Model Build progress: |███████████████████████████████████████████████| 100%\n", "glm Model Build progress: |███████████████████████████████████████████████| 100%\n", "deeplearning Model Build progress: |██████████████████████████████████████| 100%\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td><b>Model</b></td>\n", "<td><b>mse TRAIN</b></td>\n", "<td><b>mse TEST</b></td>\n", "<td><b>mse HOLDOUT</b></td>\n", "<td><b>Model Training Time (s)</b></td></tr>\n", "<tr><td>GBM</td>\n", "<td>0.2159977</td>\n", "<td>393.0248269</td>\n", "<td>404.2520310</td>\n", "<td>8.679</td></tr>\n", 
"<tr><td>DRF</td>\n", "<td>804.2152039</td>\n", "<td>1703.1540562</td>\n", "<td>1782.0854925</td>\n", "<td>6.573</td></tr>\n", "<tr><td>GLM</td>\n", "<td>620.8814844</td>\n", "<td>735.9622856</td>\n", "<td>789.7891737</td>\n", "<td>0.241</td></tr>\n", "<tr><td>DL </td>\n", "<td>213.8582644</td>\n", "<td>454.7871732</td>\n", "<td>476.5995571</td>\n", "<td>7.518</td></tr></table></div>" ], "text/plain": [ "Model mse TRAIN mse TEST mse HOLDOUT Model Training Time (s)\n", "------- ----------- ---------- ------------- -------------------------\n", "GBM 0.215998 393.025 404.252 8.679\n", "DRF 804.215 1703.15 1782.09 6.573\n", "GLM 620.881 735.962 789.789 0.241\n", "DL 213.858 454.787 476.6 7.518" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# 7 - Test/Train split again, model build again, this time with weather\n", "split_fit_predict(bpd_with_weather)" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }