{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import h2o\n",
    "import time\n",
    "from h2o.estimators.glm import H2OGeneralizedLinearEstimator\n",
    "from h2o.estimators.gbm import H2OGradientBoostingEstimator\n",
    "from h2o.estimators.random_forest import H2ORandomForestEstimator\n",
    "from h2o.estimators.deeplearning import H2ODeepLearningEstimator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Checking whether there is an H2O instance running at http://localhost:54321. connected.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td>H2O cluster uptime:</td>\n",
       "<td>08 secs</td></tr>\n",
       "<tr><td>H2O cluster version:</td>\n",
       "<td>3.11.0.99999</td></tr>\n",
       "<tr><td>H2O cluster version age:</td>\n",
       "<td>1 minute </td></tr>\n",
       "<tr><td>H2O cluster name:</td>\n",
       "<td>pasha</td></tr>\n",
       "<tr><td>H2O cluster total nodes:</td>\n",
       "<td>1</td></tr>\n",
       "<tr><td>H2O cluster free memory:</td>\n",
       "<td>3.556 Gb</td></tr>\n",
       "<tr><td>H2O cluster total cores:</td>\n",
       "<td>8</td></tr>\n",
       "<tr><td>H2O cluster allowed cores:</td>\n",
       "<td>8</td></tr>\n",
       "<tr><td>H2O cluster status:</td>\n",
       "<td>accepting new members, healthy</td></tr>\n",
       "<tr><td>H2O connection url:</td>\n",
       "<td>http://localhost:54321</td></tr>\n",
       "<tr><td>H2O connection proxy:</td>\n",
       "<td>None</td></tr>\n",
       "<tr><td>Python version:</td>\n",
       "<td>3.5.2 final</td></tr></table></div>"
      ],
      "text/plain": [
       "--------------------------  ------------------------------\n",
       "H2O cluster uptime:         08 secs\n",
       "H2O cluster version:        3.11.0.99999\n",
       "H2O cluster version age:    1 minute\n",
       "H2O cluster name:           pasha\n",
       "H2O cluster total nodes:    1\n",
       "H2O cluster free memory:    3.556 Gb\n",
       "H2O cluster total cores:    8\n",
       "H2O cluster allowed cores:  8\n",
       "H2O cluster status:         accepting new members, healthy\n",
       "H2O connection url:         http://localhost:54321\n",
       "H2O connection proxy:\n",
       "Python version:             3.5.2 final\n",
       "--------------------------  ------------------------------"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Explore a typical Data Science workflow with H2O and Python\n",
    "#\n",
    "# Goal: assist the manager of CitiBike of NYC to load-balance the bicycles\n",
    "# across the CitiBike network of stations, by predicting the number of bike\n",
    "# trips that start at each station on each day.  Use 10 million rows of\n",
    "# historical data, and eventually add weather data.\n",
    "\n",
    "\n",
    "# Connect to a cluster.  Called with no arguments, h2o.init() looked for an\n",
    "# H2O instance at http://localhost:54321 in the recorded run below.\n",
    "h2o.init()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.\n",
    "\n",
    "# Flip this to True to read the data straight from S3 rather than from the\n",
    "# local h2o git checkout; handy when the cluster itself runs in EC2.\n",
    "data_source_is_s3 = False\n",
    "\n",
    "def mylocate(s):\n",
    "    \"\"\"Map the relative data path `s` to a location H2O can import.\n",
    "\n",
    "    Gives an s3n:// URL inside the h2o-public-test-data bucket when\n",
    "    data_source_is_s3 is set; otherwise returns whatever _locate(s) finds.\n",
    "    \"\"\"\n",
    "    if not data_source_is_s3:\n",
    "        return _locate(s)\n",
    "    return \"s3n://h2o-public-test-data/\" + s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Import and Parse bike data\n",
      "Parse progress: |█████████████████████████████████████████████████████████| 100%\n"
     ]
    }
   ],
   "source": [
    "# Two input choices: a single month (small demo) or fourteen months (big demo,\n",
    "# about 10M rows).\n",
    "small_test = [mylocate(\"bigdata/laptop/citibike-nyc/2013-10.csv\")]\n",
    "\n",
    "# One CSV per month, from July 2013 through August 2014 inclusive.\n",
    "big_test = [mylocate(\"bigdata/laptop/citibike-nyc/%d-%02d.csv\" % (year, month))\n",
    "            for year, months in ((2013, range(7, 13)), (2014, range(1, 9)))\n",
    "            for month in months]\n",
    "\n",
    "# ----------\n",
    "\n",
    "# 1- Load data: one row per bicycle trip, with columns for the start and end\n",
    "# station, the trip duration and the trip start time and day.  The larger\n",
    "# dataset totals about 10 million rows.  Pass big_test instead of small_test\n",
    "# to run the big demo.\n",
    "print(\"Import and Parse bike data\")\n",
    "data = h2o.import_file(path=small_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Rows:1037712\n",
      "Cols:16\n",
      "\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead>\n",
       "<tr><th>       </th><th>tripduration      </th><th>starttime          </th><th>stoptime           </th><th>start station id  </th><th>start station name      </th><th>start station latitude  </th><th>start station longitude  </th><th>end station id    </th><th>end station name              </th><th>end station latitude  </th><th>end station longitude  </th><th>bikeid            </th><th>usertype  </th><th>birth year        </th><th>gender           </th><th>Days              </th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "<tr><td>type   </td><td>int               </td><td>time               </td><td>time               </td><td>int               </td><td>enum                    </td><td>real                    </td><td>real                     </td><td>int               </td><td>enum                          </td><td>real                  </td><td>real                   </td><td>int               </td><td>enum      </td><td>int               </td><td>int              </td><td>int               </td></tr>\n",
       "<tr><td>mins   </td><td>60.0              </td><td>1380585668000.0    </td><td>1380585883000.0    </td><td>72.0              </td><td>                        </td><td>40.680342423            </td><td>-74.01713445             </td><td>72.0              </td><td>                              </td><td>40.680342423          </td><td>-74.01713445           </td><td>14529.0           </td><td>          </td><td>1899.0            </td><td>0.0              </td><td>15979.0           </td></tr>\n",
       "<tr><td>mean   </td><td>825.6147543827192 </td><td>1381888516917.714  </td><td>1381889342532.4746 </td><td>443.7142126139049 </td><td>                        </td><td>40.73451885864454       </td><td>-73.99113288482197       </td><td>443.20742171238254</td><td>                              </td><td>40.73428478848875     </td><td>-73.99127029824423     </td><td>17644.071645119242</td><td>          </td><td>1975.7783948601839</td><td>1.123755916863252</td><td>15993.476745956474</td></tr>\n",
       "<tr><td>maxs   </td><td>1259480.0         </td><td>1383263997000.0    </td><td>1383393310000.0    </td><td>3002.0            </td><td>                        </td><td>40.770513               </td><td>-73.9500479759           </td><td>3002.0            </td><td>                              </td><td>40.770513             </td><td>-73.9500479759         </td><td>20757.0           </td><td>          </td><td>1997.0            </td><td>2.0              </td><td>16009.0           </td></tr>\n",
       "<tr><td>sigma  </td><td>2000.3732322961862</td><td>778871729.1323168  </td><td>778847387.5037588  </td><td>354.43432507453724</td><td>                        </td><td>0.01957340730530415     </td><td>0.012316123410581171     </td><td>357.39821705755827</td><td>                              </td><td>0.019557845811587957  </td><td>0.012385581196537298   </td><td>1717.6811213447866</td><td>          </td><td>11.131490623834942</td><td>0.544380593291009</td><td>9.014533519116712 </td></tr>\n",
       "<tr><td>zeros  </td><td>0                 </td><td>0                  </td><td>0                  </td><td>0                 </td><td>                        </td><td>0                       </td><td>0                        </td><td>0                 </td><td>                              </td><td>0                     </td><td>0                      </td><td>0                 </td><td>          </td><td>0                 </td><td>97498            </td><td>0                 </td></tr>\n",
       "<tr><td>missing</td><td>0                 </td><td>0                  </td><td>0                  </td><td>0                 </td><td>0                       </td><td>0                       </td><td>0                        </td><td>0                 </td><td>0                             </td><td>0                     </td><td>0                      </td><td>0                 </td><td>0         </td><td>97445             </td><td>0                </td><td>0                 </td></tr>\n",
       "<tr><td>0      </td><td>326.0             </td><td>2013-10-01 00:01:08</td><td>2013-10-01 00:06:34</td><td>239.0             </td><td>Willoughby St & Fleet St</td><td>40.69196566             </td><td>-73.9813018              </td><td>366.0             </td><td>Clinton Ave & Myrtle Ave      </td><td>40.693261             </td><td>-73.968896             </td><td>16052.0           </td><td>Subscriber</td><td>1982.0            </td><td>1.0              </td><td>15979.0           </td></tr>\n",
       "<tr><td>1      </td><td>729.0             </td><td>2013-10-01 00:01:21</td><td>2013-10-01 00:13:30</td><td>322.0             </td><td>Clinton St & Tillary St </td><td>40.696191999999996      </td><td>-73.991218               </td><td>398.0             </td><td>Atlantic Ave & Furman St      </td><td>40.69165183           </td><td>-73.99997859999999     </td><td>19412.0           </td><td>Customer  </td><td>nan               </td><td>0.0              </td><td>15979.0           </td></tr>\n",
       "<tr><td>2      </td><td>520.0             </td><td>2013-10-01 00:01:24</td><td>2013-10-01 00:10:04</td><td>174.0             </td><td>E 25 St & 1 Ave         </td><td>40.7381765              </td><td>-73.97738662             </td><td>403.0             </td><td>E 2 St & 2 Ave                </td><td>40.72502876           </td><td>-73.99069656           </td><td>19645.0           </td><td>Subscriber</td><td>1984.0            </td><td>1.0              </td><td>15979.0           </td></tr>\n",
       "<tr><td>3      </td><td>281.0             </td><td>2013-10-01 00:01:25</td><td>2013-10-01 00:06:06</td><td>430.0             </td><td>York St & Jay St        </td><td>40.7014851              </td><td>-73.98656928             </td><td>323.0             </td><td>Lawrence St & Willoughby St   </td><td>40.69236178           </td><td>-73.98631746           </td><td>16992.0           </td><td>Subscriber</td><td>1985.0            </td><td>1.0              </td><td>15979.0           </td></tr>\n",
       "<tr><td>4      </td><td>196.0             </td><td>2013-10-01 00:01:27</td><td>2013-10-01 00:04:43</td><td>403.0             </td><td>E 2 St & 2 Ave          </td><td>40.72502876             </td><td>-73.99069656             </td><td>401.0             </td><td>Allen St & Rivington St       </td><td>40.72019576           </td><td>-73.98997825000001     </td><td>15690.0           </td><td>Subscriber</td><td>1986.0            </td><td>1.0              </td><td>15979.0           </td></tr>\n",
       "<tr><td>5      </td><td>1948.0            </td><td>2013-10-01 00:01:48</td><td>2013-10-01 00:34:16</td><td>369.0             </td><td>Washington Pl & 6 Ave   </td><td>40.73224119             </td><td>-74.00026394             </td><td>307.0             </td><td>Canal St & Rutgers St         </td><td>40.714274870000004    </td><td>-73.98990025           </td><td>19846.0           </td><td>Subscriber</td><td>1977.0            </td><td>1.0              </td><td>15979.0           </td></tr>\n",
       "<tr><td>6      </td><td>1327.0            </td><td>2013-10-01 00:01:48</td><td>2013-10-01 00:23:55</td><td>254.0             </td><td>W 11 St & 6 Ave         </td><td>40.73532427             </td><td>-73.99800419             </td><td>539.0             </td><td>Metropolitan Ave & Bedford Ave</td><td>40.71534825           </td><td>-73.96024116           </td><td>14563.0           </td><td>Subscriber</td><td>1986.0            </td><td>2.0              </td><td>15979.0           </td></tr>\n",
       "<tr><td>7      </td><td>1146.0            </td><td>2013-10-01 00:01:57</td><td>2013-10-01 00:21:03</td><td>490.0             </td><td>8 Ave & W 33 St         </td><td>40.751551               </td><td>-73.993934               </td><td>438.0             </td><td>St Marks Pl & 1 Ave           </td><td>40.727791260000004    </td><td>-73.98564945           </td><td>16793.0           </td><td>Subscriber</td><td>1959.0            </td><td>1.0              </td><td>15979.0           </td></tr>\n",
       "<tr><td>8      </td><td>380.0             </td><td>2013-10-01 00:01:58</td><td>2013-10-01 00:08:18</td><td>468.0             </td><td>Broadway & W 55 St      </td><td>40.7652654              </td><td>-73.98192338             </td><td>385.0             </td><td>E 55 St & 2 Ave               </td><td>40.757973220000004    </td><td>-73.96603308           </td><td>16600.0           </td><td>Customer  </td><td>nan               </td><td>0.0              </td><td>15979.0           </td></tr>\n",
       "<tr><td>9      </td><td>682.0             </td><td>2013-10-01 00:02:05</td><td>2013-10-01 00:13:27</td><td>300.0             </td><td>Shevchenko Pl & E 6 St  </td><td>40.728145               </td><td>-73.990214               </td><td>519.0             </td><td>Pershing Square N             </td><td>40.75188406           </td><td>-73.97770164           </td><td>15204.0           </td><td>Subscriber</td><td>1992.0            </td><td>1.0              </td><td>15979.0           </td></tr>\n",
       "</tbody>\n",
       "</table>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# ----------\n",
    "\n",
    "# 2- light data munging: group the bike starts per-day, converting the 10M rows\n",
    "# of trips to about 140,000 station&day combos - predicting the number of trip\n",
    "# starts per-station-per-day.\n",
    "\n",
    "# Convert start time to: Day since the Epoch\n",
    "startime = data[\"starttime\"]\n",
    "# NOTE: despite its name, secsPerDay is the number of MILLISECONDS in a day\n",
    "# (1000 ms * 3600 s * 24 h).  The parsed time column is in milliseconds since\n",
    "# the Epoch (see the starttime min/max in the describe() output), so dividing\n",
    "# by it and flooring yields whole days since the Epoch.\n",
    "secsPerDay = 1000 * 3600 * 24\n",
    "data[\"Days\"] = (startime.asnumeric() / secsPerDay).floor()\n",
    "data.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead>\n",
       "<tr><th style=\"text-align: right;\">  Days</th><th>start station name     </th><th style=\"text-align: right;\">  bikes</th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>1 Ave & E 15 St        </td><td style=\"text-align: right;\">    173</td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>1 Ave & E 18 St        </td><td style=\"text-align: right;\">    118</td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>1 Ave & E 30 St        </td><td style=\"text-align: right;\">    152</td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>10 Ave & W 28 St       </td><td style=\"text-align: right;\">    115</td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>11 Ave & W 27 St       </td><td style=\"text-align: right;\">    210</td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>11 Ave & W 41 St       </td><td style=\"text-align: right;\">    106</td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>12 Ave & W 40 St       </td><td style=\"text-align: right;\">    144</td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>2 Ave & E 31 St        </td><td style=\"text-align: right;\">    206</td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>2 Ave & E 58 St        </td><td style=\"text-align: right;\">    105</td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>3 Ave & Schermerhorn St</td><td style=\"text-align: right;\">     15</td></tr>\n",
       "</tbody>\n",
       "</table>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Rows:10131\n",
      "Cols:3\n",
      "\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead>\n",
       "<tr><th>       </th><th>Days              </th><th>start station name     </th><th>bikes             </th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "<tr><td>type   </td><td>int               </td><td>enum                   </td><td>int               </td></tr>\n",
       "<tr><td>mins   </td><td>15979.0           </td><td>                       </td><td>1.0               </td></tr>\n",
       "<tr><td>mean   </td><td>15993.953311617806</td><td>                       </td><td>102.42937518507551</td></tr>\n",
       "<tr><td>maxs   </td><td>16009.0           </td><td>                       </td><td>603.0             </td></tr>\n",
       "<tr><td>sigma  </td><td>8.950698111468864 </td><td>                       </td><td>74.05933443246006 </td></tr>\n",
       "<tr><td>zeros  </td><td>0                 </td><td>                       </td><td>0                 </td></tr>\n",
       "<tr><td>missing</td><td>0                 </td><td>0                      </td><td>0                 </td></tr>\n",
       "<tr><td>0      </td><td>15979.0           </td><td>1 Ave & E 15 St        </td><td>173.0             </td></tr>\n",
       "<tr><td>1      </td><td>15979.0           </td><td>1 Ave & E 18 St        </td><td>118.0             </td></tr>\n",
       "<tr><td>2      </td><td>15979.0           </td><td>1 Ave & E 30 St        </td><td>152.0             </td></tr>\n",
       "<tr><td>3      </td><td>15979.0           </td><td>10 Ave & W 28 St       </td><td>115.0             </td></tr>\n",
       "<tr><td>4      </td><td>15979.0           </td><td>11 Ave & W 27 St       </td><td>210.0             </td></tr>\n",
       "<tr><td>5      </td><td>15979.0           </td><td>11 Ave & W 41 St       </td><td>106.0             </td></tr>\n",
       "<tr><td>6      </td><td>15979.0           </td><td>12 Ave & W 40 St       </td><td>144.0             </td></tr>\n",
       "<tr><td>7      </td><td>15979.0           </td><td>2 Ave & E 31 St        </td><td>206.0             </td></tr>\n",
       "<tr><td>8      </td><td>15979.0           </td><td>2 Ave & E 58 St        </td><td>105.0             </td></tr>\n",
       "<tr><td>9      </td><td>15979.0           </td><td>3 Ave & Schermerhorn St</td><td>15.0              </td></tr>\n",
       "</tbody>\n",
       "</table>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "[10131, 3]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Now do a monster Group-By.  Count bike starts per-station per-day.  On the\n",
    "# big dataset this ends up with about 340 stations times 400 days (140,000\n",
    "# rows); the single-month demo yields far fewer.  This is what we want to\n",
    "# predict.\n",
    "grouped = data.group_by([\"Days\",\"start station name\"])\n",
    "bpd = grouped.count().get_frame() # Compute bikes-per-day\n",
    "# Column 2 is the per-group count; call it \"bikes\" (the response column used\n",
    "# by the models later on).\n",
    "bpd.set_name(2,\"bikes\")\n",
    "bpd.show()\n",
    "bpd.describe()\n",
    "bpd.dim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quantiles of bikes-per-day\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead>\n",
       "<tr><th style=\"text-align: right;\">  Probs</th><th style=\"text-align: right;\">  bikesQuantiles</th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "<tr><td style=\"text-align: right;\">  0.01 </td><td style=\"text-align: right;\">             5  </td></tr>\n",
       "<tr><td style=\"text-align: right;\">  0.1  </td><td style=\"text-align: right;\">            20  </td></tr>\n",
       "<tr><td style=\"text-align: right;\">  0.25 </td><td style=\"text-align: right;\">            45  </td></tr>\n",
       "<tr><td style=\"text-align: right;\">  0.333</td><td style=\"text-align: right;\">            60  </td></tr>\n",
       "<tr><td style=\"text-align: right;\">  0.5  </td><td style=\"text-align: right;\">            91  </td></tr>\n",
       "<tr><td style=\"text-align: right;\">  0.667</td><td style=\"text-align: right;\">           121  </td></tr>\n",
       "<tr><td style=\"text-align: right;\">  0.75 </td><td style=\"text-align: right;\">           141  </td></tr>\n",
       "<tr><td style=\"text-align: right;\">  0.9  </td><td style=\"text-align: right;\">           197  </td></tr>\n",
       "<tr><td style=\"text-align: right;\">  0.99 </td><td style=\"text-align: right;\">           340.4</td></tr>\n",
       "</tbody>\n",
       "</table>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Quantiles: the data is fairly unbalanced; some station/day combos are wildly\n",
    "# more popular than others.\n",
    "print(\"Quantiles of bikes-per-day\")\n",
    "# quantile() is called with no arguments, i.e. with its default probabilities.\n",
    "bpd[\"bikes\"].quantile().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bikes-Per-Day\n",
      "Rows:10131\n",
      "Cols:5\n",
      "\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead>\n",
       "<tr><th>       </th><th>Days              </th><th>start station name     </th><th>bikes             </th><th>Month  </th><th>DayOfWeek  </th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "<tr><td>type   </td><td>int               </td><td>enum                   </td><td>int               </td><td>enum   </td><td>enum       </td></tr>\n",
       "<tr><td>mins   </td><td>15979.0           </td><td>                       </td><td>1.0               </td><td>       </td><td>           </td></tr>\n",
       "<tr><td>mean   </td><td>15993.953311617806</td><td>                       </td><td>102.42937518507551</td><td>       </td><td>           </td></tr>\n",
       "<tr><td>maxs   </td><td>16009.0           </td><td>                       </td><td>603.0             </td><td>       </td><td>           </td></tr>\n",
       "<tr><td>sigma  </td><td>8.950698111468864 </td><td>                       </td><td>74.05933443246006 </td><td>       </td><td>           </td></tr>\n",
       "<tr><td>zeros  </td><td>0                 </td><td>                       </td><td>0                 </td><td>       </td><td>           </td></tr>\n",
       "<tr><td>missing</td><td>0                 </td><td>0                      </td><td>0                 </td><td>0      </td><td>0          </td></tr>\n",
       "<tr><td>0      </td><td>15979.0           </td><td>1 Ave & E 15 St        </td><td>173.0             </td><td>10     </td><td>Tue        </td></tr>\n",
       "<tr><td>1      </td><td>15979.0           </td><td>1 Ave & E 18 St        </td><td>118.0             </td><td>10     </td><td>Tue        </td></tr>\n",
       "<tr><td>2      </td><td>15979.0           </td><td>1 Ave & E 30 St        </td><td>152.0             </td><td>10     </td><td>Tue        </td></tr>\n",
       "<tr><td>3      </td><td>15979.0           </td><td>10 Ave & W 28 St       </td><td>115.0             </td><td>10     </td><td>Tue        </td></tr>\n",
       "<tr><td>4      </td><td>15979.0           </td><td>11 Ave & W 27 St       </td><td>210.0             </td><td>10     </td><td>Tue        </td></tr>\n",
       "<tr><td>5      </td><td>15979.0           </td><td>11 Ave & W 41 St       </td><td>106.0             </td><td>10     </td><td>Tue        </td></tr>\n",
       "<tr><td>6      </td><td>15979.0           </td><td>12 Ave & W 40 St       </td><td>144.0             </td><td>10     </td><td>Tue        </td></tr>\n",
       "<tr><td>7      </td><td>15979.0           </td><td>2 Ave & E 31 St        </td><td>206.0             </td><td>10     </td><td>Tue        </td></tr>\n",
       "<tr><td>8      </td><td>15979.0           </td><td>2 Ave & E 58 St        </td><td>105.0             </td><td>10     </td><td>Tue        </td></tr>\n",
       "<tr><td>9      </td><td>15979.0           </td><td>3 Ave & Schermerhorn St</td><td>15.0              </td><td>10     </td><td>Tue        </td></tr>\n",
       "</tbody>\n",
       "</table>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# A little feature engineering\n",
    "# Add in month-of-year (seasonality; fewer bike rides in winter than summer)\n",
    "# NOTE: `secs` actually holds MILLISECONDS since the Epoch, because secsPerDay\n",
    "# is 1000 * 3600 * 24, i.e. the number of milliseconds in a day.\n",
    "secs = bpd[\"Days\"]*secsPerDay\n",
    "# month() is converted with asfactor() so Month is treated as categorical.\n",
    "bpd[\"Month\"]     = secs.month().asfactor()\n",
    "# Add in day-of-week (work-week; more bike rides on Sunday than Monday)\n",
    "# dayOfWeek() already comes back as an enum (see the describe() output), so no\n",
    "# asfactor() is applied here.\n",
    "bpd[\"DayOfWeek\"] = secs.dayOfWeek()\n",
    "print(\"Bikes-Per-Day\")\n",
    "bpd.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# ----------\n",
    "# 3- Fit a model on train; using test as validation\n",
    "\n",
    "def _train_timed(model, x, train, test):\n",
    "    \"\"\"Train `model` to predict \"bikes\" from columns `x`; return elapsed seconds.\"\"\"\n",
    "    started = time.time()\n",
    "    model.train(x               =x,\n",
    "                y               =\"bikes\",\n",
    "                training_frame  =train,\n",
    "                validation_frame=test)\n",
    "    return time.time() - started\n",
    "\n",
    "\n",
    "def _mse_per_frame(model, frames):\n",
    "    \"\"\"Return the MSE of `model` on each frame in `frames`, in the given order.\"\"\"\n",
    "    return [model.model_performance(frame).mse() for frame in frames]\n",
    "\n",
    "\n",
    "# Function for doing the classic test/train/holdout split\n",
    "def split_fit_predict(data, seed=None):\n",
    "    \"\"\"Split `data` 60/30/10, fit GBM, DRF, GLM and DL, and display their MSEs.\n",
    "\n",
    "    The fitted models are published as the globals gbm0, drf0, glm0 and dl0.\n",
    "    Every column of `data` except \"bikes\" is used as a predictor; GLM and DL\n",
    "    additionally leave out \"WC1\" when that column is present.\n",
    "\n",
    "    Args:\n",
    "        data: H2OFrame with a \"Days\" column and the \"bikes\" response column.\n",
    "        seed: optional integer that makes the random row split repeatable; it\n",
    "            is also handed to the GBM, DRF and DL estimators.  With the\n",
    "            default (None) nothing is seeded, exactly as before this\n",
    "            parameter existed.\n",
    "            NOTE(review): DL may still vary between runs unless it is also\n",
    "            built with reproducible=True - confirm if exact repeats matter.\n",
    "\n",
    "    Returns:\n",
    "        None.  A table with train/test/holdout MSE and training time per\n",
    "        model is rendered through h2o.display.H2ODisplay.\n",
    "    \"\"\"\n",
    "    global gbm0, drf0, glm0, dl0\n",
    "\n",
    "    # Classic Test/Train split: 60% train, 30% test (validation), 10% holdout\n",
    "    r = data['Days'].runif(seed=seed)   # Random UNIForm numbers, one per row\n",
    "    train = data[  r  < 0.6]\n",
    "    test  = data[(0.6 <= r) & (r < 0.9)]\n",
    "    hold  = data[ 0.9 <= r ]\n",
    "    print(\"Training data has\",train.ncol,\"columns and\",train.nrow,\"rows, test has\",test.nrow,\"rows, holdout has\",hold.nrow)\n",
    "\n",
    "    # Work on a copy of the column names so the frame's own list is never touched\n",
    "    bike_names_x = list(data.names)\n",
    "    bike_names_x.remove(\"bikes\")\n",
    "    # GLM and DL are trained without \"WC1\" (presumably a weather column joined\n",
    "    # in later in the notebook - TODO confirm); GBM and DRF keep it.\n",
    "    bike_names_no_wc1 = [name for name in bike_names_x if name != \"WC1\"]\n",
    "\n",
    "    # Only forward a seed when the caller supplied one\n",
    "    seed_kw = {} if seed is None else {\"seed\": seed}\n",
    "\n",
    "    # Run GBM\n",
    "    gbm0 = H2OGradientBoostingEstimator(ntrees=500, # 500 works well\n",
    "                                        max_depth=6,\n",
    "                                        learn_rate=0.1,\n",
    "                                        **seed_kw)\n",
    "    gbm_elapsed = _train_timed(gbm0, bike_names_x, train, test)\n",
    "\n",
    "    # Run DRF\n",
    "    drf0 = H2ORandomForestEstimator(ntrees=250, max_depth=30, **seed_kw)\n",
    "    drf_elapsed = _train_timed(drf0, bike_names_x, train, test)\n",
    "\n",
    "    # Run GLM\n",
    "    glm0 = H2OGeneralizedLinearEstimator(Lambda=[1e-5], family=\"poisson\")\n",
    "    glm_elapsed = _train_timed(glm0, bike_names_no_wc1, train, test)\n",
    "\n",
    "    # Run DL\n",
    "    dl0 = H2ODeepLearningEstimator(hidden=[50,50,50,50], epochs=50, **seed_kw)\n",
    "    dl_elapsed = _train_timed(dl0, bike_names_no_wc1, train, test)\n",
    "\n",
    "    # ----------\n",
    "    # 4- Score on holdout set & report\n",
    "    frames = (train, test, hold)\n",
    "    fitted = ((\"GBM\", gbm0, gbm_elapsed),\n",
    "              (\"DRF\", drf0, drf_elapsed),\n",
    "              (\"GLM\", glm0, glm_elapsed),\n",
    "              (\"DL \", dl0,  dl_elapsed))\n",
    "\n",
    "    # make a pretty HTML table printout of the results\n",
    "    header = [\"Model\", \"mse TRAIN\", \"mse TEST\", \"mse HOLDOUT\", \"Model Training Time (s)\"]\n",
    "    table  = [[label] + _mse_per_frame(model, frames) + [round(elapsed, 3)]\n",
    "              for label, model, elapsed in fitted]\n",
    "    h2o.display.H2ODisplay(table, header)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training data has 5 columns and 6180 rows, test has 2947 rows, holdout has 1004\n",
      "gbm Model Build progress: |███████████████████████████████████████████████| 100%\n",
      "drf Model Build progress: |███████████████████████████████████████████████| 100%\n",
      "glm Model Build progress: |███████████████████████████████████████████████| 100%\n",
      "deeplearning Model Build progress: |██████████████████████████████████████| 100%\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td><b>Model</b></td>\n",
       "<td><b>mse TRAIN</b></td>\n",
       "<td><b>mse TEST</b></td>\n",
       "<td><b>mse HOLDOUT</b></td>\n",
       "<td><b>Model Training Time (s)</b></td></tr>\n",
       "<tr><td>GBM</td>\n",
       "<td>0.8948171</td>\n",
       "<td>386.7584398</td>\n",
       "<td>428.7237120</td>\n",
       "<td>7.759</td></tr>\n",
       "<tr><td>DRF</td>\n",
       "<td>526.3541524</td>\n",
       "<td>921.4867812</td>\n",
       "<td>916.5091361</td>\n",
       "<td>8.673</td></tr>\n",
       "<tr><td>GLM</td>\n",
       "<td>689.6647078</td>\n",
       "<td>757.4271445</td>\n",
       "<td>726.9764530</td>\n",
       "<td>0.522</td></tr>\n",
       "<tr><td>DL </td>\n",
       "<td>307.5692122</td>\n",
       "<td>459.6025357</td>\n",
       "<td>509.2822086</td>\n",
       "<td>8.619</td></tr></table></div>"
      ],
      "text/plain": [
       "Model      mse TRAIN    mse TEST    mse HOLDOUT    Model Training Time (s)\n",
       "-------  -----------  ----------  -------------  -------------------------\n",
       "GBM         0.894817     386.758        428.724                      7.759\n",
       "DRF       526.354        921.487        916.509                      8.673\n",
       "GLM       689.665        757.427        726.976                      0.522\n",
       "DL        307.569        459.603        509.282                      8.619"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Split the data (into test & train), fit some models and predict on the holdout data\n",
    "split_fit_predict(bpd)\n",
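    "# NOTE: the table this cell prints reports MSE (lower is better), not r^2.\n",
    "# In the recorded run GBM has the lowest holdout MSE (about 429) and GLM is at\n",
    "# about 727, given just the station, the month, and the day-of-week as predictors\n",
    "# of the bike-trip-starts.  The fraction of variance explained is not shown here."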
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Parse progress: |█████████████████████████████████████████████████████████| 100%\n",
      "Rows:17520\n",
      "Cols:50\n",
      "\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead>\n",
       "<tr><th>       </th><th>Year Local       </th><th>Month Local       </th><th>Day Local        </th><th>Hour Local        </th><th>Year UTC          </th><th>Month UTC        </th><th>Day UTC          </th><th>Hour UTC          </th><th>Cavok Reported  </th><th>Cloud Ceiling (m)  </th><th>Cloud Cover Fraction  </th><th>Cloud Cover Fraction 1  </th><th>Cloud Cover Fraction 2  </th><th>Cloud Cover Fraction 3  </th><th>Cloud Cover Fraction 4  </th><th>Cloud Cover Fraction 5  </th><th>Cloud Cover Fraction 6  </th><th>Cloud Height (m) 1  </th><th>Cloud Height (m) 2  </th><th>Cloud Height (m) 3  </th><th>Cloud Height (m) 4  </th><th>Cloud Height (m) 5  </th><th>Cloud Height (m) 6  </th><th>Dew Point (C)      </th><th>Humidity Fraction  </th><th>Precipitation One Hour (mm)  </th><th>Pressure Altimeter (mbar)  </th><th>Pressure Sea Level (mbar)  </th><th>Pressure Station (mbar)  </th><th>Snow Depth (cm)  </th><th>Temperature (C)    </th><th>Visibility (km)   </th><th>Weather Code 1   </th><th>Weather Code 1/ Description  </th><th>Weather Code 2    </th><th>Weather Code 2/ Description  </th><th>Weather Code 3    </th><th>Weather Code 3/ Description  </th><th>Weather Code 4    </th><th>Weather Code 4/ Description  </th><th>Weather Code 5  </th><th>Weather Code 5/ Description  </th><th>Weather Code 6  </th><th>Weather Code 6/ Description  </th><th>Weather Code Most Severe / Icon Code  </th><th>Weather Code Most Severe  </th><th>Weather Code Most Severe / Description  </th><th>Wind Direction (degrees)  </th><th>Wind Gust (m/s)   </th><th>Wind Speed (m/s)  </th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "<tr><td>type   </td><td>int              </td><td>int               </td><td>int              </td><td>int               </td><td>int               </td><td>int              </td><td>int              </td><td>int               </td><td>int             </td><td>real               </td><td>real                  </td><td>real                    </td><td>real                    </td><td>real                    </td><td>int                     </td><td>int                     </td><td>int                     </td><td>real                </td><td>real                </td><td>real                </td><td>int                 </td><td>int                 </td><td>int                 </td><td>real               </td><td>real               </td><td>real                         </td><td>real                       </td><td>int                        </td><td>int                      </td><td>int              </td><td>real               </td><td>real              </td><td>int              </td><td>enum                         </td><td>int               </td><td>enum                         </td><td>int               </td><td>enum                         </td><td>int               </td><td>enum                         </td><td>int             </td><td>enum                         </td><td>int             </td><td>enum                         </td><td>int                                   </td><td>int                       </td><td>enum                                    </td><td>int                       </td><td>real              </td><td>real              </td></tr>\n",
       "<tr><td>mins   </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>0.0               </td><td>2013.0            </td><td>1.0              </td><td>1.0              </td><td>0.0               </td><td>0.0             </td><td>61.0               </td><td>0.0                   </td><td>0.0                     </td><td>0.25                    </td><td>0.5                     </td><td>NaN                     </td><td>NaN                     </td><td>NaN                     </td><td>60.96               </td><td>213.36              </td><td>365.76              </td><td>NaN                 </td><td>NaN                 </td><td>NaN                 </td><td>-26.700000000000003</td><td>0.12510000000000002</td><td>0.0                          </td><td>983.2949000000001          </td><td>NaN                        </td><td>NaN                      </td><td>NaN              </td><td>-15.600000000000001</td><td>0.001             </td><td>1.0              </td><td>                             </td><td>1.0               </td><td>                             </td><td>1.0               </td><td>                             </td><td>1.0               </td><td>                             </td><td>1.0             </td><td>                             </td><td>3.0             </td><td>                             </td><td>0.0                                   </td><td>1.0                       </td><td>                                        </td><td>10.0                      </td><td>7.2               </td><td>0.0               </td></tr>\n",
       "<tr><td>mean   </td><td>2013.5           </td><td>6.5260273972602745</td><td>15.72054794520548</td><td>11.500000000000004</td><td>2013.5005707762557</td><td>6.525114155251141</td><td>15.72134703196347</td><td>11.500114155251142</td><td>0.0             </td><td>1306.3119584569736 </td><td>0.4167424905220181    </td><td>0.3612073490813649      </td><td>0.8724453840732911      </td><td>0.9630456852791879      </td><td>0.0                     </td><td>0.0                     </td><td>0.0                     </td><td>1293.9822681953192  </td><td>1643.7390016566796  </td><td>2084.8938637563456  </td><td>0.0                 </td><td>0.0                 </td><td>0.0                 </td><td>4.313046467655992  </td><td>0.5967363891594567 </td><td>1.3799301075268817           </td><td>1017.8258144055944         </td><td>0.0                        </td><td>0.0                      </td><td>0.0              </td><td>12.578909070073914 </td><td>14.391442968202009</td><td>4.84251968503937 </td><td>                             </td><td>3.6586768935762226</td><td>                             </td><td>2.8466076696165192</td><td>                             </td><td>2.0114942528735633</td><td>                             </td><td>4.125           </td><td>                             </td><td>3.0             </td><td>                             </td><td>1.3784817351598173                    </td><td>4.84251968503937          </td><td>                                        </td><td>194.69525681985743        </td><td>9.422169480726348 </td><td>2.4103288784874057</td></tr>\n",
       "<tr><td>maxs   </td><td>2014.0           </td><td>12.0              </td><td>31.0             </td><td>23.0              </td><td>2015.0            </td><td>12.0             </td><td>31.0             </td><td>23.0              </td><td>0.0             </td><td>3657.6000000000004 </td><td>1.0                   </td><td>1.0                     </td><td>1.0                     </td><td>1.0                     </td><td>NaN                     </td><td>NaN                     </td><td>NaN                     </td><td>3657.5999           </td><td>3657.5999           </td><td>3657.5999           </td><td>NaN                 </td><td>NaN                 </td><td>NaN                 </td><td>24.400000000000002 </td><td>1.0                </td><td>26.924                       </td><td>1042.2113                  </td><td>NaN                        </td><td>NaN                      </td><td>NaN              </td><td>36.1               </td><td>16.0934           </td><td>60.0             </td><td>                             </td><td>60.0              </td><td>                             </td><td>36.0              </td><td>                             </td><td>27.0              </td><td>                             </td><td>27.0            </td><td>                             </td><td>3.0             </td><td>                             </td><td>16.0                                  </td><td>60.0                      </td><td>                                        </td><td>360.0                     </td><td>20.580000000000002</td><td>10.8              </td></tr>\n",
       "<tr><td>sigma  </td><td>0.500014270017262</td><td>3.447949723847773 </td><td>8.796498048523272</td><td>6.922384111875021 </td><td>0.50058441171579  </td><td>3.447824054577647</td><td>8.795614888684717</td><td>6.922301652025526 </td><td>0.0             </td><td>995.3398569657211  </td><td>0.4627208309925301    </td><td>0.42770569708047684     </td><td>0.19715569036704708     </td><td>0.08610155981044185     </td><td>-0.0                    </td><td>-0.0                    </td><td>-0.0                    </td><td>962.7430958537232   </td><td>916.7386134899587   </td><td>887.2158475113932   </td><td>-0.0                </td><td>-0.0                </td><td>-0.0                </td><td>10.973128209713666 </td><td>0.18579201186573496</td><td>2.5621512917896463           </td><td>7.464516971789659          </td><td>-0.0                       </td><td>-0.0                     </td><td>-0.0             </td><td>10.039673953091574 </td><td>3.6989362303340494</td><td>5.704865769828319</td><td>                             </td><td>6.133862539123368 </td><td>                             </td><td>5.805532863642112 </td><td>                             </td><td>3.1234084426128437</td><td>                             </td><td>6.15223536610881</td><td>                             </td><td>0.0             </td><td>                             </td><td>4.073860627017756                     </td><td>5.704865769828319         </td><td>                                        </td><td>106.3500000314393         </td><td>1.8151187111524154</td><td>1.614697905241178 </td></tr>\n",
       "<tr><td>zeros  </td><td>0                </td><td>0                 </td><td>0                </td><td>730               </td><td>0                 </td><td>0                </td><td>0                </td><td>730               </td><td>17455           </td><td>0                  </td><td>8758                  </td><td>8758                    </td><td>0                       </td><td>0                       </td><td>0                       </td><td>0                       </td><td>0                       </td><td>0                   </td><td>0                   </td><td>0                   </td><td>0                   </td><td>0                   </td><td>0                   </td><td>268                </td><td>0                  </td><td>501                          </td><td>0                          </td><td>0                          </td><td>0                        </td><td>0                </td><td>269                </td><td>0                 </td><td>0                </td><td>                             </td><td>0                 </td><td>                             </td><td>0                 </td><td>                             </td><td>0                 </td><td>                             </td><td>0               </td><td>                             </td><td>0               </td><td>                             </td><td>14980                                 </td><td>0                         </td><td>                                        </td><td>0                         </td><td>0                 </td><td>2768              </td></tr>\n",
       "<tr><td>missing</td><td>0                </td><td>0                 </td><td>0                </td><td>0                 </td><td>0                 </td><td>0                </td><td>0                </td><td>0                 </td><td>65              </td><td>10780              </td><td>375                   </td><td>375                     </td><td>14682                   </td><td>16535                   </td><td>17520                   </td><td>17520                   </td><td>17520                   </td><td>9103                </td><td>14683               </td><td>16535               </td><td>17520               </td><td>17520               </td><td>17520               </td><td>67                 </td><td>67                 </td><td>15660                        </td><td>360                        </td><td>17520                      </td><td>17520                    </td><td>17520            </td><td>67                 </td><td>412               </td><td>14980            </td><td>14980                        </td><td>16477             </td><td>16477                        </td><td>17181             </td><td>17181                        </td><td>17433             </td><td>17433                        </td><td>17504           </td><td>17504                        </td><td>17518           </td><td>17518                        </td><td>0                                     </td><td>14980                     </td><td>14980                                   </td><td>9382                      </td><td>14381             </td><td>1283              </td></tr>\n",
       "<tr><td>0      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>0.0               </td><td>2013.0            </td><td>1.0              </td><td>1.0              </td><td>5.0               </td><td>0.0             </td><td>2895.6000000000004 </td><td>1.0                   </td><td>0.9                     </td><td>1.0                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>2895.5999           </td><td>3352.8              </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>-5.0               </td><td>0.5447000000000001 </td><td>nan                          </td><td>1013.0917000000001         </td><td>nan                        </td><td>nan                      </td><td>nan              </td><td>3.3000000000000003 </td><td>16.0934           </td><td>nan              </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan             </td><td>                             </td><td>nan             </td><td>                             </td><td>0.0                                   </td><td>nan                       </td><td>                                        </td><td>nan                       </td><td>nan               </td><td>2.57              </td></tr>\n",
       "<tr><td>1      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>1.0               </td><td>2013.0            </td><td>1.0              </td><td>1.0              </td><td>6.0               </td><td>0.0             </td><td>3048.0             </td><td>1.0                   </td><td>1.0                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>3048.0              </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>-4.4               </td><td>0.5463             </td><td>nan                          </td><td>1012.0759                  </td><td>nan                        </td><td>nan                      </td><td>nan              </td><td>3.9000000000000004 </td><td>16.0934           </td><td>nan              </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan             </td><td>                             </td><td>nan             </td><td>                             </td><td>0.0                                   </td><td>nan                       </td><td>                                        </td><td>260.0                     </td><td>9.77              </td><td>4.63              </td></tr>\n",
       "<tr><td>2      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>2.0               </td><td>2013.0            </td><td>1.0              </td><td>1.0              </td><td>7.0               </td><td>0.0             </td><td>1828.8000000000002 </td><td>1.0                   </td><td>1.0                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>1828.7999           </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>-3.3000000000000003</td><td>0.619              </td><td>nan                          </td><td>1012.4145000000001         </td><td>nan                        </td><td>nan                      </td><td>nan              </td><td>3.3000000000000003 </td><td>16.0934           </td><td>nan              </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan             </td><td>                             </td><td>nan             </td><td>                             </td><td>0.0                                   </td><td>nan                       </td><td>                                        </td><td>nan                       </td><td>7.72              </td><td>1.54              </td></tr>\n",
       "<tr><td>3      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>3.0               </td><td>2013.0            </td><td>1.0              </td><td>1.0              </td><td>8.0               </td><td>0.0             </td><td>1463.0             </td><td>1.0                   </td><td>1.0                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>1463.04             </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>-2.8000000000000003</td><td>0.6159             </td><td>nan                          </td><td>1012.4145000000001         </td><td>nan                        </td><td>nan                      </td><td>nan              </td><td>3.9000000000000004 </td><td>16.0934           </td><td>nan              </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan             </td><td>                             </td><td>nan             </td><td>                             </td><td>0.0                                   </td><td>nan                       </td><td>                                        </td><td>nan                       </td><td>nan               </td><td>3.09              </td></tr>\n",
       "<tr><td>4      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>4.0               </td><td>2013.0            </td><td>1.0              </td><td>1.0              </td><td>9.0               </td><td>0.0             </td><td>1402.1000000000001 </td><td>1.0                   </td><td>1.0                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>1402.0800000000002  </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>-2.8000000000000003</td><td>0.6159             </td><td>nan                          </td><td>1012.7531                  </td><td>nan                        </td><td>nan                      </td><td>nan              </td><td>3.9000000000000004 </td><td>16.0934           </td><td>nan              </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan             </td><td>                             </td><td>nan             </td><td>                             </td><td>0.0                                   </td><td>nan                       </td><td>                                        </td><td>260.0                     </td><td>nan               </td><td>4.12              </td></tr>\n",
       "<tr><td>5      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>5.0               </td><td>2013.0            </td><td>1.0              </td><td>1.0              </td><td>10.0              </td><td>0.0             </td><td>1524.0             </td><td>1.0                   </td><td>1.0                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>1524.0              </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>-2.8000000000000003</td><td>0.6159             </td><td>nan                          </td><td>1012.4145000000001         </td><td>nan                        </td><td>nan                      </td><td>nan              </td><td>3.9000000000000004 </td><td>16.0934           </td><td>nan              </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan             </td><td>                             </td><td>nan             </td><td>                             </td><td>0.0                                   </td><td>nan                       </td><td>                                        </td><td>nan                       </td><td>nan               </td><td>3.09              </td></tr>\n",
       "<tr><td>6      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>6.0               </td><td>2013.0            </td><td>1.0              </td><td>1.0              </td><td>11.0              </td><td>0.0             </td><td>1524.0             </td><td>1.0                   </td><td>1.0                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>1524.0              </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>-3.3000000000000003</td><td>0.5934             </td><td>nan                          </td><td>1012.0759                  </td><td>nan                        </td><td>nan                      </td><td>nan              </td><td>3.9000000000000004 </td><td>16.0934           </td><td>nan              </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan             </td><td>                             </td><td>nan             </td><td>                             </td><td>0.0                                   </td><td>nan                       </td><td>                                        </td><td>nan                       </td><td>9.26              </td><td>3.09              </td></tr>\n",
       "<tr><td>7      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>7.0               </td><td>2013.0            </td><td>1.0              </td><td>1.0              </td><td>12.0              </td><td>0.0             </td><td>1524.0             </td><td>1.0                   </td><td>1.0                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>1524.0              </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>-3.3000000000000003</td><td>0.5934             </td><td>nan                          </td><td>1012.4145000000001         </td><td>nan                        </td><td>nan                      </td><td>nan              </td><td>3.9000000000000004 </td><td>16.0934           </td><td>nan              </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan             </td><td>                             </td><td>nan             </td><td>                             </td><td>0.0                                   </td><td>nan                       </td><td>                                        </td><td>260.0                     </td><td>9.26              </td><td>4.63              </td></tr>\n",
       "<tr><td>8      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>8.0               </td><td>2013.0            </td><td>1.0              </td><td>1.0              </td><td>13.0              </td><td>0.0             </td><td>1524.0             </td><td>1.0                   </td><td>1.0                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>1524.0              </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>-2.8000000000000003</td><td>0.6425000000000001 </td><td>nan                          </td><td>1012.4145000000001         </td><td>nan                        </td><td>nan                      </td><td>nan              </td><td>3.3000000000000003 </td><td>16.0934           </td><td>nan              </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan             </td><td>                             </td><td>nan             </td><td>                             </td><td>0.0                                   </td><td>nan                       </td><td>                                        </td><td>260.0                     </td><td>nan               </td><td>3.09              </td></tr>\n",
       "<tr><td>9      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>9.0               </td><td>2013.0            </td><td>1.0              </td><td>1.0              </td><td>14.0              </td><td>0.0             </td><td>1524.0             </td><td>1.0                   </td><td>0.9                     </td><td>1.0                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>nan                     </td><td>1524.0              </td><td>3657.5999           </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>nan                 </td><td>-2.8000000000000003</td><td>0.6159             </td><td>nan                          </td><td>1012.4145000000001         </td><td>nan                        </td><td>nan                      </td><td>nan              </td><td>3.9000000000000004 </td><td>16.0934           </td><td>nan              </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan               </td><td>                             </td><td>nan             </td><td>                             </td><td>nan             </td><td>                             </td><td>0.0                                   </td><td>nan                       </td><td>                                        </td><td>nan                       </td><td>9.26              </td><td>3.09              </td></tr>\n",
       "</tbody>\n",
       "</table>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# ----------\n",
    "# 5- Now let's add some weather\n",
    "# Resolve both yearly hourly-weather CSVs, then parse them together into one frame\n",
    "weather_csvs = (\"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv\",\n",
    "                \"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv\")\n",
    "wthr1 = h2o.import_file(path=[mylocate(csv) for csv in weather_csvs])\n",
    "# Show per-column summary statistics and the first rows\n",
    "wthr1.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Rows:17520\n",
      "Cols:9\n",
      "\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead>\n",
       "<tr><th>       </th><th>Year Local       </th><th>Month Local       </th><th>Day Local        </th><th>Hour Local        </th><th>Dew Point (C)      </th><th>Humidity Fraction  </th><th>Rain (mm)         </th><th>Temperature (C)    </th><th>WC1  </th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "<tr><td>type   </td><td>int              </td><td>int               </td><td>int              </td><td>int               </td><td>real               </td><td>real               </td><td>real              </td><td>real               </td><td>enum </td></tr>\n",
       "<tr><td>mins   </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>0.0               </td><td>-26.700000000000003</td><td>0.12510000000000002</td><td>0.0               </td><td>-15.600000000000001</td><td>     </td></tr>\n",
       "<tr><td>mean   </td><td>2013.5           </td><td>6.5260273972602745</td><td>15.72054794520548</td><td>11.500000000000004</td><td>4.313046467655992  </td><td>0.5967363891594567 </td><td>1.3799301075268817</td><td>12.578909070073914 </td><td>     </td></tr>\n",
       "<tr><td>maxs   </td><td>2014.0           </td><td>12.0              </td><td>31.0             </td><td>23.0              </td><td>24.400000000000002 </td><td>1.0                </td><td>26.924            </td><td>36.1               </td><td>     </td></tr>\n",
       "<tr><td>sigma  </td><td>0.500014270017262</td><td>3.447949723847773 </td><td>8.796498048523272</td><td>6.922384111875021 </td><td>10.973128209713666 </td><td>0.18579201186573496</td><td>2.5621512917896463</td><td>10.039673953091574 </td><td>     </td></tr>\n",
       "<tr><td>zeros  </td><td>0                </td><td>0                 </td><td>0                </td><td>730               </td><td>268                </td><td>0                  </td><td>501               </td><td>269                </td><td>     </td></tr>\n",
       "<tr><td>missing</td><td>0                </td><td>0                 </td><td>0                </td><td>0                 </td><td>67                 </td><td>67                 </td><td>15660             </td><td>67                 </td><td>14980</td></tr>\n",
       "<tr><td>0      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>0.0               </td><td>-5.0               </td><td>0.5447000000000001 </td><td>nan               </td><td>3.3000000000000003 </td><td>     </td></tr>\n",
       "<tr><td>1      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>1.0               </td><td>-4.4               </td><td>0.5463             </td><td>nan               </td><td>3.9000000000000004 </td><td>     </td></tr>\n",
       "<tr><td>2      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>2.0               </td><td>-3.3000000000000003</td><td>0.619              </td><td>nan               </td><td>3.3000000000000003 </td><td>     </td></tr>\n",
       "<tr><td>3      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>3.0               </td><td>-2.8000000000000003</td><td>0.6159             </td><td>nan               </td><td>3.9000000000000004 </td><td>     </td></tr>\n",
       "<tr><td>4      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>4.0               </td><td>-2.8000000000000003</td><td>0.6159             </td><td>nan               </td><td>3.9000000000000004 </td><td>     </td></tr>\n",
       "<tr><td>5      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>5.0               </td><td>-2.8000000000000003</td><td>0.6159             </td><td>nan               </td><td>3.9000000000000004 </td><td>     </td></tr>\n",
       "<tr><td>6      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>6.0               </td><td>-3.3000000000000003</td><td>0.5934             </td><td>nan               </td><td>3.9000000000000004 </td><td>     </td></tr>\n",
       "<tr><td>7      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>7.0               </td><td>-3.3000000000000003</td><td>0.5934             </td><td>nan               </td><td>3.9000000000000004 </td><td>     </td></tr>\n",
       "<tr><td>8      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>8.0               </td><td>-2.8000000000000003</td><td>0.6425000000000001 </td><td>nan               </td><td>3.3000000000000003 </td><td>     </td></tr>\n",
       "<tr><td>9      </td><td>2013.0           </td><td>1.0               </td><td>1.0              </td><td>9.0               </td><td>-2.8000000000000003</td><td>0.6159             </td><td>nan               </td><td>3.9000000000000004 </td><td>     </td></tr>\n",
       "</tbody>\n",
       "</table>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Lots of columns in there!  Lets plan on converting to time-since-epoch to do\n",
    "# a 'join' with the bike data, plus gather weather info that might affect\n",
    "# cyclists - rain, snow, temperature.  Alas, drop the \"snow\" column since it's\n",
    "# all NA's.  Also add in dew point and humidity just in case.  Slice out just\n",
    "# the columns of interest and drop the rest.\n",
    "wthr2 = wthr1[[\"Year Local\",\"Month Local\",\"Day Local\",\"Hour Local\",\"Dew Point (C)\",\"Humidity Fraction\",\"Precipitation One Hour (mm)\",\"Temperature (C)\",\"Weather Code 1/ Description\"]]\n",
    "\n",
    "wthr2.set_name(wthr2.names.index(\"Precipitation One Hour (mm)\"), \"Rain (mm)\")\n",
    "wthr2.set_name(wthr2.names.index(\"Weather Code 1/ Description\"), \"WC1\")\n",
    "wthr2.describe()\n",
    "# Much better!  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Filter down to the weather at Noon\n",
    "wthr3 = wthr2[ wthr2[\"Hour Local\"]==12 ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Rows:730\n",
      "Cols:11\n",
      "\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead>\n",
       "<tr><th>       </th><th>Year Local        </th><th>Month Local      </th><th>Day Local        </th><th>Hour Local  </th><th>Dew Point (C)      </th><th>Humidity Fraction  </th><th>Rain (mm)         </th><th>Temperature (C)   </th><th>WC1  </th><th>msec              </th><th>Days              </th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "<tr><td>type   </td><td>int               </td><td>int              </td><td>int              </td><td>int         </td><td>real               </td><td>real               </td><td>real              </td><td>real              </td><td>enum </td><td>int               </td><td>int               </td></tr>\n",
       "<tr><td>mins   </td><td>2013.0            </td><td>1.0              </td><td>1.0              </td><td>12.0        </td><td>-26.700000000000003</td><td>0.1723             </td><td>0.0               </td><td>-13.9             </td><td>     </td><td>1357070400000.0   </td><td>15706.0           </td></tr>\n",
       "<tr><td>mean   </td><td>2013.5            </td><td>6.526027397260274</td><td>15.72054794520548</td><td>12.0        </td><td>4.230123796423659  </td><td>0.539728198074278  </td><td>1.5312571428571429</td><td>14.068775790921595</td><td>     </td><td>1388560852602.7397</td><td>16070.5           </td></tr>\n",
       "<tr><td>maxs   </td><td>2014.0            </td><td>12.0             </td><td>31.0             </td><td>12.0        </td><td>23.3               </td><td>1.0                </td><td>12.446            </td><td>34.4              </td><td>     </td><td>1420056000000.0   </td><td>16435.0           </td></tr>\n",
       "<tr><td>sigma  </td><td>0.5003428180039172</td><td>3.450215293068149</td><td>8.802278027009615</td><td>0.0         </td><td>11.106296472475226 </td><td>0.17994502792324327</td><td>2.3606424861490587</td><td>10.398985514891212</td><td>     </td><td>18219740080.410755</td><td>210.87713642466474</td></tr>\n",
       "<tr><td>zeros  </td><td>0                 </td><td>0                </td><td>0                </td><td>0           </td><td>14                 </td><td>0                  </td><td>15                </td><td>7                 </td><td>     </td><td>0                 </td><td>0                 </td></tr>\n",
       "<tr><td>missing</td><td>0                 </td><td>0                </td><td>0                </td><td>0           </td><td>3                  </td><td>3                  </td><td>660               </td><td>3                 </td><td>620  </td><td>0                 </td><td>0                 </td></tr>\n",
       "<tr><td>0      </td><td>2013.0            </td><td>1.0              </td><td>1.0              </td><td>12.0        </td><td>-3.3000000000000003</td><td>0.5934             </td><td>nan               </td><td>3.9000000000000004</td><td>     </td><td>1357070400000.0   </td><td>15706.0           </td></tr>\n",
       "<tr><td>1      </td><td>2013.0            </td><td>1.0              </td><td>2.0              </td><td>12.0        </td><td>-11.700000000000001</td><td>0.4806             </td><td>nan               </td><td>-2.2              </td><td>     </td><td>1357156800000.0   </td><td>15707.0           </td></tr>\n",
       "<tr><td>2      </td><td>2013.0            </td><td>1.0              </td><td>3.0              </td><td>12.0        </td><td>-10.600000000000001</td><td>0.5248             </td><td>nan               </td><td>-2.2              </td><td>     </td><td>1357243200000.0   </td><td>15708.0           </td></tr>\n",
       "<tr><td>3      </td><td>2013.0            </td><td>1.0              </td><td>4.0              </td><td>12.0        </td><td>-7.2               </td><td>0.49760000000000004</td><td>nan               </td><td>2.2               </td><td>     </td><td>1357329600000.0   </td><td>15709.0           </td></tr>\n",
       "<tr><td>4      </td><td>2013.0            </td><td>1.0              </td><td>5.0              </td><td>12.0        </td><td>-7.2               </td><td>0.42600000000000005</td><td>nan               </td><td>4.4               </td><td>     </td><td>1357416000000.0   </td><td>15710.0           </td></tr>\n",
       "<tr><td>5      </td><td>2013.0            </td><td>1.0              </td><td>6.0              </td><td>12.0        </td><td>-1.7000000000000002</td><td>0.6451             </td><td>nan               </td><td>4.4               </td><td>haze </td><td>1357502400000.0   </td><td>15711.0           </td></tr>\n",
       "<tr><td>6      </td><td>2013.0            </td><td>1.0              </td><td>7.0              </td><td>12.0        </td><td>-6.1000000000000005</td><td>0.41190000000000004</td><td>nan               </td><td>6.1000000000000005</td><td>     </td><td>1357588800000.0   </td><td>15712.0           </td></tr>\n",
       "<tr><td>7      </td><td>2013.0            </td><td>1.0              </td><td>8.0              </td><td>12.0        </td><td>-1.7000000000000002</td><td>0.5314             </td><td>nan               </td><td>7.2               </td><td>     </td><td>1357675200000.0   </td><td>15713.0           </td></tr>\n",
       "<tr><td>8      </td><td>2013.0            </td><td>1.0              </td><td>9.0              </td><td>12.0        </td><td>0.6000000000000001 </td><td>0.56               </td><td>nan               </td><td>8.9               </td><td>haze </td><td>1357761600000.0   </td><td>15714.0           </td></tr>\n",
       "<tr><td>9      </td><td>2013.0            </td><td>1.0              </td><td>10.0             </td><td>12.0        </td><td>-6.1000000000000005</td><td>0.3952             </td><td>nan               </td><td>6.7               </td><td>     </td><td>1357848000000.0   </td><td>15715.0           </td></tr>\n",
       "</tbody>\n",
       "</table>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Lets now get Days since the epoch... we'll convert year/month/day into Epoch\n",
    "# time, and then back to Epoch days.  Need zero-based month and days, but have\n",
    "# 1-based.\n",
    "wthr3[\"msec\"] = h2o.H2OFrame.mktime(year=wthr3[\"Year Local\"], month=wthr3[\"Month Local\"]-1, day=wthr3[\"Day Local\"]-1, hour=wthr3[\"Hour Local\"])\n",
    "secsPerDay=1000*60*60*24\n",
    "wthr3[\"Days\"] = (wthr3[\"msec\"]/secsPerDay).floor()\n",
    "wthr3.describe()\n",
    "# msec looks sane (numbers like 1.3e12 are in the correct range for msec since\n",
    "# 1970).  Epoch Days matches closely with the epoch day numbers from the\n",
    "# CitiBike dataset.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Lets drop off the extra time columns to make a easy-to-handle dataset.\n",
    "wthr4 = wthr3.drop(\"Year Local\").drop(\"Month Local\").drop(\"Day Local\").drop(\"Hour Local\").drop(\"msec\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Also, most rain numbers are missing - lets assume those are zero rain days\n",
    "rain = wthr4[\"Rain (mm)\"]\n",
    "rain[ rain.isna() ] = 0\n",
    "wthr4[\"Rain (mm)\"] = rain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Merge Daily Weather with Bikes-Per-Day\n",
      "Rows:10131\n",
      "Cols:10\n",
      "\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead>\n",
       "<tr><th>       </th><th>Days              </th><th>start station name     </th><th>bikes             </th><th>Month  </th><th>DayOfWeek  </th><th>Dew Point (C)     </th><th>Humidity Fraction  </th><th>Rain (mm)           </th><th>Temperature (C)   </th><th>WC1  </th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "<tr><td>type   </td><td>int               </td><td>enum                   </td><td>int               </td><td>enum   </td><td>enum       </td><td>real              </td><td>real               </td><td>real                </td><td>real              </td><td>enum </td></tr>\n",
       "<tr><td>mins   </td><td>15979.0           </td><td>                       </td><td>1.0               </td><td>       </td><td>           </td><td>-2.2              </td><td>0.34850000000000003</td><td>0.0                 </td><td>9.4               </td><td>     </td></tr>\n",
       "<tr><td>mean   </td><td>15993.953311617806</td><td>                       </td><td>102.42937518507551</td><td>       </td><td>           </td><td>7.60732405488106  </td><td>0.5564958839206396 </td><td>0.008198400947586611</td><td>16.937094067712962</td><td>     </td></tr>\n",
       "<tr><td>maxs   </td><td>16009.0           </td><td>                       </td><td>603.0             </td><td>       </td><td>           </td><td>19.400000000000002</td><td>0.8718             </td><td>0.254               </td><td>26.1              </td><td>     </td></tr>\n",
       "<tr><td>sigma  </td><td>8.950698111468864 </td><td>                       </td><td>74.05933443246006 </td><td>       </td><td>           </td><td>6.516386487040385 </td><td>0.14811201086649933</td><td>0.04489297266255909 </td><td>4.362687300129602 </td><td>     </td></tr>\n",
       "<tr><td>zeros  </td><td>0                 </td><td>                       </td><td>0                 </td><td>       </td><td>           </td><td>0                 </td><td>0                  </td><td>9804                </td><td>0                 </td><td>     </td></tr>\n",
       "<tr><td>missing</td><td>0                 </td><td>0                      </td><td>0                 </td><td>0      </td><td>0          </td><td>0                 </td><td>0                  </td><td>0                   </td><td>0                 </td><td>8816 </td></tr>\n",
       "<tr><td>0      </td><td>15979.0           </td><td>1 Ave & E 15 St        </td><td>173.0             </td><td>10     </td><td>Tue        </td><td>10.600000000000001</td><td>0.4315             </td><td>0.0                 </td><td>23.900000000000002</td><td>     </td></tr>\n",
       "<tr><td>1      </td><td>15979.0           </td><td>1 Ave & E 18 St        </td><td>118.0             </td><td>10     </td><td>Tue        </td><td>10.600000000000001</td><td>0.4315             </td><td>0.0                 </td><td>23.900000000000002</td><td>     </td></tr>\n",
       "<tr><td>2      </td><td>15979.0           </td><td>1 Ave & E 30 St        </td><td>152.0             </td><td>10     </td><td>Tue        </td><td>10.600000000000001</td><td>0.4315             </td><td>0.0                 </td><td>23.900000000000002</td><td>     </td></tr>\n",
       "<tr><td>3      </td><td>15979.0           </td><td>10 Ave & W 28 St       </td><td>115.0             </td><td>10     </td><td>Tue        </td><td>10.600000000000001</td><td>0.4315             </td><td>0.0                 </td><td>23.900000000000002</td><td>     </td></tr>\n",
       "<tr><td>4      </td><td>15979.0           </td><td>11 Ave & W 27 St       </td><td>210.0             </td><td>10     </td><td>Tue        </td><td>10.600000000000001</td><td>0.4315             </td><td>0.0                 </td><td>23.900000000000002</td><td>     </td></tr>\n",
       "<tr><td>5      </td><td>15979.0           </td><td>11 Ave & W 41 St       </td><td>106.0             </td><td>10     </td><td>Tue        </td><td>10.600000000000001</td><td>0.4315             </td><td>0.0                 </td><td>23.900000000000002</td><td>     </td></tr>\n",
       "<tr><td>6      </td><td>15979.0           </td><td>12 Ave & W 40 St       </td><td>144.0             </td><td>10     </td><td>Tue        </td><td>10.600000000000001</td><td>0.4315             </td><td>0.0                 </td><td>23.900000000000002</td><td>     </td></tr>\n",
       "<tr><td>7      </td><td>15979.0           </td><td>2 Ave & E 31 St        </td><td>206.0             </td><td>10     </td><td>Tue        </td><td>10.600000000000001</td><td>0.4315             </td><td>0.0                 </td><td>23.900000000000002</td><td>     </td></tr>\n",
       "<tr><td>8      </td><td>15979.0           </td><td>2 Ave & E 58 St        </td><td>105.0             </td><td>10     </td><td>Tue        </td><td>10.600000000000001</td><td>0.4315             </td><td>0.0                 </td><td>23.900000000000002</td><td>     </td></tr>\n",
       "<tr><td>9      </td><td>15979.0           </td><td>3 Ave & Schermerhorn St</td><td>15.0              </td><td>10     </td><td>Tue        </td><td>10.600000000000001</td><td>0.4315             </td><td>0.0                 </td><td>23.900000000000002</td><td>     </td></tr>\n",
       "</tbody>\n",
       "</table>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<table>\n",
       "<thead>\n",
       "<tr><th style=\"text-align: right;\">  Days</th><th>start station name     </th><th style=\"text-align: right;\">  bikes</th><th style=\"text-align: right;\">  Month</th><th>DayOfWeek  </th><th style=\"text-align: right;\">  Dew Point (C)</th><th style=\"text-align: right;\">  Humidity Fraction</th><th style=\"text-align: right;\">  Rain (mm)</th><th style=\"text-align: right;\">  Temperature (C)</th><th>WC1  </th></tr>\n",
       "</thead>\n",
       "<tbody>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>1 Ave & E 15 St        </td><td style=\"text-align: right;\">    173</td><td style=\"text-align: right;\">     10</td><td>Tue        </td><td style=\"text-align: right;\">           10.6</td><td style=\"text-align: right;\">             0.4315</td><td style=\"text-align: right;\">          0</td><td style=\"text-align: right;\">             23.9</td><td>     </td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>1 Ave & E 18 St        </td><td style=\"text-align: right;\">    118</td><td style=\"text-align: right;\">     10</td><td>Tue        </td><td style=\"text-align: right;\">           10.6</td><td style=\"text-align: right;\">             0.4315</td><td style=\"text-align: right;\">          0</td><td style=\"text-align: right;\">             23.9</td><td>     </td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>1 Ave & E 30 St        </td><td style=\"text-align: right;\">    152</td><td style=\"text-align: right;\">     10</td><td>Tue        </td><td style=\"text-align: right;\">           10.6</td><td style=\"text-align: right;\">             0.4315</td><td style=\"text-align: right;\">          0</td><td style=\"text-align: right;\">             23.9</td><td>     </td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>10 Ave & W 28 St       </td><td style=\"text-align: right;\">    115</td><td style=\"text-align: right;\">     10</td><td>Tue        </td><td style=\"text-align: right;\">           10.6</td><td style=\"text-align: right;\">             0.4315</td><td style=\"text-align: right;\">          0</td><td style=\"text-align: right;\">             23.9</td><td>     </td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>11 Ave & W 27 St       </td><td style=\"text-align: right;\">    210</td><td style=\"text-align: right;\">     10</td><td>Tue        </td><td style=\"text-align: right;\">           10.6</td><td style=\"text-align: right;\">             0.4315</td><td style=\"text-align: right;\">          0</td><td style=\"text-align: right;\">             23.9</td><td>     </td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>11 Ave & W 41 St       </td><td style=\"text-align: right;\">    106</td><td style=\"text-align: right;\">     10</td><td>Tue        </td><td style=\"text-align: right;\">           10.6</td><td style=\"text-align: right;\">             0.4315</td><td style=\"text-align: right;\">          0</td><td style=\"text-align: right;\">             23.9</td><td>     </td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>12 Ave & W 40 St       </td><td style=\"text-align: right;\">    144</td><td style=\"text-align: right;\">     10</td><td>Tue        </td><td style=\"text-align: right;\">           10.6</td><td style=\"text-align: right;\">             0.4315</td><td style=\"text-align: right;\">          0</td><td style=\"text-align: right;\">             23.9</td><td>     </td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>2 Ave & E 31 St        </td><td style=\"text-align: right;\">    206</td><td style=\"text-align: right;\">     10</td><td>Tue        </td><td style=\"text-align: right;\">           10.6</td><td style=\"text-align: right;\">             0.4315</td><td style=\"text-align: right;\">          0</td><td style=\"text-align: right;\">             23.9</td><td>     </td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>2 Ave & E 58 St        </td><td style=\"text-align: right;\">    105</td><td style=\"text-align: right;\">     10</td><td>Tue        </td><td style=\"text-align: right;\">           10.6</td><td style=\"text-align: right;\">             0.4315</td><td style=\"text-align: right;\">          0</td><td style=\"text-align: right;\">             23.9</td><td>     </td></tr>\n",
       "<tr><td style=\"text-align: right;\"> 15979</td><td>3 Ave & Schermerhorn St</td><td style=\"text-align: right;\">     15</td><td style=\"text-align: right;\">     10</td><td>Tue        </td><td style=\"text-align: right;\">           10.6</td><td style=\"text-align: right;\">             0.4315</td><td style=\"text-align: right;\">          0</td><td style=\"text-align: right;\">             23.9</td><td>     </td></tr>\n",
       "</tbody>\n",
       "</table>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# ----------\n",
    "# 6 - Join the weather data-per-day to the bike-starts-per-day\n",
    "print(\"Merge Daily Weather with Bikes-Per-Day\")\n",
    "bpd_with_weather = bpd.merge(wthr4,all_x=True,all_y=False)\n",
    "bpd_with_weather.describe()\n",
    "bpd_with_weather.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training data has 10 columns and 6066 rows, test has 3044 rows, holdout has 1021\n",
      "gbm Model Build progress: |███████████████████████████████████████████████| 100%\n",
      "drf Model Build progress: |███████████████████████████████████████████████| 100%\n",
      "glm Model Build progress: |███████████████████████████████████████████████| 100%\n",
      "deeplearning Model Build progress: |██████████████████████████████████████| 100%\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td><b>Model</b></td>\n",
       "<td><b>mse TRAIN</b></td>\n",
       "<td><b>mse TEST</b></td>\n",
       "<td><b>mse HOLDOUT</b></td>\n",
       "<td><b>Model Training Time (s)</b></td></tr>\n",
       "<tr><td>GBM</td>\n",
       "<td>0.2159977</td>\n",
       "<td>393.0248269</td>\n",
       "<td>404.2520310</td>\n",
       "<td>8.679</td></tr>\n",
       "<tr><td>DRF</td>\n",
       "<td>804.2152039</td>\n",
       "<td>1703.1540562</td>\n",
       "<td>1782.0854925</td>\n",
       "<td>6.573</td></tr>\n",
       "<tr><td>GLM</td>\n",
       "<td>620.8814844</td>\n",
       "<td>735.9622856</td>\n",
       "<td>789.7891737</td>\n",
       "<td>0.241</td></tr>\n",
       "<tr><td>DL </td>\n",
       "<td>213.8582644</td>\n",
       "<td>454.7871732</td>\n",
       "<td>476.5995571</td>\n",
       "<td>7.518</td></tr></table></div>"
      ],
      "text/plain": [
       "Model      mse TRAIN    mse TEST    mse HOLDOUT    Model Training Time (s)\n",
       "-------  -----------  ----------  -------------  -------------------------\n",
       "GBM         0.215998     393.025        404.252                      8.679\n",
       "DRF       804.215       1703.15        1782.09                       6.573\n",
       "GLM       620.881        735.962        789.789                      0.241\n",
       "DL        213.858        454.787        476.6                        7.518"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 7 - Test/Train split again, model build again, this time with weather\n",
    "split_fit_predict(bpd_with_weather)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}