{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import h2o\n", "import time\n", "from h2o.estimators.glm import H2OGeneralizedLinearEstimator\n", "from h2o.estimators.gbm import H2OGradientBoostingEstimator\n", "from h2o.estimators.random_forest import H2ORandomForestEstimator\n", "from h2o.estimators.deeplearning import H2ODeepLearningEstimator" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Checking whether there is an H2O instance running at http://localhost:54321. connected.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
H2O cluster uptime:2 mins 11 secs
H2O cluster version:3.11.0.99999
H2O cluster version age:3 minutes
H2O cluster name:pasha
H2O cluster total nodes:1
H2O cluster free memory:3.399 Gb
H2O cluster total cores:8
H2O cluster allowed cores:8
H2O cluster status:locked, healthy
H2O connection url:http://localhost:54321
H2O connection proxy:None
Python version:3.5.2 final
" ], "text/plain": [ "-------------------------- ----------------------\n", "H2O cluster uptime: 2 mins 11 secs\n", "H2O cluster version: 3.11.0.99999\n", "H2O cluster version age: 3 minutes\n", "H2O cluster name: pasha\n", "H2O cluster total nodes: 1\n", "H2O cluster free memory: 3.399 Gb\n", "H2O cluster total cores: 8\n", "H2O cluster allowed cores: 8\n", "H2O cluster status: locked, healthy\n", "H2O connection url: http://localhost:54321\n", "H2O connection proxy:\n", "Python version: 3.5.2 final\n", "-------------------------- ----------------------" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Explore a typical Data Science workflow with H2O and Python\n", "#\n", "# Goal: assist the manager of CitiBike of NYC to load-balance the bicycles\n", "# across the CitiBike network of stations, by predicting the number of bike\n", "# trips taken from the station every day. Use 10 million rows of historical\n", "# data, and eventually add weather data.\n", "\n", "\n", "# Connect to a cluster\n", "h2o.init()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.\n", "\n", "# Set this to True if you want to fetch the data directly from S3.\n", "# This is useful if your cluster is running in EC2.\n", "data_source_is_s3 = False\n", "\n", "def mylocate(s):\n", " if data_source_is_s3:\n", " return \"s3n://h2o-public-test-data/\" + s\n", " else:\n", " return _locate(s)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Import and Parse bike data\n", "Parse progress: |█████████████████████████████████████████████████████████| 100%\n" ] } ], "source": [ "# Pick either the big or the small demo.\n", "# Big data is 10M rows\n", "small_test = [mylocate(\"bigdata/laptop/citibike-nyc/2013-10.csv\")]\n", "big_test = [mylocate(\"bigdata/laptop/citibike-nyc/2013-07.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2013-08.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2013-09.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2013-10.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2013-11.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2013-12.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-01.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-02.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-03.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-04.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-05.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-06.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-07.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-08.csv\")]\n", "\n", "# ----------\n", "\n", "# 1- Load data - 1 row per bicycle trip. Has columns showing the start and end\n", "# station, trip duration and trip start time and day. The larger dataset\n", "# totals about 10 million rows\n", "print(\"Import and Parse bike data\")\n", "data = h2o.import_file(path=big_test)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows:10407546\n", "Cols:16\n", "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
tripduration starttime stoptime start station id start station name start station latitude start station longitude end station id end station name end station latitude end station longitude bikeid usertype birth year gender Days
type int time time int enum real real int enum real real int enum int int int
mins 60.0 1372636800000.0 1372637042000.0 72.0 40.680342423 -74.01713445 72.0 40.680342423 -74.01713445 14529.0 1899.0 0.0 15887.0
mean 868.96872605703651390974078989.41 1390974947925.281 444.85883540654095 40.734381982315185 -73.99105701820217 445.2597855440662 40.73408688953741 -73.99117077985979 17895.66183584484 1975.79892394045461.084146541365275816098.629260922817
maxs 6250750.0 1409529587000.0 1409538405000.0 3002.0 40.771522000000004 -73.9500479759 3002.0 40.771522000000004 -73.9500479759 21689.0 1998.0 2.0 16313.0
sigma 2985.10540532014511806736501.937712 11806714056.539324 355.7559897645294 0.01971005087361252 0.012345332018503775 360.0703808439832 0.019730957863268683 0.012431186159808448 1938.8051788415307 11.1327849049867510.5630197777940005136.6534484381553
zeros 0 0 0 0 0 0 0 0 0 0 0 1248517 0
missing0 0 0 0 0 0 0 0 0 0 0 0 0 1247644 0 0
0 634.0 2013-07-01 00:00:002013-07-01 00:10:34164.0 E 47 St & 2 Ave 40.75323098 -73.97032517 504.0 1 Ave & E 15 St 40.732218530000004 -73.98165557 16950.0 Customer nan 0.0 15887.0
1 1547.0 2013-07-01 00:00:022013-07-01 00:25:49388.0 W 26 St & 10 Ave 40.749717753000006 -74.002950346 459.0 W 20 St & 11 Ave 40.746745 -74.007756 19816.0 Customer nan 0.0 15887.0
2 178.0 2013-07-01 00:01:042013-07-01 00:04:02293.0 Lafayette St & E 8 St 40.73028666 -73.9907647 237.0 E 11 St & 2 Ave 40.730473090000004 -73.98672378 14548.0 Subscriber1980.0 2.0 15887.0
3 1580.0 2013-07-01 00:01:062013-07-01 00:27:26531.0 Forsyth St & Broome St 40.71893904 -73.99266288 499.0 Broadway & W 60 St 40.76915505 -73.98191841 16063.0 Customer nan 0.0 15887.0
4 757.0 2013-07-01 00:01:102013-07-01 00:13:47382.0 University Pl & E 14 St40.73492695 -73.99200509 410.0 Suffolk St & Stanton St 40.72066442 -73.98517977 19213.0 Subscriber1986.0 1.0 15887.0
5 861.0 2013-07-01 00:01:232013-07-01 00:15:44511.0 E 14 St & Avenue B 40.72938685 -73.97772429 454.0 E 51 St & 1 Ave 40.75455731 -73.96592976000001 16223.0 Subscriber1988.0 1.0 15887.0
6 550.0 2013-07-01 00:01:592013-07-01 00:11:09293.0 Lafayette St & E 8 St 40.73028666 -73.9907647 394.0 E 9 St & Avenue C 40.72521311 -73.97768752 16746.0 Customer nan 0.0 15887.0
7 288.0 2013-07-01 00:02:162013-07-01 00:07:04224.0 Spruce St & Nassau St 40.71146364 -74.00552427 376.0 John St & William St 40.70862144 -74.00722156 16062.0 Subscriber1985.0 2.0 15887.0
8 766.0 2013-07-01 00:02:162013-07-01 00:15:02432.0 E 7 St & Avenue A 40.72621788 -73.98379855 336.0 Sullivan St & Washington Sq40.730477470000004 -73.99906065 17963.0 Subscriber1980.0 2.0 15887.0
9 773.0 2013-07-01 00:02:232013-07-01 00:15:16173.0 Broadway & W 49 St 40.76064679 -73.98442659 479.0 9 Ave & W 45 St 40.760192520000004 -73.9912551 19365.0 Subscriber1989.0 1.0 15887.0
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# ----------\n", "\n", "# 2- light data munging: group the bike starts per-day, converting the 10M rows\n", "# of trips to about 140,000 station&day combos - predicting the number of trip\n", "# starts per-station-per-day.\n", "\n", "# Convert start time to: Day since the Epoch\n", "startime = data[\"starttime\"]\n", "secsPerDay = 1000 * 3600 * 24\n", "data[\"Days\"] = (startime.asnumeric() / secsPerDay).floor()\n", "data.describe()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Daysstart station name bikes
158871 Ave & E 15 St 74
158871 Ave & E 18 St 51
158871 Ave & E 30 St 66
158871 Ave & E 44 St 56
1588710 Ave & W 28 St 51
1588711 Ave & W 27 St 65
1588711 Ave & W 41 St 53
1588712 Ave & W 40 St 36
158872 Ave & E 31 St 96
158872 Ave & E 58 St 103
" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Rows:138795\n", "Cols:3\n", "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Days start station name bikes
type int enum int
mins 15887.0 1.0
mean 16099.488166000216 74.98502107424619
maxs 16313.0 668.0
sigma 123.39632568805678 64.73186265524505
zeros 0 0
missing0 0 0
0 15887.0 1 Ave & E 15 St 74.0
1 15887.0 1 Ave & E 18 St 51.0
2 15887.0 1 Ave & E 30 St 66.0
3 15887.0 1 Ave & E 44 St 56.0
4 15887.0 10 Ave & W 28 St 51.0
5 15887.0 11 Ave & W 27 St 65.0
6 15887.0 11 Ave & W 41 St 53.0
7 15887.0 12 Ave & W 40 St 36.0
8 15887.0 2 Ave & E 31 St 96.0
9 15887.0 2 Ave & E 58 St 103.0
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[138795, 3]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Now do a monster Group-By. Count bike starts per-station per-day. Ends up\n", "# with about 340 stations times 400 days (140,000 rows). This is what we want\n", "# to predict.\n", "grouped = data.group_by([\"Days\",\"start station name\"])\n", "bpd = grouped.count().get_frame() # Compute bikes-per-day\n", "bpd.set_name(2,\"bikes\")\n", "bpd.show()\n", "bpd.describe()\n", "bpd.dim" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quantiles of bikes-per-day\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Probs bikesQuantiles
0.01 2
0.1 11
0.25 26
0.333 35
0.5 59
0.667 89
0.75 107
0.9 158
0.99 293
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Quantiles: the data is fairly unbalanced; some station/day combos are wildly\n", "# more popular than others.\n", "print(\"Quantiles of bikes-per-day\")\n", "bpd[\"bikes\"].quantile().show()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Bikes-Per-Day\n", "Rows:138795\n", "Cols:5\n", "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Days start station name bikes Month DayOfWeek
type int enum int enum enum
mins 15887.0 1.0
mean 16099.488166000216 74.98502107424619
maxs 16313.0 668.0
sigma 123.39632568805678 64.73186265524505
zeros 0 0
missing0 0 0 0 0
0 15887.0 1 Ave & E 15 St 74.0 7 Mon
1 15887.0 1 Ave & E 18 St 51.0 7 Mon
2 15887.0 1 Ave & E 30 St 66.0 7 Mon
3 15887.0 1 Ave & E 44 St 56.0 7 Mon
4 15887.0 10 Ave & W 28 St 51.0 7 Mon
5 15887.0 11 Ave & W 27 St 65.0 7 Mon
6 15887.0 11 Ave & W 41 St 53.0 7 Mon
7 15887.0 12 Ave & W 40 St 36.0 7 Mon
8 15887.0 2 Ave & E 31 St 96.0 7 Mon
9 15887.0 2 Ave & E 58 St 103.0 7 Mon
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# A little feature engineering\n", "# Add in month-of-year (seasonality; fewer bike rides in winter than summer)\n", "secs = bpd[\"Days\"]*secsPerDay\n", "bpd[\"Month\"] = secs.month().asfactor()\n", "# Add in day-of-week (work-week; more bike rides on Sunday than Monday)\n", "bpd[\"DayOfWeek\"] = secs.dayOfWeek()\n", "print(\"Bikes-Per-Day\")\n", "bpd.describe()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# ----------\n", "# 3- Fit a model on train; using test as validation\n", "\n", "# Function for doing class test/train/holdout split\n", "def split_fit_predict(data):\n", " global gbm0,drf0,glm0,dl0\n", " # Classic Test/Train split\n", " r = data['Days'].runif() # Random UNIForm numbers, one per row\n", " train = data[ r < 0.6]\n", " test = data[(0.6 <= r) & (r < 0.9)]\n", " hold = data[ 0.9 <= r ]\n", " print(\"Training data has\",train.ncol,\"columns and\",train.nrow,\"rows, test has\",test.nrow,\"rows, holdout has\",hold.nrow)\n", " bike_names_x = data.names\n", " bike_names_x.remove(\"bikes\")\n", " \n", " # Run GBM\n", " s = time.time()\n", " \n", " gbm0 = H2OGradientBoostingEstimator(ntrees=500, # 500 works well\n", " max_depth=6,\n", " learn_rate=0.1)\n", " \n", "\n", " gbm0.train(x =bike_names_x,\n", " y =\"bikes\",\n", " training_frame =train,\n", " validation_frame=test)\n", "\n", " gbm_elapsed = time.time() - s\n", "\n", " # Run DRF\n", " s = time.time()\n", " \n", " drf0 = H2ORandomForestEstimator(ntrees=250, max_depth=30)\n", "\n", " drf0.train(x =bike_names_x,\n", " y =\"bikes\",\n", " training_frame =train,\n", " validation_frame=test)\n", " \n", " drf_elapsed = time.time() - s \n", " \n", " \n", " # Run GLM\n", " if \"WC1\" in bike_names_x: bike_names_x.remove(\"WC1\")\n", " s = time.time()\n", "\n", " glm0 = H2OGeneralizedLinearEstimator(Lambda=[1e-5], family=\"poisson\")\n", " \n", " glm0.train(x =bike_names_x,\n", " y =\"bikes\",\n", " training_frame =train,\n", " validation_frame=test)\n", "\n", " glm_elapsed = time.time() - s\n", " \n", " # Run DL\n", " s = time.time()\n", "\n", " dl0 = H2ODeepLearningEstimator(hidden=[50,50,50,50], epochs=50)\n", " \n", " dl0.train(x =bike_names_x,\n", " y =\"bikes\",\n", " training_frame =train,\n", " validation_frame=test)\n", " \n", " dl_elapsed = time.time() - s\n", " \n", " # ----------\n", " # 4- Score on holdout set & report\n", " train_mse_gbm = gbm0.model_performance(train).mse()\n", " test_mse_gbm = gbm0.model_performance(test ).mse()\n", " hold_mse_gbm = gbm0.model_performance(hold ).mse()\n", "# print \"GBM mse TRAIN=\",train_mse_gbm,\", mse TEST=\",test_mse_gbm,\", mse HOLDOUT=\",hold_mse_gbm\n", " \n", " train_mse_drf = drf0.model_performance(train).mse()\n", " test_mse_drf = drf0.model_performance(test ).mse()\n", " hold_mse_drf = drf0.model_performance(hold ).mse()\n", "# print \"DRF mse TRAIN=\",train_mse_drf,\", mse TEST=\",test_mse_drf,\", mse HOLDOUT=\",hold_mse_drf\n", " \n", " train_mse_glm = glm0.model_performance(train).mse()\n", " test_mse_glm = glm0.model_performance(test ).mse()\n", " hold_mse_glm = glm0.model_performance(hold ).mse()\n", "# print \"GLM mse TRAIN=\",train_mse_glm,\", mse TEST=\",test_mse_glm,\", mse HOLDOUT=\",hold_mse_glm\n", " \n", " train_mse_dl = dl0.model_performance(train).mse()\n", " test_mse_dl = dl0.model_performance(test ).mse()\n", " hold_mse_dl = dl0.model_performance(hold ).mse()\n", "# print \" DL mse TRAIN=\",train_mse_dl,\", mse TEST=\",test_mse_dl,\", mse HOLDOUT=\",hold_mse_dl\n", " \n", " # make a pretty HTML table printout of the results\n", "\n", " header = [\"Model\", \"mse TRAIN\", \"mse TEST\", \"mse HOLDOUT\", \"Model Training Time (s)\"]\n", " table = [\n", " [\"GBM\", train_mse_gbm, test_mse_gbm, hold_mse_gbm, round(gbm_elapsed,3)],\n", " [\"DRF\", train_mse_drf, test_mse_drf, hold_mse_drf, round(drf_elapsed,3)],\n", " [\"GLM\", train_mse_glm, test_mse_glm, hold_mse_glm, round(glm_elapsed,3)],\n", " [\"DL \", train_mse_dl, test_mse_dl, hold_mse_dl , round(dl_elapsed,3) ],\n", " ]\n", " return h2o.display.H2OTableDisplay(table, columns_labels=header)\n", " # --------------" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data has 5 columns and 83197 rows, test has 41792 rows, holdout has 13806\n", "gbm Model Build progress: |███████████████████████████████████████████████| 100%\n", "drf Model Build progress: |███████████████████████████████████████████████| 100%\n", "glm Model Build progress: |███████████████████████████████████████████████| 100%\n", "deeplearning Model Build progress: |██████████████████████████████████████| 100%\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Modelmse TRAINmse TESTmse HOLDOUTModel Training Time (s)
GBM144.1978206337.7233790324.867673112.454
DRF599.8996508723.9889710685.017730319.764
GLM967.1714929957.3634230938.14397300.454
DL 529.4687702622.4935714607.327402559.935
" ], "text/plain": [ "Model mse TRAIN mse TEST mse HOLDOUT Model Training Time (s)\n", "------- ----------- ---------- ------------- -------------------------\n", "GBM 144.198 337.723 324.868 12.454\n", "DRF 599.9 723.989 685.018 19.764\n", "GLM 967.171 957.363 938.144 0.454\n", "DL 529.469 622.494 607.327 59.935" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Split the data (into test & train), fit some models and predict on the holdout data\n", "split_fit_predict(bpd)\n", "# Here we see an r^2 of 0.91 for GBM, and 0.71 for GLM. This means given just\n", "# the station, the month, and the day-of-week we can predict 90% of the\n", "# variance of the bike-trip-starts." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Parse progress: |█████████████████████████████████████████████████████████| 100%\n", "Rows:17520\n", "Cols:50\n", "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Year Local Month Local Day Local Hour Local Year UTC Month UTC Day UTC Hour UTC Cavok Reported Cloud Ceiling (m) Cloud Cover Fraction Cloud Cover Fraction 1 Cloud Cover Fraction 2 Cloud Cover Fraction 3 Cloud Cover Fraction 4 Cloud Cover Fraction 5 Cloud Cover Fraction 6 Cloud Height (m) 1 Cloud Height (m) 2 Cloud Height (m) 3 Cloud Height (m) 4 Cloud Height (m) 5 Cloud Height (m) 6 Dew Point (C) Humidity Fraction Precipitation One Hour (mm) Pressure Altimeter (mbar) Pressure Sea Level (mbar) Pressure Station (mbar) Snow Depth (cm) Temperature (C) Visibility (km) Weather Code 1 Weather Code 1/ Description Weather Code 2 Weather Code 2/ Description Weather Code 3 Weather Code 3/ Description Weather Code 4 Weather Code 4/ Description Weather Code 5 Weather Code 5/ Description Weather Code 6 Weather Code 6/ Description Weather Code Most Severe / Icon Code Weather Code Most Severe Weather Code Most Severe / Description Wind Direction (degrees) Wind Gust (m/s) Wind Speed (m/s)
type int int int int int int int int int real real real real real int int int real real real int int int real real real real int int int real real int enum int enum int enum int enum int enum int enum int int enum int real real
mins 2013.0 1.0 1.0 0.0 2013.0 1.0 1.0 0.0 0.0 61.0 0.0 0.0 0.25 0.5 NaN NaN NaN 60.96 213.36 365.76 NaN NaN NaN -26.7000000000000030.125100000000000020.0 983.2949000000001 NaN NaN NaN -15.6000000000000010.001 1.0 1.0 1.0 1.0 1.0 3.0 0.0 1.0 10.0 7.2 0.0
mean 2013.5 6.526027397260274515.7205479452054811.5000000000000042013.50057077625576.52511415525114115.7213470319634711.5001141552511420.0 1306.3119584569736 0.4167424905220181 0.3612073490813649 0.8724453840732911 0.9630456852791879 0.0 0.0 0.0 1293.9822681953192 1643.7390016566796 2084.8938637563456 0.0 0.0 0.0 4.313046467655992 0.5967363891594567 1.3799301075268817 1017.8258144055944 0.0 0.0 0.0 12.578909070073914 14.3914429682020094.84251968503937 3.6586768935762226 2.8466076696165192 2.0114942528735633 4.125 3.0 1.3784817351598173 4.84251968503937 194.69525681985743 9.422169480726348 2.4103288784874057
maxs 2014.0 12.0 31.0 23.0 2015.0 12.0 31.0 23.0 0.0 3657.6000000000004 1.0 1.0 1.0 1.0 NaN NaN NaN 3657.5999 3657.5999 3657.5999 NaN NaN NaN 24.400000000000002 1.0 26.924 1042.2113 NaN NaN NaN 36.1 16.0934 60.0 60.0 36.0 27.0 27.0 3.0 16.0 60.0 360.0 20.58000000000000210.8
sigma 0.5000142700172623.447949723847773 8.7964980485232726.922384111875021 0.50058441171579 3.4478240545776478.7956148886847176.922301652025526 0.0 995.3398569657211 0.4627208309925301 0.42770569708047684 0.19715569036704708 0.08610155981044185 -0.0 -0.0 -0.0 962.7430958537232 916.7386134899587 887.2158475113932 -0.0 -0.0 -0.0 10.973128209713666 0.185792011865734962.5621512917896463 7.464516971789659 -0.0 -0.0 -0.0 10.039673953091574 3.69893623033404945.704865769828319 6.133862539123368 5.805532863642112 3.1234084426128437 6.15223536610881 0.0 4.073860627017756 5.704865769828319 106.3500000314393 1.81511871115241541.614697905241178
zeros 0 0 0 730 0 0 0 730 17455 0 8758 8758 0 0 0 0 0 0 0 0 0 0 0 268 0 501 0 0 0 0 269 0 0 0 0 0 0 0 14980 0 0 0 2768
missing0 0 0 0 0 0 0 0 65 10780 375 375 14682 16535 17520 17520 17520 9103 14683 16535 17520 17520 17520 67 67 15660 360 17520 17520 17520 67 412 14980 14980 16477 16477 17181 17181 17433 17433 17504 17504 17518 17518 0 14980 14980 9382 14381 1283
0 2013.0 1.0 1.0 0.0 2013.0 1.0 1.0 5.0 0.0 2895.6000000000004 1.0 0.9 1.0 nan nan nan nan 2895.5999 3352.8 nan nan nan nan -5.0 0.5447000000000001 nan 1013.0917000000001 nan nan nan 3.3000000000000003 16.0934 nan nan nan nan nan nan 0.0 nan nan nan 2.57
1 2013.0 1.0 1.0 1.0 2013.0 1.0 1.0 6.0 0.0 3048.0 1.0 1.0 nan nan nan nan nan 3048.0 nan nan nan nan nan -4.4 0.5463 nan 1012.0759 nan nan nan 3.9000000000000004 16.0934 nan nan nan nan nan nan 0.0 nan 260.0 9.77 4.63
2 2013.0 1.0 1.0 2.0 2013.0 1.0 1.0 7.0 0.0 1828.8000000000002 1.0 1.0 nan nan nan nan nan 1828.7999 nan nan nan nan nan -3.30000000000000030.619 nan 1012.4145000000001 nan nan nan 3.3000000000000003 16.0934 nan nan nan nan nan nan 0.0 nan nan 7.72 1.54
3 2013.0 1.0 1.0 3.0 2013.0 1.0 1.0 8.0 0.0 1463.0 1.0 1.0 nan nan nan nan nan 1463.04 nan nan nan nan nan -2.80000000000000030.6159 nan 1012.4145000000001 nan nan nan 3.9000000000000004 16.0934 nan nan nan nan nan nan 0.0 nan nan nan 3.09
4 2013.0 1.0 1.0 4.0 2013.0 1.0 1.0 9.0 0.0 1402.1000000000001 1.0 1.0 nan nan nan nan nan 1402.0800000000002 nan nan nan nan nan -2.80000000000000030.6159 nan 1012.7531 nan nan nan 3.9000000000000004 16.0934 nan nan nan nan nan nan 0.0 nan 260.0 nan 4.12
5 2013.0 1.0 1.0 5.0 2013.0 1.0 1.0 10.0 0.0 1524.0 1.0 1.0 nan nan nan nan nan 1524.0 nan nan nan nan nan -2.80000000000000030.6159 nan 1012.4145000000001 nan nan nan 3.9000000000000004 16.0934 nan nan nan nan nan nan 0.0 nan nan nan 3.09
6 2013.0 1.0 1.0 6.0 2013.0 1.0 1.0 11.0 0.0 1524.0 1.0 1.0 nan nan nan nan nan 1524.0 nan nan nan nan nan -3.30000000000000030.5934 nan 1012.0759 nan nan nan 3.9000000000000004 16.0934 nan nan nan nan nan nan 0.0 nan nan 9.26 3.09
7 2013.0 1.0 1.0 7.0 2013.0 1.0 1.0 12.0 0.0 1524.0 1.0 1.0 nan nan nan nan nan 1524.0 nan nan nan nan nan -3.30000000000000030.5934 nan 1012.4145000000001 nan nan nan 3.9000000000000004 16.0934 nan nan nan nan nan nan 0.0 nan 260.0 9.26 4.63
8 2013.0 1.0 1.0 8.0 2013.0 1.0 1.0 13.0 0.0 1524.0 1.0 1.0 nan nan nan nan nan 1524.0 nan nan nan nan nan -2.80000000000000030.6425000000000001 nan 1012.4145000000001 nan nan nan 3.3000000000000003 16.0934 nan nan nan nan nan nan 0.0 nan 260.0 nan 3.09
9 2013.0 1.0 1.0 9.0 2013.0 1.0 1.0 14.0 0.0 1524.0 1.0 0.9 1.0 nan nan nan nan 1524.0 3657.5999 nan nan nan nan -2.80000000000000030.6159 nan 1012.4145000000001 nan nan nan 3.9000000000000004 16.0934 nan nan nan nan nan nan 0.0 nan nan 9.26 3.09
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# ----------\n", "# 5- Now lets add some weather\n", "# Load weather data\n", "wthr1 = h2o.import_file(path=[mylocate(\"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv\")])\n", "# Peek at the data\n", "wthr1.describe()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows:17520\n", "Cols:9\n", "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Year Local Month Local Day Local Hour Local Dew Point (C) Humidity Fraction Rain (mm) Temperature (C) WC1
type int int int int real real real real enum
mins 2013.0 1.0 1.0 0.0 -26.7000000000000030.125100000000000020.0 -15.600000000000001
mean 2013.5 6.526027397260274515.7205479452054811.5000000000000044.313046467655992 0.5967363891594567 1.379930107526881712.578909070073914
maxs 2014.0 12.0 31.0 23.0 24.400000000000002 1.0 26.924 36.1
sigma 0.5000142700172623.447949723847773 8.7964980485232726.922384111875021 10.973128209713666 0.185792011865734962.562151291789646310.039673953091574
zeros 0 0 0 730 268 0 501 269
missing0 0 0 0 67 67 15660 67 14980
0 2013.0 1.0 1.0 0.0 -5.0 0.5447000000000001 nan 3.3000000000000003
1 2013.0 1.0 1.0 1.0 -4.4 0.5463 nan 3.9000000000000004
2 2013.0 1.0 1.0 2.0 -3.30000000000000030.619 nan 3.3000000000000003
3 2013.0 1.0 1.0 3.0 -2.80000000000000030.6159 nan 3.9000000000000004
4 2013.0 1.0 1.0 4.0 -2.80000000000000030.6159 nan 3.9000000000000004
5 2013.0 1.0 1.0 5.0 -2.80000000000000030.6159 nan 3.9000000000000004
6 2013.0 1.0 1.0 6.0 -3.30000000000000030.5934 nan 3.9000000000000004
7 2013.0 1.0 1.0 7.0 -3.30000000000000030.5934 nan 3.9000000000000004
8 2013.0 1.0 1.0 8.0 -2.80000000000000030.6425000000000001 nan 3.3000000000000003
9 2013.0 1.0 1.0 9.0 -2.80000000000000030.6159 nan 3.9000000000000004
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Lots of columns in there! Lets plan on converting to time-since-epoch to do\n", "# a 'join' with the bike data, plus gather weather info that might affect\n", "# cyclists - rain, snow, temperature. Alas, drop the \"snow\" column since it's\n", "# all NA's. Also add in dew point and humidity just in case. Slice out just\n", "# the columns of interest and drop the rest.\n", "wthr2 = wthr1[[\"Year Local\",\"Month Local\",\"Day Local\",\"Hour Local\",\"Dew Point (C)\",\"Humidity Fraction\",\"Precipitation One Hour (mm)\",\"Temperature (C)\",\"Weather Code 1/ Description\"]]\n", "\n", "wthr2.set_name(wthr2.names.index(\"Precipitation One Hour (mm)\"), \"Rain (mm)\")\n", "wthr2.set_name(wthr2.names.index(\"Weather Code 1/ Description\"), \"WC1\")\n", "wthr2.describe()\n", "# Much better! " ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Filter down to the weather at Noon\n", "wthr3 = wthr2[ wthr2[\"Hour Local\"]==12 ]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows:730\n", "Cols:11\n", "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Year Local Month Local Day Local Hour Local Dew Point (C) Humidity Fraction Rain (mm) Temperature (C) WC1 msec Days
type int int int int real real real real enum int int
mins 2013.0 1.0 1.0 12.0 -26.7000000000000030.1723 0.0 -13.9 1357070400000.0 15706.0
mean 2013.5 6.52602739726027415.7205479452054812.0 4.230123796423659 0.539728198074278 1.531257142857142914.068775790921595 1388560852602.739716070.5
maxs 2014.0 12.0 31.0 12.0 23.3 1.0 12.446 34.4 1420056000000.0 16435.0
sigma 0.50034281800391723.4502152930681498.8022780270096150.0 11.106296472475226 0.179945027923243272.360642486149058710.398985514891212 18219740080.410755210.87713642466474
zeros 0 0 0 0 14 0 15 7 0 0
missing0 0 0 0 3 3 660 3 620 0 0
0 2013.0 1.0 1.0 12.0 -3.30000000000000030.5934 nan 3.9000000000000004 1357070400000.0 15706.0
1 2013.0 1.0 2.0 12.0 -11.7000000000000010.4806 nan -2.2 1357156800000.0 15707.0
2 2013.0 1.0 3.0 12.0 -10.6000000000000010.5248 nan -2.2 1357243200000.0 15708.0
3 2013.0 1.0 4.0 12.0 -7.2 0.49760000000000004nan 2.2 1357329600000.0 15709.0
4 2013.0 1.0 5.0 12.0 -7.2 0.42600000000000005nan 4.4 1357416000000.0 15710.0
5 2013.0 1.0 6.0 12.0 -1.70000000000000020.6451 nan 4.4 haze 1357502400000.0 15711.0
6 2013.0 1.0 7.0 12.0 -6.10000000000000050.41190000000000004nan 6.1000000000000005 1357588800000.0 15712.0
7 2013.0 1.0 8.0 12.0 -1.70000000000000020.5314 nan 7.2 1357675200000.0 15713.0
8 2013.0 1.0 9.0 12.0 0.6000000000000001 0.56 nan 8.9 haze 1357761600000.0 15714.0
9 2013.0 1.0 10.0 12.0 -6.10000000000000050.3952 nan 6.7 1357848000000.0 15715.0
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Lets now get Days since the epoch... we'll convert year/month/day into Epoch\n", "# time, and then back to Epoch days. Need zero-based month and days, but have\n", "# 1-based.\n", "wthr3[\"msec\"] = h2o.H2OFrame.mktime(year=wthr3[\"Year Local\"], month=wthr3[\"Month Local\"]-1, day=wthr3[\"Day Local\"]-1, hour=wthr3[\"Hour Local\"])\n", "secsPerDay=1000*60*60*24\n", "wthr3[\"Days\"] = (wthr3[\"msec\"]/secsPerDay).floor()\n", "wthr3.describe()\n", "# msec looks sane (numbers like 1.3e12 are in the correct range for msec since\n", "# 1970). Epoch Days matches closely with the epoch day numbers from the\n", "# CitiBike dataset. " ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Lets drop off the extra time columns to make a easy-to-handle dataset.\n", "wthr4 = wthr3.drop(\"Year Local\").drop(\"Month Local\").drop(\"Day Local\").drop(\"Hour Local\").drop(\"msec\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Also, most rain numbers are missing - lets assume those are zero rain days\n", "rain = wthr4[\"Rain (mm)\"]\n", "rain[ rain.isna() ] = 0\n", "wthr4[\"Rain (mm)\"] = rain" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Merge Daily Weather with Bikes-Per-Day\n", "Rows:138795\n", "Cols:10\n", "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Days start station name bikes Month DayOfWeek Dew Point (C) Humidity Fraction Rain (mm) Temperature (C) WC1
type int enum int enum enum real real real real enum
mins 15887.0 1.0 -26.7000000000000030.1723 0.0 -13.9
mean 16099.488166000216 74.98502107424619 5.451762870514826 0.5321604665675 0.0850326740876833115.614519464499507
maxs 16313.0 668.0 23.3 1.0 8.382 34.4
sigma 123.39632568805678 64.73186265524505 11.723905010415397 0.1784104767702021 0.5764961942157182 10.928653577314824
zeros 0 0 1956 0 130793 1567
missing0 0 0 0 0 980 980 0 980 118772
0 15887.0 1 Ave & E 15 St 74.0 7 Mon 21.700000000000003 0.9354 4.572 22.8 rain
1 15887.0 1 Ave & E 18 St 51.0 7 Mon 21.700000000000003 0.9354 4.572 22.8 rain
2 15887.0 1 Ave & E 30 St 66.0 7 Mon 21.700000000000003 0.9354 4.572 22.8 rain
3 15887.0 1 Ave & E 44 St 56.0 7 Mon 21.700000000000003 0.9354 4.572 22.8 rain
4 15887.0 10 Ave & W 28 St 51.0 7 Mon 21.700000000000003 0.9354 4.572 22.8 rain
5 15887.0 11 Ave & W 27 St 65.0 7 Mon 21.700000000000003 0.9354 4.572 22.8 rain
6 15887.0 11 Ave & W 41 St 53.0 7 Mon 21.700000000000003 0.9354 4.572 22.8 rain
7 15887.0 12 Ave & W 40 St 36.0 7 Mon 21.700000000000003 0.9354 4.572 22.8 rain
8 15887.0 2 Ave & E 31 St 96.0 7 Mon 21.700000000000003 0.9354 4.572 22.8 rain
9 15887.0 2 Ave & E 58 St 103.0 7 Mon 21.700000000000003 0.9354 4.572 22.8 rain
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Daysstart station name bikes MonthDayOfWeek Dew Point (C) Humidity Fraction Rain (mm) Temperature (C)WC1
158871 Ave & E 15 St 74 7Mon 21.7 0.9354 4.572 22.8rain
158871 Ave & E 18 St 51 7Mon 21.7 0.9354 4.572 22.8rain
158871 Ave & E 30 St 66 7Mon 21.7 0.9354 4.572 22.8rain
158871 Ave & E 44 St 56 7Mon 21.7 0.9354 4.572 22.8rain
1588710 Ave & W 28 St 51 7Mon 21.7 0.9354 4.572 22.8rain
1588711 Ave & W 27 St 65 7Mon 21.7 0.9354 4.572 22.8rain
1588711 Ave & W 41 St 53 7Mon 21.7 0.9354 4.572 22.8rain
1588712 Ave & W 40 St 36 7Mon 21.7 0.9354 4.572 22.8rain
158872 Ave & E 31 St 96 7Mon 21.7 0.9354 4.572 22.8rain
158872 Ave & E 58 St 103 7Mon 21.7 0.9354 4.572 22.8rain
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# ----------\n", "# 6 - Join the weather data-per-day to the bike-starts-per-day\n", "print(\"Merge Daily Weather with Bikes-Per-Day\")\n", "bpd_with_weather = bpd.merge(wthr4,all_x=True,all_y=False)\n", "bpd_with_weather.describe()\n", "bpd_with_weather.show()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data has 10 columns and 83276 rows, test has 41631 rows, holdout has 13888\n", "gbm Model Build progress: |███████████████████████████████████████████████| 100%\n", "drf Model Build progress: |███████████████████████████████████████████████| 100%\n", "glm Model Build progress: |███████████████████████████████████████████████| 100%\n", "deeplearning Model Build progress: |██████████████████████████████████████| 100%\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Modelmse TRAINmse TESTmse HOLDOUTModel Training Time (s)
GBM120.3078530291.8993667285.058503319.141
DRF117.6329195388.7562018385.695565788.24
GLM862.8161359879.2555091875.88169610.449
DL 238.1676790340.7971229351.040363663.611
" ], "text/plain": [ "Model mse TRAIN mse TEST mse HOLDOUT Model Training Time (s)\n", "------- ----------- ---------- ------------- -------------------------\n", "GBM 120.308 291.899 285.059 19.141\n", "DRF 117.633 388.756 385.696 88.24\n", "GLM 862.816 879.256 875.882 0.449\n", "DL 238.168 340.797 351.04 63.611" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# 7 - Test/Train split again, model build again, this time with weather\n", "split_fit_predict(bpd_with_weather)" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }