{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [], "source": [ "import h2o\n", "import time\n", "from h2o.estimators.glm import H2OGeneralizedLinearEstimator\n", "from h2o.estimators.gbm import H2OGradientBoostingEstimator\n", "from h2o.estimators.random_forest import H2ORandomForestEstimator\n", "from h2o.estimators.deeplearning import H2ODeepLearningEstimator" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Checking whether there is an H2O instance running at http://localhost:54321. connected.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
H2O cluster uptime: 08 secs
H2O cluster version: 3.11.0.99999
H2O cluster version age: 1 minute
H2O cluster name: pasha
H2O cluster total nodes: 1
H2O cluster free memory: 3.556 Gb
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster status: accepting new members, healthy
H2O connection url: http://localhost:54321
H2O connection proxy: None
Python version: 3.5.2 final
" ], "text/plain": [ "-------------------------- ------------------------------\n", "H2O cluster uptime: 08 secs\n", "H2O cluster version: 3.11.0.99999\n", "H2O cluster version age: 1 minute\n", "H2O cluster name: pasha\n", "H2O cluster total nodes: 1\n", "H2O cluster free memory: 3.556 Gb\n", "H2O cluster total cores: 8\n", "H2O cluster allowed cores: 8\n", "H2O cluster status: accepting new members, healthy\n", "H2O connection url: http://localhost:54321\n", "H2O connection proxy:\n", "Python version: 3.5.2 final\n", "-------------------------- ------------------------------" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Explore a typical Data Science workflow with H2O and Python\n", "#\n", "# Goal: assist the manager of CitiBike of NYC to load-balance the bicycles\n", "# across the CitiBike network of stations, by predicting the number of bike\n", "# trips taken from the station every day. Use 10 million rows of historical\n", "# data, and eventually add weather data.\n", "\n", "\n", "# Connect to a cluster\n", "h2o.init()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.\n", "\n", "# Set this to True if you want to fetch the data directly from S3.\n", "# This is useful if your cluster is running in EC2.\n", "data_source_is_s3 = False\n", "\n", "def mylocate(s):\n", " if data_source_is_s3:\n", " return \"s3n://h2o-public-test-data/\" + s\n", " else:\n", " return _locate(s)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Import and Parse bike data\n", "Parse progress: |█████████████████████████████████████████████████████████| 100%\n" ] } ], "source": [ "# Pick either the big or the small demo.\n", "# Big data is 10M rows\n", "small_test = [mylocate(\"bigdata/laptop/citibike-nyc/2013-10.csv\")]\n", "big_test = [mylocate(\"bigdata/laptop/citibike-nyc/2013-07.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2013-08.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2013-09.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2013-10.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2013-11.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2013-12.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-01.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-02.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-03.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-04.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-05.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-06.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-07.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-08.csv\")]\n", "\n", "# ----------\n", "\n", "# 1- Load data - 1 row per bicycle trip. Has columns showing the start and end\n", "# station, trip duration and trip start time and day. 
The larger dataset\n", "# totals about 10 million rows\n", "print(\"Import and Parse bike data\")\n", "data = h2o.import_file(path=small_test)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows:1037712\n", "Cols:16\n", "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
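# --- Editor's note: illustrative sketch, not part of the original demo. ---
# To run the full ~10M-row version instead of the single-month sample, point
# import_file at the big_test list; H2O parses all of the files into one frame:
#     data = h2o.import_file(path=big_test)
# Quick sanity check on whatever was parsed:
print(data.dim)         # [rows, cols] -- about 1.04M x 16 for the small test
print(data.names[:4])   # first few column names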
tripduration starttime stoptime start station id start station name start station latitude start station longitude end station id end station name end station latitude end station longitude bikeid usertype birth year gender Days
type int time time int enum real real int enum real real int enum int int int
mins 60.0 1380585668000.0 1380585883000.0 72.0 40.680342423 -74.01713445 72.0 40.680342423 -74.01713445 14529.0 1899.0 0.0 15979.0
mean 825.6147543827192 1381888516917.714 1381889342532.4746 443.7142126139049 40.73451885864454 -73.99113288482197 443.20742171238254 40.73428478848875 -73.99127029824423 17644.071645119242 1975.7783948601839 1.123755916863252 15993.476745956474
maxs 1259480.0 1383263997000.0 1383393310000.0 3002.0 40.770513 -73.9500479759 3002.0 40.770513 -73.9500479759 20757.0 1997.0 2.0 16009.0
sigma 2000.3732322961862 778871729.1323168 778847387.5037588 354.43432507453724 0.01957340730530415 0.012316123410581171 357.39821705755827 0.019557845811587957 0.012385581196537298 1717.6811213447866 11.131490623834942 0.544380593291009 9.014533519116712
zeros 0 0 0 0 0 0 0 0 0 0 0 97498 0
missing 0 0 0 0 0 0 0 0 0 0 0 0 0 97445 0 0
0 326.0 2013-10-01 00:01:08 2013-10-01 00:06:34 239.0 Willoughby St & Fleet St 40.69196566 -73.9813018 366.0 Clinton Ave & Myrtle Ave 40.693261 -73.968896 16052.0 Subscriber 1982.0 1.0 15979.0
1 729.0 2013-10-01 00:01:21 2013-10-01 00:13:30 322.0 Clinton St & Tillary St 40.696191999999996 -73.991218 398.0 Atlantic Ave & Furman St 40.69165183 -73.99997859999999 19412.0 Customer nan 0.0 15979.0
2 520.0 2013-10-01 00:01:24 2013-10-01 00:10:04 174.0 E 25 St & 1 Ave 40.7381765 -73.97738662 403.0 E 2 St & 2 Ave 40.72502876 -73.99069656 19645.0 Subscriber 1984.0 1.0 15979.0
3 281.0 2013-10-01 00:01:25 2013-10-01 00:06:06 430.0 York St & Jay St 40.7014851 -73.98656928 323.0 Lawrence St & Willoughby St 40.69236178 -73.98631746 16992.0 Subscriber 1985.0 1.0 15979.0
4 196.0 2013-10-01 00:01:27 2013-10-01 00:04:43 403.0 E 2 St & 2 Ave 40.72502876 -73.99069656 401.0 Allen St & Rivington St 40.72019576 -73.98997825000001 15690.0 Subscriber 1986.0 1.0 15979.0
5 1948.0 2013-10-01 00:01:48 2013-10-01 00:34:16 369.0 Washington Pl & 6 Ave 40.73224119 -74.00026394 307.0 Canal St & Rutgers St 40.714274870000004 -73.98990025 19846.0 Subscriber 1977.0 1.0 15979.0
6 1327.0 2013-10-01 00:01:48 2013-10-01 00:23:55 254.0 W 11 St & 6 Ave 40.73532427 -73.99800419 539.0 Metropolitan Ave & Bedford Ave 40.71534825 -73.96024116 14563.0 Subscriber 1986.0 2.0 15979.0
7 1146.0 2013-10-01 00:01:57 2013-10-01 00:21:03 490.0 8 Ave & W 33 St 40.751551 -73.993934 438.0 St Marks Pl & 1 Ave 40.727791260000004 -73.98564945 16793.0 Subscriber 1959.0 1.0 15979.0
8 380.0 2013-10-01 00:01:58 2013-10-01 00:08:18 468.0 Broadway & W 55 St 40.7652654 -73.98192338 385.0 E 55 St & 2 Ave 40.757973220000004 -73.96603308 16600.0 Customer nan 0.0 15979.0
9 682.0 2013-10-01 00:02:05 2013-10-01 00:13:27 300.0 Shevchenko Pl & E 6 St 40.728145 -73.990214 519.0 Pershing Square N 40.75188406 -73.97770164 15204.0 Subscriber 1992.0 1.0 15979.0
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# ----------\n", "\n", "# 2- light data munging: group the bike starts per-day, converting the 10M rows\n", "# of trips to about 140,000 station&day combos - predicting the number of trip\n", "# starts per-station-per-day.\n", "\n", "# Convert start time to: Day since the Epoch\n", "startime = data[\"starttime\"]\n", "secsPerDay = 1000 * 3600 * 24\n", "data[\"Days\"] = (startime.asnumeric() / secsPerDay).floor()\n", "data.describe()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Days start station name bikes
15979 1 Ave & E 15 St 173
15979 1 Ave & E 18 St 118
15979 1 Ave & E 30 St 152
15979 10 Ave & W 28 St 115
15979 11 Ave & W 27 St 210
15979 11 Ave & W 41 St 106
15979 12 Ave & W 40 St 144
15979 2 Ave & E 31 St 206
15979 2 Ave & E 58 St 105
15979 3 Ave & Schermerhorn St 15
" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Rows:10131\n", "Cols:3\n", "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Days start station name bikes
type int enum int
mins 15979.0 1.0
mean 15993.953311617806 102.42937518507551
maxs 16009.0 603.0
sigma 8.950698111468864 74.05933443246006
zeros 0 0
missing 0 0 0
0 15979.0 1 Ave & E 15 St 173.0
1 15979.0 1 Ave & E 18 St 118.0
2 15979.0 1 Ave & E 30 St 152.0
3 15979.0 10 Ave & W 28 St 115.0
4 15979.0 11 Ave & W 27 St 210.0
5 15979.0 11 Ave & W 41 St 106.0
6 15979.0 12 Ave & W 40 St 144.0
7 15979.0 2 Ave & E 31 St 206.0
8 15979.0 2 Ave & E 58 St 105.0
9 15979.0 3 Ave & Schermerhorn St 15.0
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[10131, 3]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Now do a monster Group-By. Count bike starts per-station per-day. Ends up\n", "# with about 340 stations times 400 days (140,000 rows). This is what we want\n", "# to predict.\n", "grouped = data.group_by([\"Days\",\"start station name\"])\n", "bpd = grouped.count().get_frame() # Compute bikes-per-day\n", "bpd.set_name(2,\"bikes\")\n", "bpd.show()\n", "bpd.describe()\n", "bpd.dim" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quantiles of bikes-per-day\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Probs bikesQuantiles
0.01 5
0.1 20
0.25 45
0.333 60
0.5 91
0.667 121
0.75 141
0.9 197
0.99 340.4
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Quantiles: the data is fairly unbalanced; some station/day combos are wildly\n", "# more popular than others.\n", "print(\"Quantiles of bikes-per-day\")\n", "bpd[\"bikes\"].quantile().show()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Bikes-Per-Day\n", "Rows:10131\n", "Cols:5\n", "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Days start station name bikes Month DayOfWeek
type int enum int enum enum
mins 15979.0 1.0
mean 15993.953311617806 102.42937518507551
maxs 16009.0 603.0
sigma 8.950698111468864 74.05933443246006
zeros 0 0
missing 0 0 0 0 0
0 15979.0 1 Ave & E 15 St 173.0 10 Tue
1 15979.0 1 Ave & E 18 St 118.0 10 Tue
2 15979.0 1 Ave & E 30 St 152.0 10 Tue
3 15979.0 10 Ave & W 28 St 115.0 10 Tue
4 15979.0 11 Ave & W 27 St 210.0 10 Tue
5 15979.0 11 Ave & W 41 St 106.0 10 Tue
6 15979.0 12 Ave & W 40 St 144.0 10 Tue
7 15979.0 2 Ave & E 31 St 206.0 10 Tue
8 15979.0 2 Ave & E 58 St 105.0 10 Tue
9 15979.0 3 Ave & Schermerhorn St 15.0 10 Tue
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# A little feature engineering\n", "# Add in month-of-year (seasonality; fewer bike rides in winter than summer)\n", "secs = bpd[\"Days\"]*secsPerDay\n", "bpd[\"Month\"] = secs.month().asfactor()\n", "# Add in day-of-week (work-week; more bike rides on Sunday than Monday)\n", "bpd[\"DayOfWeek\"] = secs.dayOfWeek()\n", "print(\"Bikes-Per-Day\")\n", "bpd.describe()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# ----------\n", "# 3- Fit a model on train; using test as validation\n", "\n", "# Function for doing class test/train/holdout split\n", "def split_fit_predict(data):\n", " global gbm0,drf0,glm0,dl0\n", " # Classic Test/Train split\n", " r = data['Days'].runif() # Random UNIForm numbers, one per row\n", " train = data[ r < 0.6]\n", " test = data[(0.6 <= r) & (r < 0.9)]\n", " hold = data[ 0.9 <= r ]\n", " print(\"Training data has\",train.ncol,\"columns and\",train.nrow,\"rows, test has\",test.nrow,\"rows, holdout has\",hold.nrow)\n", " bike_names_x = data.names\n", " bike_names_x.remove(\"bikes\")\n", " \n", " # Run GBM\n", " s = time.time()\n", " \n", " gbm0 = H2OGradientBoostingEstimator(ntrees=500, # 500 works well\n", " max_depth=6,\n", " learn_rate=0.1)\n", " \n", "\n", " gbm0.train(x =bike_names_x,\n", " y =\"bikes\",\n", " training_frame =train,\n", " validation_frame=test)\n", "\n", " gbm_elapsed = time.time() - s\n", "\n", " # Run DRF\n", " s = time.time()\n", " \n", " drf0 = H2ORandomForestEstimator(ntrees=250, max_depth=30)\n", "\n", " drf0.train(x =bike_names_x,\n", " y =\"bikes\",\n", " training_frame =train,\n", " validation_frame=test)\n", " \n", " drf_elapsed = time.time() - s \n", " \n", " \n", " # Run GLM\n", " if \"WC1\" in bike_names_x: bike_names_x.remove(\"WC1\")\n", " s = time.time()\n", "\n", " glm0 = H2OGeneralizedLinearEstimator(Lambda=[1e-5], family=\"poisson\")\n", " \n", " glm0.train(x =bike_names_x,\n", " y =\"bikes\",\n", " training_frame =train,\n", " validation_frame=test)\n", "\n", " glm_elapsed = time.time() - s\n", " \n", " # Run DL\n", " s = time.time()\n", "\n", " dl0 = H2ODeepLearningEstimator(hidden=[50,50,50,50], epochs=50)\n", " \n", " dl0.train(x =bike_names_x,\n", " y =\"bikes\",\n", " training_frame =train,\n", " validation_frame=test)\n", " \n", " dl_elapsed = time.time() - s\n", " \n", " # ----------\n", " # 4- Score on holdout set & report\n", " train_mse_gbm = gbm0.model_performance(train).mse()\n", " test_mse_gbm = gbm0.model_performance(test ).mse()\n", " hold_mse_gbm = gbm0.model_performance(hold ).mse()\n", "# print \"GBM mse TRAIN=\",train_mse_gbm,\", mse TEST=\",test_mse_gbm,\", mse HOLDOUT=\",hold_mse_gbm\n", " \n", " train_mse_drf = drf0.model_performance(train).mse()\n", " test_mse_drf = drf0.model_performance(test ).mse()\n", " hold_mse_drf = drf0.model_performance(hold ).mse()\n", "# print \"DRF mse TRAIN=\",train_mse_drf,\", mse TEST=\",test_mse_drf,\", mse HOLDOUT=\",hold_mse_drf\n", " \n", " train_mse_glm = glm0.model_performance(train).mse()\n", " test_mse_glm = glm0.model_performance(test ).mse()\n", " hold_mse_glm = glm0.model_performance(hold ).mse()\n", "# print \"GLM mse TRAIN=\",train_mse_glm,\", mse TEST=\",test_mse_glm,\", mse HOLDOUT=\",hold_mse_glm\n", " \n", " train_mse_dl = dl0.model_performance(train).mse()\n", " test_mse_dl = dl0.model_performance(test ).mse()\n", " hold_mse_dl = dl0.model_performance(hold ).mse()\n", "# print \" DL mse TRAIN=\",train_mse_dl,\", mse 
TEST=\",test_mse_dl,\", mse HOLDOUT=\",hold_mse_dl\n", " \n", " # make a pretty HTML table printout of the results\n", "\n", " header = [\"Model\", \"mse TRAIN\", \"mse TEST\", \"mse HOLDOUT\", \"Model Training Time (s)\"]\n", " table = [\n", " [\"GBM\", train_mse_gbm, test_mse_gbm, hold_mse_gbm, round(gbm_elapsed,3)],\n", " [\"DRF\", train_mse_drf, test_mse_drf, hold_mse_drf, round(drf_elapsed,3)],\n", " [\"GLM\", train_mse_glm, test_mse_glm, hold_mse_glm, round(glm_elapsed,3)],\n", " [\"DL \", train_mse_dl, test_mse_dl, hold_mse_dl , round(dl_elapsed,3) ],\n", " ]\n", " h2o.display.H2ODisplay(table,header)\n", " # --------------" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data has 5 columns and 6180 rows, test has 2947 rows, holdout has 1004\n", "gbm Model Build progress: |███████████████████████████████████████████████| 100%\n", "drf Model Build progress: |███████████████████████████████████████████████| 100%\n", "glm Model Build progress: |███████████████████████████████████████████████| 100%\n", "deeplearning Model Build progress: |██████████████████████████████████████| 100%\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Model mse TRAIN mse TEST mse HOLDOUT Model Training Time (s)
GBM 0.8948171 386.7584398 428.7237120 7.759
DRF 526.3541524 921.4867812 916.5091361 8.673
GLM 689.6647078 757.4271445 726.9764530 0.522
DL 307.5692122 459.6025357 509.2822086 8.619
" ], "text/plain": [ "Model mse TRAIN mse TEST mse HOLDOUT Model Training Time (s)\n", "------- ----------- ---------- ------------- -------------------------\n", "GBM 0.894817 386.758 428.724 7.759\n", "DRF 526.354 921.487 916.509 8.673\n", "GLM 689.665 757.427 726.976 0.522\n", "DL 307.569 459.603 509.282 8.619" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Split the data (into test & train), fit some models and predict on the holdout data\n", "split_fit_predict(bpd)\n", "# Here we see an r^2 of 0.91 for GBM, and 0.71 for GLM. This means given just\n", "# the station, the month, and the day-of-week we can predict 90% of the\n", "# variance of the bike-trip-starts." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Parse progress: |█████████████████████████████████████████████████████████| 100%\n", "Rows:17520\n", "Cols:50\n", "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Year Local Month Local Day Local Hour Local Year UTC Month UTC Day UTC Hour UTC Cavok Reported Cloud Ceiling (m) Cloud Cover Fraction Cloud Cover Fraction 1 Cloud Cover Fraction 2 Cloud Cover Fraction 3 Cloud Cover Fraction 4 Cloud Cover Fraction 5 Cloud Cover Fraction 6 Cloud Height (m) 1 Cloud Height (m) 2 Cloud Height (m) 3 Cloud Height (m) 4 Cloud Height (m) 5 Cloud Height (m) 6 Dew Point (C) Humidity Fraction Precipitation One Hour (mm) Pressure Altimeter (mbar) Pressure Sea Level (mbar) Pressure Station (mbar) Snow Depth (cm) Temperature (C) Visibility (km) Weather Code 1 Weather Code 1/ Description Weather Code 2 Weather Code 2/ Description Weather Code 3 Weather Code 3/ Description Weather Code 4 Weather Code 4/ Description Weather Code 5 Weather Code 5/ Description Weather Code 6 Weather Code 6/ Description Weather Code Most Severe / Icon Code Weather Code Most Severe Weather Code Most Severe / Description Wind Direction (degrees) Wind Gust (m/s) Wind Speed (m/s)
type int int int int int int int int int real real real real real int int int real real real int int int real real real real int int int real real int enum int enum int enum int enum int enum int enum int int enum int real real
mins 2013.0 1.0 1.0 0.0 2013.0 1.0 1.0 0.0 0.0 61.0 0.0 0.0 0.25 0.5 NaN NaN NaN 60.96 213.36 365.76 NaN NaN NaN -26.700000000000003 0.12510000000000002 0.0 983.2949000000001 NaN NaN NaN -15.600000000000001 0.001 1.0 1.0 1.0 1.0 1.0 3.0 0.0 1.0 10.0 7.2 0.0
mean 2013.5 6.5260273972602745 15.72054794520548 11.500000000000004 2013.5005707762557 6.525114155251141 15.72134703196347 11.500114155251142 0.0 1306.3119584569736 0.4167424905220181 0.3612073490813649 0.8724453840732911 0.9630456852791879 0.0 0.0 0.0 1293.9822681953192 1643.7390016566796 2084.8938637563456 0.0 0.0 0.0 4.313046467655992 0.5967363891594567 1.3799301075268817 1017.8258144055944 0.0 0.0 0.0 12.578909070073914 14.391442968202009 4.84251968503937 3.6586768935762226 2.8466076696165192 2.0114942528735633 4.125 3.0 1.3784817351598173 4.84251968503937 194.69525681985743 9.422169480726348 2.4103288784874057
maxs 2014.0 12.0 31.0 23.0 2015.0 12.0 31.0 23.0 0.0 3657.6000000000004 1.0 1.0 1.0 1.0 NaN NaN NaN 3657.5999 3657.5999 3657.5999 NaN NaN NaN 24.400000000000002 1.0 26.924 1042.2113 NaN NaN NaN 36.1 16.0934 60.0 60.0 36.0 27.0 27.0 3.0 16.0 60.0 360.0 20.580000000000002 10.8
sigma 0.500014270017262 3.447949723847773 8.796498048523272 6.922384111875021 0.50058441171579 3.447824054577647 8.795614888684717 6.922301652025526 0.0 995.3398569657211 0.4627208309925301 0.42770569708047684 0.19715569036704708 0.08610155981044185 -0.0 -0.0 -0.0 962.7430958537232 916.7386134899587 887.2158475113932 -0.0 -0.0 -0.0 10.973128209713666 0.18579201186573496 2.5621512917896463 7.464516971789659 -0.0 -0.0 -0.0 10.039673953091574 3.6989362303340494 5.704865769828319 6.133862539123368 5.805532863642112 3.1234084426128437 6.15223536610881 0.0 4.073860627017756 5.704865769828319 106.3500000314393 1.8151187111524154 1.614697905241178
zeros 0 0 0 730 0 0 0 730 17455 0 8758 8758 0 0 0 0 0 0 0 0 0 0 0 268 0 501 0 0 0 0 269 0 0 0 0 0 0 0 14980 0 0 0 2768
missing 0 0 0 0 0 0 0 0 65 10780 375 375 14682 16535 17520 17520 17520 9103 14683 16535 17520 17520 17520 67 67 15660 360 17520 17520 17520 67 412 14980 14980 16477 16477 17181 17181 17433 17433 17504 17504 17518 17518 0 14980 14980 9382 14381 1283
0 2013.0 1.0 1.0 0.0 2013.0 1.0 1.0 5.0 0.0 2895.6000000000004 1.0 0.9 1.0 nan nan nan nan 2895.5999 3352.8 nan nan nan nan -5.0 0.5447000000000001 nan 1013.0917000000001 nan nan nan 3.3000000000000003 16.0934 nan nan nan nan nan nan 0.0 nan nan nan 2.57
1 2013.0 1.0 1.0 1.0 2013.0 1.0 1.0 6.0 0.0 3048.0 1.0 1.0 nan nan nan nan nan 3048.0 nan nan nan nan nan -4.4 0.5463 nan 1012.0759 nan nan nan 3.9000000000000004 16.0934 nan nan nan nan nan nan 0.0 nan 260.0 9.77 4.63
2 2013.0 1.0 1.0 2.0 2013.0 1.0 1.0 7.0 0.0 1828.8000000000002 1.0 1.0 nan nan nan nan nan 1828.7999 nan nan nan nan nan -3.30000000000000030.619 nan 1012.4145000000001 nan nan nan 3.3000000000000003 16.0934 nan nan nan nan nan nan 0.0 nan nan 7.72 1.54
3 2013.0 1.0 1.0 3.0 2013.0 1.0 1.0 8.0 0.0 1463.0 1.0 1.0 nan nan nan nan nan 1463.04 nan nan nan nan nan -2.80000000000000030.6159 nan 1012.4145000000001 nan nan nan 3.9000000000000004 16.0934 nan nan nan nan nan nan 0.0 nan nan nan 3.09
4 2013.0 1.0 1.0 4.0 2013.0 1.0 1.0 9.0 0.0 1402.1000000000001 1.0 1.0 nan nan nan nan nan 1402.0800000000002 nan nan nan nan nan -2.80000000000000030.6159 nan 1012.7531 nan nan nan 3.9000000000000004 16.0934 nan nan nan nan nan nan 0.0 nan 260.0 nan 4.12
5 2013.0 1.0 1.0 5.0 2013.0 1.0 1.0 10.0 0.0 1524.0 1.0 1.0 nan nan nan nan nan 1524.0 nan nan nan nan nan -2.80000000000000030.6159 nan 1012.4145000000001 nan nan nan 3.9000000000000004 16.0934 nan nan nan nan nan nan 0.0 nan nan nan 3.09
6 2013.0 1.0 1.0 6.0 2013.0 1.0 1.0 11.0 0.0 1524.0 1.0 1.0 nan nan nan nan nan 1524.0 nan nan nan nan nan -3.30000000000000030.5934 nan 1012.0759 nan nan nan 3.9000000000000004 16.0934 nan nan nan nan nan nan 0.0 nan nan 9.26 3.09
7 2013.0 1.0 1.0 7.0 2013.0 1.0 1.0 12.0 0.0 1524.0 1.0 1.0 nan nan nan nan nan 1524.0 nan nan nan nan nan -3.30000000000000030.5934 nan 1012.4145000000001 nan nan nan 3.9000000000000004 16.0934 nan nan nan nan nan nan 0.0 nan 260.0 9.26 4.63
8 2013.0 1.0 1.0 8.0 2013.0 1.0 1.0 13.0 0.0 1524.0 1.0 1.0 nan nan nan nan nan 1524.0 nan nan nan nan nan -2.80000000000000030.6425000000000001 nan 1012.4145000000001 nan nan nan 3.3000000000000003 16.0934 nan nan nan nan nan nan 0.0 nan 260.0 nan 3.09
9 2013.0 1.0 1.0 9.0 2013.0 1.0 1.0 14.0 0.0 1524.0 1.0 0.9 1.0 nan nan nan nan 1524.0 3657.5999 nan nan nan nan -2.80000000000000030.6159 nan 1012.4145000000001 nan nan nan 3.9000000000000004 16.0934 nan nan nan nan nan nan 0.0 nan nan 9.26 3.09
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# ----------\n", "# 5- Now lets add some weather\n", "# Load weather data\n", "wthr1 = h2o.import_file(path=[mylocate(\"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv\")])\n", "# Peek at the data\n", "wthr1.describe()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows:17520\n", "Cols:9\n", "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Year Local Month Local Day Local Hour Local Dew Point (C) Humidity Fraction Rain (mm) Temperature (C) WC1
type int int int int real real real real enum
mins 2013.0 1.0 1.0 0.0 -26.700000000000003 0.12510000000000002 0.0 -15.600000000000001
mean 2013.5 6.5260273972602745 15.72054794520548 11.500000000000004 4.313046467655992 0.5967363891594567 1.3799301075268817 12.578909070073914
maxs 2014.0 12.0 31.0 23.0 24.400000000000002 1.0 26.924 36.1
sigma 0.500014270017262 3.447949723847773 8.796498048523272 6.922384111875021 10.973128209713666 0.18579201186573496 2.5621512917896463 10.039673953091574
zeros 0 0 0 730 268 0 501 269
missing 0 0 0 0 67 67 15660 67 14980
0 2013.0 1.0 1.0 0.0 -5.0 0.5447000000000001 nan 3.3000000000000003
1 2013.0 1.0 1.0 1.0 -4.4 0.5463 nan 3.9000000000000004
2 2013.0 1.0 1.0 2.0 -3.3000000000000003 0.619 nan 3.3000000000000003
3 2013.0 1.0 1.0 3.0 -2.8000000000000003 0.6159 nan 3.9000000000000004
4 2013.0 1.0 1.0 4.0 -2.8000000000000003 0.6159 nan 3.9000000000000004
5 2013.0 1.0 1.0 5.0 -2.8000000000000003 0.6159 nan 3.9000000000000004
6 2013.0 1.0 1.0 6.0 -3.3000000000000003 0.5934 nan 3.9000000000000004
7 2013.0 1.0 1.0 7.0 -3.3000000000000003 0.5934 nan 3.9000000000000004
8 2013.0 1.0 1.0 8.0 -2.8000000000000003 0.6425000000000001 nan 3.3000000000000003
9 2013.0 1.0 1.0 9.0 -2.8000000000000003 0.6159 nan 3.9000000000000004
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Lots of columns in there! Lets plan on converting to time-since-epoch to do\n", "# a 'join' with the bike data, plus gather weather info that might affect\n", "# cyclists - rain, snow, temperature. Alas, drop the \"snow\" column since it's\n", "# all NA's. Also add in dew point and humidity just in case. Slice out just\n", "# the columns of interest and drop the rest.\n", "wthr2 = wthr1[[\"Year Local\",\"Month Local\",\"Day Local\",\"Hour Local\",\"Dew Point (C)\",\"Humidity Fraction\",\"Precipitation One Hour (mm)\",\"Temperature (C)\",\"Weather Code 1/ Description\"]]\n", "\n", "wthr2.set_name(wthr2.names.index(\"Precipitation One Hour (mm)\"), \"Rain (mm)\")\n", "wthr2.set_name(wthr2.names.index(\"Weather Code 1/ Description\"), \"WC1\")\n", "wthr2.describe()\n", "# Much better! " ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Filter down to the weather at Noon\n", "wthr3 = wthr2[ wthr2[\"Hour Local\"]==12 ]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows:730\n", "Cols:11\n", "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Year Local Month Local Day Local Hour Local Dew Point (C) Humidity Fraction Rain (mm) Temperature (C) WC1 msec Days
type int int int int real real real real enum int int
mins 2013.0 1.0 1.0 12.0 -26.700000000000003 0.1723 0.0 -13.9 1357070400000.0 15706.0
mean 2013.5 6.526027397260274 15.72054794520548 12.0 4.230123796423659 0.539728198074278 1.5312571428571429 14.068775790921595 1388560852602.7397 16070.5
maxs 2014.0 12.0 31.0 12.0 23.3 1.0 12.446 34.4 1420056000000.0 16435.0
sigma 0.5003428180039172 3.450215293068149 8.802278027009615 0.0 11.106296472475226 0.17994502792324327 2.3606424861490587 10.398985514891212 18219740080.410755 210.87713642466474
zeros 0 0 0 0 14 0 15 7 0 0
missing 0 0 0 0 3 3 660 3 620 0 0
0 2013.0 1.0 1.0 12.0 -3.3000000000000003 0.5934 nan 3.9000000000000004 1357070400000.0 15706.0
1 2013.0 1.0 2.0 12.0 -11.700000000000001 0.4806 nan -2.2 1357156800000.0 15707.0
2 2013.0 1.0 3.0 12.0 -10.600000000000001 0.5248 nan -2.2 1357243200000.0 15708.0
3 2013.0 1.0 4.0 12.0 -7.2 0.49760000000000004 nan 2.2 1357329600000.0 15709.0
4 2013.0 1.0 5.0 12.0 -7.2 0.42600000000000005 nan 4.4 1357416000000.0 15710.0
5 2013.0 1.0 6.0 12.0 -1.7000000000000002 0.6451 nan 4.4 haze 1357502400000.0 15711.0
6 2013.0 1.0 7.0 12.0 -6.1000000000000005 0.41190000000000004 nan 6.1000000000000005 1357588800000.0 15712.0
7 2013.0 1.0 8.0 12.0 -1.7000000000000002 0.5314 nan 7.2 1357675200000.0 15713.0
8 2013.0 1.0 9.0 12.0 0.6000000000000001 0.56 nan 8.9 haze 1357761600000.0 15714.0
9 2013.0 1.0 10.0 12.0 -6.1000000000000005 0.3952 nan 6.7 1357848000000.0 15715.0
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Lets now get Days since the epoch... we'll convert year/month/day into Epoch\n", "# time, and then back to Epoch days. Need zero-based month and days, but have\n", "# 1-based.\n", "wthr3[\"msec\"] = h2o.H2OFrame.mktime(year=wthr3[\"Year Local\"], month=wthr3[\"Month Local\"]-1, day=wthr3[\"Day Local\"]-1, hour=wthr3[\"Hour Local\"])\n", "secsPerDay=1000*60*60*24\n", "wthr3[\"Days\"] = (wthr3[\"msec\"]/secsPerDay).floor()\n", "wthr3.describe()\n", "# msec looks sane (numbers like 1.3e12 are in the correct range for msec since\n", "# 1970). Epoch Days matches closely with the epoch day numbers from the\n", "# CitiBike dataset. " ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Lets drop off the extra time columns to make a easy-to-handle dataset.\n", "wthr4 = wthr3.drop(\"Year Local\").drop(\"Month Local\").drop(\"Day Local\").drop(\"Hour Local\").drop(\"msec\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Also, most rain numbers are missing - lets assume those are zero rain days\n", "rain = wthr4[\"Rain (mm)\"]\n", "rain[ rain.isna() ] = 0\n", "wthr4[\"Rain (mm)\"] = rain" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Merge Daily Weather with Bikes-Per-Day\n", "Rows:10131\n", "Cols:10\n", "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Days start station name bikes Month DayOfWeek Dew Point (C) Humidity Fraction Rain (mm) Temperature (C) WC1
type int enum int enum enum real real real real enum
mins 15979.0 1.0 -2.2 0.34850000000000003 0.0 9.4
mean 15993.953311617806 102.42937518507551 7.60732405488106 0.5564958839206396 0.008198400947586611 16.937094067712962
maxs 16009.0 603.0 19.4000000000000020.8718 0.254 26.1
sigma 8.950698111468864 74.05933443246006 6.516386487040385 0.14811201086649933 0.04489297266255909 4.362687300129602
zeros 0 0 0 0 9804 0
missing 0 0 0 0 0 0 0 0 0 8816
0 15979.0 1 Ave & E 15 St 173.0 10 Tue 10.600000000000001 0.4315 0.0 23.900000000000002
1 15979.0 1 Ave & E 18 St 118.0 10 Tue 10.600000000000001 0.4315 0.0 23.900000000000002
2 15979.0 1 Ave & E 30 St 152.0 10 Tue 10.600000000000001 0.4315 0.0 23.900000000000002
3 15979.0 10 Ave & W 28 St 115.0 10 Tue 10.600000000000001 0.4315 0.0 23.900000000000002
4 15979.0 11 Ave & W 27 St 210.0 10 Tue 10.600000000000001 0.4315 0.0 23.900000000000002
5 15979.0 11 Ave & W 41 St 106.0 10 Tue 10.600000000000001 0.4315 0.0 23.900000000000002
6 15979.0 12 Ave & W 40 St 144.0 10 Tue 10.600000000000001 0.4315 0.0 23.900000000000002
7 15979.0 2 Ave & E 31 St 206.0 10 Tue 10.600000000000001 0.4315 0.0 23.900000000000002
8 15979.0 2 Ave & E 58 St 105.0 10 Tue 10.600000000000001 0.4315 0.0 23.900000000000002
9 15979.0 3 Ave & Schermerhorn St 15.0 10 Tue 10.600000000000001 0.4315 0.0 23.900000000000002
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Days start station name bikes Month DayOfWeek Dew Point (C) Humidity Fraction Rain (mm) Temperature (C) WC1
15979 1 Ave & E 15 St 173 10 Tue 10.6 0.4315 0 23.9
15979 1 Ave & E 18 St 118 10 Tue 10.6 0.4315 0 23.9
15979 1 Ave & E 30 St 152 10 Tue 10.6 0.4315 0 23.9
15979 10 Ave & W 28 St 115 10 Tue 10.6 0.4315 0 23.9
15979 11 Ave & W 27 St 210 10 Tue 10.6 0.4315 0 23.9
15979 11 Ave & W 41 St 106 10 Tue 10.6 0.4315 0 23.9
15979 12 Ave & W 40 St 144 10 Tue 10.6 0.4315 0 23.9
15979 2 Ave & E 31 St 206 10 Tue 10.6 0.4315 0 23.9
15979 2 Ave & E 58 St 105 10 Tue 10.6 0.4315 0 23.9
15979 3 Ave & Schermerhorn St 15 10 Tue 10.6 0.4315 0 23.9
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# ----------\n", "# 6 - Join the weather data-per-day to the bike-starts-per-day\n", "print(\"Merge Daily Weather with Bikes-Per-Day\")\n", "bpd_with_weather = bpd.merge(wthr4,all_x=True,all_y=False)\n", "bpd_with_weather.describe()\n", "bpd_with_weather.show()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data has 10 columns and 6066 rows, test has 3044 rows, holdout has 1021\n", "gbm Model Build progress: |███████████████████████████████████████████████| 100%\n", "drf Model Build progress: |███████████████████████████████████████████████| 100%\n", "glm Model Build progress: |███████████████████████████████████████████████| 100%\n", "deeplearning Model Build progress: |██████████████████████████████████████| 100%\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Model mse TRAIN mse TEST mse HOLDOUT Model Training Time (s)
GBM 0.2159977 393.0248269 404.2520310 8.679
DRF 804.2152039 1703.1540562 1782.0854925 6.573
GLM 620.8814844 735.9622856 789.7891737 0.241
DL 213.8582644 454.7871732 476.5995571 7.518
" ], "text/plain": [ "Model mse TRAIN mse TEST mse HOLDOUT Model Training Time (s)\n", "------- ----------- ---------- ------------- -------------------------\n", "GBM 0.215998 393.025 404.252 8.679\n", "DRF 804.215 1703.15 1782.09 6.573\n", "GLM 620.881 735.962 789.789 0.241\n", "DL 213.858 454.787 476.6 7.518" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# 7 - Test/Train split again, model build again, this time with weather\n", "split_fit_predict(bpd_with_weather)" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }