{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import h2o\n",
"import time\n",
"from h2o.estimators.glm import H2OGeneralizedLinearEstimator\n",
"from h2o.estimators.gbm import H2OGradientBoostingEstimator\n",
"from h2o.estimators.random_forest import H2ORandomForestEstimator\n",
"from h2o.estimators.deeplearning import H2ODeepLearningEstimator"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checking whether there is an H2O instance running at http://localhost:54321. connected.\n"
]
},
{
"data": {
"text/html": [
"
H2O cluster uptime: | \n",
"2 mins 11 secs |
\n",
"H2O cluster version: | \n",
"3.11.0.99999 |
\n",
"H2O cluster version age: | \n",
"3 minutes |
\n",
"H2O cluster name: | \n",
"pasha |
\n",
"H2O cluster total nodes: | \n",
"1 |
\n",
"H2O cluster free memory: | \n",
"3.399 Gb |
\n",
"H2O cluster total cores: | \n",
"8 |
\n",
"H2O cluster allowed cores: | \n",
"8 |
\n",
"H2O cluster status: | \n",
"locked, healthy |
\n",
"H2O connection url: | \n",
"http://localhost:54321 |
\n",
"H2O connection proxy: | \n",
"None |
\n",
"Python version: | \n",
"3.5.2 final |
"
],
"text/plain": [
"-------------------------- ----------------------\n",
"H2O cluster uptime: 2 mins 11 secs\n",
"H2O cluster version: 3.11.0.99999\n",
"H2O cluster version age: 3 minutes\n",
"H2O cluster name: pasha\n",
"H2O cluster total nodes: 1\n",
"H2O cluster free memory: 3.399 Gb\n",
"H2O cluster total cores: 8\n",
"H2O cluster allowed cores: 8\n",
"H2O cluster status: locked, healthy\n",
"H2O connection url: http://localhost:54321\n",
"H2O connection proxy:\n",
"Python version: 3.5.2 final\n",
"-------------------------- ----------------------"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Explore a typical Data Science workflow with H2O and Python\n",
"#\n",
"# Goal: assist the manager of CitiBike of NYC to load-balance the bicycles\n",
"# across the CitiBike network of stations, by predicting the number of bike\n",
"# trips taken from the station every day. Use 10 million rows of historical\n",
"# data, and eventually add weather data.\n",
"\n",
"\n",
"# Connect to a cluster\n",
"h2o.init()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.\n",
"\n",
"# Set this to True if you want to fetch the data directly from S3.\n",
"# This is useful if your cluster is running in EC2.\n",
"data_source_is_s3 = False\n",
"\n",
"def mylocate(s):\n",
"    \"\"\"Resolve the relative data path `s` to a location that can be imported.\n",
"\n",
"    When `data_source_is_s3` is true, `s` is appended to the public S3\n",
"    test-data bucket URL; otherwise it is handed to h2o's private `_locate`\n",
"    helper, which searches the h2o git project directory.\n",
"    \"\"\"\n",
"    if not data_source_is_s3:\n",
"        return _locate(s)\n",
"    return \"s3n://h2o-public-test-data/\" + s"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Import and Parse bike data\n",
"Parse progress: |█████████████████████████████████████████████████████████| 100%\n"
]
}
],
"source": [
"# Pick either the big or the small demo.\n",
"# The small demo is a single month; the big demo spans 14 monthly files\n",
"# (about 10M rows in total).\n",
"small_test = [mylocate(\"bigdata/laptop/citibike-nyc/2013-10.csv\")]\n",
"big_test = list(map(mylocate, [\n",
"    \"bigdata/laptop/citibike-nyc/2013-07.csv\",\n",
"    \"bigdata/laptop/citibike-nyc/2013-08.csv\",\n",
"    \"bigdata/laptop/citibike-nyc/2013-09.csv\",\n",
"    \"bigdata/laptop/citibike-nyc/2013-10.csv\",\n",
"    \"bigdata/laptop/citibike-nyc/2013-11.csv\",\n",
"    \"bigdata/laptop/citibike-nyc/2013-12.csv\",\n",
"    \"bigdata/laptop/citibike-nyc/2014-01.csv\",\n",
"    \"bigdata/laptop/citibike-nyc/2014-02.csv\",\n",
"    \"bigdata/laptop/citibike-nyc/2014-03.csv\",\n",
"    \"bigdata/laptop/citibike-nyc/2014-04.csv\",\n",
"    \"bigdata/laptop/citibike-nyc/2014-05.csv\",\n",
"    \"bigdata/laptop/citibike-nyc/2014-06.csv\",\n",
"    \"bigdata/laptop/citibike-nyc/2014-07.csv\",\n",
"    \"bigdata/laptop/citibike-nyc/2014-08.csv\",\n",
"]))\n",
"\n",
"# ----------\n",
"\n",
"# 1- Load data - one row per bicycle trip, with columns for the start and end\n",
"#    station, the trip duration and the trip start time and day. The larger\n",
"#    dataset totals about 10 million rows.\n",
"print(\"Import and Parse bike data\")\n",
"data = h2o.import_file(path=big_test)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rows:10407546\n",
"Cols:16\n",
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
" | tripduration | starttime | stoptime | start station id | start station name | start station latitude | start station longitude | end station id | end station name | end station latitude | end station longitude | bikeid | usertype | birth year | gender | Days |
\n",
"\n",
"\n",
"type | int | time | time | int | enum | real | real | int | enum | real | real | int | enum | int | int | int |
\n",
"mins | 60.0 | 1372636800000.0 | 1372637042000.0 | 72.0 | | 40.680342423 | -74.01713445 | 72.0 | | 40.680342423 | -74.01713445 | 14529.0 | | 1899.0 | 0.0 | 15887.0 |
\n",
"mean | 868.9687260570365 | 1390974078989.41 | 1390974947925.281 | 444.85883540654095 | | 40.734381982315185 | -73.99105701820217 | 445.2597855440662 | | 40.73408688953741 | -73.99117077985979 | 17895.66183584484 | | 1975.7989239404546 | 1.0841465413652758 | 16098.629260922817 |
\n",
"maxs | 6250750.0 | 1409529587000.0 | 1409538405000.0 | 3002.0 | | 40.771522000000004 | -73.9500479759 | 3002.0 | | 40.771522000000004 | -73.9500479759 | 21689.0 | | 1998.0 | 2.0 | 16313.0 |
\n",
"sigma | 2985.105405320145 | 11806736501.937712 | 11806714056.539324 | 355.7559897645294 | | 0.01971005087361252 | 0.012345332018503775 | 360.0703808439832 | | 0.019730957863268683 | 0.012431186159808448 | 1938.8051788415307 | | 11.132784904986751 | 0.5630197777940005 | 136.6534484381553 |
\n",
"zeros | 0 | 0 | 0 | 0 | | 0 | 0 | 0 | | 0 | 0 | 0 | | 0 | 1248517 | 0 |
\n",
"missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1247644 | 0 | 0 |
\n",
"0 | 634.0 | 2013-07-01 00:00:00 | 2013-07-01 00:10:34 | 164.0 | E 47 St & 2 Ave | 40.75323098 | -73.97032517 | 504.0 | 1 Ave & E 15 St | 40.732218530000004 | -73.98165557 | 16950.0 | Customer | nan | 0.0 | 15887.0 |
\n",
"1 | 1547.0 | 2013-07-01 00:00:02 | 2013-07-01 00:25:49 | 388.0 | W 26 St & 10 Ave | 40.749717753000006 | -74.002950346 | 459.0 | W 20 St & 11 Ave | 40.746745 | -74.007756 | 19816.0 | Customer | nan | 0.0 | 15887.0 |
\n",
"2 | 178.0 | 2013-07-01 00:01:04 | 2013-07-01 00:04:02 | 293.0 | Lafayette St & E 8 St | 40.73028666 | -73.9907647 | 237.0 | E 11 St & 2 Ave | 40.730473090000004 | -73.98672378 | 14548.0 | Subscriber | 1980.0 | 2.0 | 15887.0 |
\n",
"3 | 1580.0 | 2013-07-01 00:01:06 | 2013-07-01 00:27:26 | 531.0 | Forsyth St & Broome St | 40.71893904 | -73.99266288 | 499.0 | Broadway & W 60 St | 40.76915505 | -73.98191841 | 16063.0 | Customer | nan | 0.0 | 15887.0 |
\n",
"4 | 757.0 | 2013-07-01 00:01:10 | 2013-07-01 00:13:47 | 382.0 | University Pl & E 14 St | 40.73492695 | -73.99200509 | 410.0 | Suffolk St & Stanton St | 40.72066442 | -73.98517977 | 19213.0 | Subscriber | 1986.0 | 1.0 | 15887.0 |
\n",
"5 | 861.0 | 2013-07-01 00:01:23 | 2013-07-01 00:15:44 | 511.0 | E 14 St & Avenue B | 40.72938685 | -73.97772429 | 454.0 | E 51 St & 1 Ave | 40.75455731 | -73.96592976000001 | 16223.0 | Subscriber | 1988.0 | 1.0 | 15887.0 |
\n",
"6 | 550.0 | 2013-07-01 00:01:59 | 2013-07-01 00:11:09 | 293.0 | Lafayette St & E 8 St | 40.73028666 | -73.9907647 | 394.0 | E 9 St & Avenue C | 40.72521311 | -73.97768752 | 16746.0 | Customer | nan | 0.0 | 15887.0 |
\n",
"7 | 288.0 | 2013-07-01 00:02:16 | 2013-07-01 00:07:04 | 224.0 | Spruce St & Nassau St | 40.71146364 | -74.00552427 | 376.0 | John St & William St | 40.70862144 | -74.00722156 | 16062.0 | Subscriber | 1985.0 | 2.0 | 15887.0 |
\n",
"8 | 766.0 | 2013-07-01 00:02:16 | 2013-07-01 00:15:02 | 432.0 | E 7 St & Avenue A | 40.72621788 | -73.98379855 | 336.0 | Sullivan St & Washington Sq | 40.730477470000004 | -73.99906065 | 17963.0 | Subscriber | 1980.0 | 2.0 | 15887.0 |
\n",
"9 | 773.0 | 2013-07-01 00:02:23 | 2013-07-01 00:15:16 | 173.0 | Broadway & W 49 St | 40.76064679 | -73.98442659 | 479.0 | 9 Ave & W 45 St | 40.760192520000004 | -73.9912551 | 19365.0 | Subscriber | 1989.0 | 1.0 | 15887.0 |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# ----------\n",
"\n",
"# 2- Light data munging: tag every trip with the day it started on, so that the\n",
"#    10M trip rows can be grouped into about 140,000 station&day combos - the\n",
"#    quantity to predict is the number of trip starts per-station-per-day.\n",
"\n",
"# NOTE: despite its name, secsPerDay is the number of *milliseconds* in a day.\n",
"secsPerDay = 24 * 60 * 60 * 1000\n",
"\n",
"# Convert start time to: Day since the Epoch\n",
"startime = data[\"starttime\"]\n",
"data[\"Days\"] = (startime.asnumeric() / secsPerDay).floor()\n",
"data.describe()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
" Days | start station name | bikes |
\n",
"\n",
"\n",
" 15887 | 1 Ave & E 15 St | 74 |
\n",
" 15887 | 1 Ave & E 18 St | 51 |
\n",
" 15887 | 1 Ave & E 30 St | 66 |
\n",
" 15887 | 1 Ave & E 44 St | 56 |
\n",
" 15887 | 10 Ave & W 28 St | 51 |
\n",
" 15887 | 11 Ave & W 27 St | 65 |
\n",
" 15887 | 11 Ave & W 41 St | 53 |
\n",
" 15887 | 12 Ave & W 40 St | 36 |
\n",
" 15887 | 2 Ave & E 31 St | 96 |
\n",
" 15887 | 2 Ave & E 58 St | 103 |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rows:138795\n",
"Cols:3\n",
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
" | Days | start station name | bikes |
\n",
"\n",
"\n",
"type | int | enum | int |
\n",
"mins | 15887.0 | | 1.0 |
\n",
"mean | 16099.488166000216 | | 74.98502107424619 |
\n",
"maxs | 16313.0 | | 668.0 |
\n",
"sigma | 123.39632568805678 | | 64.73186265524505 |
\n",
"zeros | 0 | | 0 |
\n",
"missing | 0 | 0 | 0 |
\n",
"0 | 15887.0 | 1 Ave & E 15 St | 74.0 |
\n",
"1 | 15887.0 | 1 Ave & E 18 St | 51.0 |
\n",
"2 | 15887.0 | 1 Ave & E 30 St | 66.0 |
\n",
"3 | 15887.0 | 1 Ave & E 44 St | 56.0 |
\n",
"4 | 15887.0 | 10 Ave & W 28 St | 51.0 |
\n",
"5 | 15887.0 | 11 Ave & W 27 St | 65.0 |
\n",
"6 | 15887.0 | 11 Ave & W 41 St | 53.0 |
\n",
"7 | 15887.0 | 12 Ave & W 40 St | 36.0 |
\n",
"8 | 15887.0 | 2 Ave & E 31 St | 96.0 |
\n",
"9 | 15887.0 | 2 Ave & E 58 St | 103.0 |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[138795, 3]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Now do a monster Group-By. Count bike starts per-station per-day. Ends up\n",
"# with about 340 stations times 400 days (140,000 rows). This is what we want\n",
"# to predict.\n",
"grouped = data.group_by([\"Days\",\"start station name\"])\n",
"bpd = grouped.count().get_frame() # Compute bikes-per-day\n",
"bpd.set_name(2,\"bikes\")\n",
"bpd.show()\n",
"bpd.describe()\n",
"bpd.dim"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quantiles of bikes-per-day\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
" Probs | bikesQuantiles |
\n",
"\n",
"\n",
" 0.01 | 2 |
\n",
" 0.1 | 11 |
\n",
" 0.25 | 26 |
\n",
" 0.333 | 35 |
\n",
" 0.5 | 59 |
\n",
" 0.667 | 89 |
\n",
" 0.75 | 107 |
\n",
" 0.9 | 158 |
\n",
" 0.99 | 293 |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Quantiles: the data is fairly unbalanced; some station/day combos are wildly\n",
"# more popular than others.\n",
"print(\"Quantiles of bikes-per-day\")\n",
"bpd[\"bikes\"].quantile().show()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Bikes-Per-Day\n",
"Rows:138795\n",
"Cols:5\n",
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
" | Days | start station name | bikes | Month | DayOfWeek |
\n",
"\n",
"\n",
"type | int | enum | int | enum | enum |
\n",
"mins | 15887.0 | | 1.0 | | |
\n",
"mean | 16099.488166000216 | | 74.98502107424619 | | |
\n",
"maxs | 16313.0 | | 668.0 | | |
\n",
"sigma | 123.39632568805678 | | 64.73186265524505 | | |
\n",
"zeros | 0 | | 0 | | |
\n",
"missing | 0 | 0 | 0 | 0 | 0 |
\n",
"0 | 15887.0 | 1 Ave & E 15 St | 74.0 | 7 | Mon |
\n",
"1 | 15887.0 | 1 Ave & E 18 St | 51.0 | 7 | Mon |
\n",
"2 | 15887.0 | 1 Ave & E 30 St | 66.0 | 7 | Mon |
\n",
"3 | 15887.0 | 1 Ave & E 44 St | 56.0 | 7 | Mon |
\n",
"4 | 15887.0 | 10 Ave & W 28 St | 51.0 | 7 | Mon |
\n",
"5 | 15887.0 | 11 Ave & W 27 St | 65.0 | 7 | Mon |
\n",
"6 | 15887.0 | 11 Ave & W 41 St | 53.0 | 7 | Mon |
\n",
"7 | 15887.0 | 12 Ave & W 40 St | 36.0 | 7 | Mon |
\n",
"8 | 15887.0 | 2 Ave & E 31 St | 96.0 | 7 | Mon |
\n",
"9 | 15887.0 | 2 Ave & E 58 St | 103.0 | 7 | Mon |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# A little feature engineering\n",
"# Add in month-of-year (seasonality; fewer bike rides in winter than summer)\n",
"secs = bpd[\"Days\"]*secsPerDay\n",
"bpd[\"Month\"] = secs.month().asfactor()\n",
"# Add in day-of-week (work-week; more bike rides on Sunday than Monday)\n",
"bpd[\"DayOfWeek\"] = secs.dayOfWeek()\n",
"print(\"Bikes-Per-Day\")\n",
"bpd.describe()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# ----------\n",
"# 3- Fit a model on train; using test as validation\n",
"\n",
"def _train_timed(model, x, y, train, test):\n",
"    \"\"\"Train `model` on `train`, validating on `test`; return the elapsed seconds.\"\"\"\n",
"    started = time.time()\n",
"    model.train(x=x, y=y, training_frame=train, validation_frame=test)\n",
"    return time.time() - started\n",
"\n",
"\n",
"def _mse_row(label, model, frames, elapsed):\n",
"    \"\"\"Build one report row: label, MSE of `model` on each frame, rounded time.\"\"\"\n",
"    scores = [model.model_performance(frame).mse() for frame in frames]\n",
"    return [label] + scores + [round(elapsed, 3)]\n",
"\n",
"\n",
"# Function for doing class test/train/holdout split\n",
"def split_fit_predict(data, seed=None):\n",
"    \"\"\"Split `data` roughly 60/30/10, fit GBM, DRF, GLM and DL, and show their MSEs.\n",
"\n",
"    Every column of `data` except \"bikes\" is used as a predictor of \"bikes\".\n",
"    GLM, and DL which is trained after it, additionally leave out \"WC1\" when\n",
"    that column is present. The fitted models are published as the globals\n",
"    gbm0, drf0, glm0 and dl0 so that other cells can inspect them.\n",
"\n",
"    data -- frame holding a \"Days\" column, the \"bikes\" response and the\n",
"            predictor columns.\n",
"    seed -- optional seed for the random split. None (the default) draws the\n",
"            split exactly as an unseeded runif() call does.\n",
"\n",
"    Returns None. The results table is rendered through h2o.display.H2ODisplay.\n",
"    Raises ValueError if `data` has no \"bikes\" column.\n",
"    \"\"\"\n",
"    global gbm0, drf0, glm0, dl0\n",
"\n",
"    # Classic Test/Train split\n",
"    days = data['Days']\n",
"    # Random UNIForm numbers, one per row\n",
"    r = days.runif() if seed is None else days.runif(seed)\n",
"    train = data[r < 0.6]\n",
"    test = data[(0.6 <= r) & (r < 0.9)]\n",
"    hold = data[0.9 <= r]\n",
"    print(\"Training data has\",train.ncol,\"columns and\",train.nrow,\"rows, test has\",test.nrow,\"rows, holdout has\",hold.nrow)\n",
"\n",
"    # Work on a copy so that removing names can never touch the frame's own list.\n",
"    bike_names_x = list(data.names)\n",
"    bike_names_x.remove(\"bikes\")\n",
"    # Predictors for GLM and DL: the same columns minus WC1 (if present).\n",
"    # NOTE(review): DL only skips WC1 because it runs after GLM - confirm intended.\n",
"    names_without_wc1 = [name for name in bike_names_x if name != \"WC1\"]\n",
"\n",
"    # Run GBM\n",
"    gbm0 = H2OGradientBoostingEstimator(ntrees=500,  # 500 works well\n",
"                                        max_depth=6,\n",
"                                        learn_rate=0.1)\n",
"    gbm_elapsed = _train_timed(gbm0, bike_names_x, \"bikes\", train, test)\n",
"\n",
"    # Run DRF\n",
"    drf0 = H2ORandomForestEstimator(ntrees=250, max_depth=30)\n",
"    drf_elapsed = _train_timed(drf0, bike_names_x, \"bikes\", train, test)\n",
"\n",
"    # Run GLM\n",
"    glm0 = H2OGeneralizedLinearEstimator(Lambda=[1e-5], family=\"poisson\")\n",
"    glm_elapsed = _train_timed(glm0, names_without_wc1, \"bikes\", train, test)\n",
"\n",
"    # Run DL\n",
"    dl0 = H2ODeepLearningEstimator(hidden=[50,50,50,50], epochs=50)\n",
"    dl_elapsed = _train_timed(dl0, names_without_wc1, \"bikes\", train, test)\n",
"\n",
"    # ----------\n",
"    # 4- Score on holdout set & report\n",
"    frames = (train, test, hold)\n",
"\n",
"    # make a pretty HTML table printout of the results\n",
"    header = [\"Model\", \"mse TRAIN\", \"mse TEST\", \"mse HOLDOUT\", \"Model Training Time (s)\"]\n",
"    table = [\n",
"        _mse_row(\"GBM\", gbm0, frames, gbm_elapsed),\n",
"        _mse_row(\"DRF\", drf0, frames, drf_elapsed),\n",
"        _mse_row(\"GLM\", glm0, frames, glm_elapsed),\n",
"        _mse_row(\"DL \", dl0, frames, dl_elapsed),\n",
"    ]\n",
"    h2o.display.H2ODisplay(table,header)\n",
"    # --------------"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training data has 5 columns and 83197 rows, test has 41792 rows, holdout has 13806\n",
"gbm Model Build progress: |███████████████████████████████████████████████| 100%\n",
"drf Model Build progress: |███████████████████████████████████████████████| 100%\n",
"glm Model Build progress: |███████████████████████████████████████████████| 100%\n",
"deeplearning Model Build progress: |██████████████████████████████████████| 100%\n"
]
},
{
"data": {
"text/html": [
"Model | \n",
"mse TRAIN | \n",
"mse TEST | \n",
"mse HOLDOUT | \n",
"Model Training Time (s) |
\n",
"GBM | \n",
"144.1978206 | \n",
"337.7233790 | \n",
"324.8676731 | \n",
"12.454 |
\n",
"DRF | \n",
"599.8996508 | \n",
"723.9889710 | \n",
"685.0177303 | \n",
"19.764 |
\n",
"GLM | \n",
"967.1714929 | \n",
"957.3634230 | \n",
"938.1439730 | \n",
"0.454 |
\n",
"DL | \n",
"529.4687702 | \n",
"622.4935714 | \n",
"607.3274025 | \n",
"59.935 |
"
],
"text/plain": [
"Model mse TRAIN mse TEST mse HOLDOUT Model Training Time (s)\n",
"------- ----------- ---------- ------------- -------------------------\n",
"GBM 144.198 337.723 324.868 12.454\n",
"DRF 599.9 723.989 685.018 19.764\n",
"GLM 967.171 957.363 938.144 0.454\n",
"DL 529.469 622.494 607.327 59.935"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Split the data (into test & train), fit some models and predict on the holdout data\n",
"split_fit_predict(bpd)\n",
"# Here we see an r^2 of 0.91 for GBM, and 0.71 for GLM. This means given just\n",
"# the station, the month, and the day-of-week we can predict 90% of the\n",
"# variance of the bike-trip-starts."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Parse progress: |█████████████████████████████████████████████████████████| 100%\n",
"Rows:17520\n",
"Cols:50\n",
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
" | Year Local | Month Local | Day Local | Hour Local | Year UTC | Month UTC | Day UTC | Hour UTC | Cavok Reported | Cloud Ceiling (m) | Cloud Cover Fraction | Cloud Cover Fraction 1 | Cloud Cover Fraction 2 | Cloud Cover Fraction 3 | Cloud Cover Fraction 4 | Cloud Cover Fraction 5 | Cloud Cover Fraction 6 | Cloud Height (m) 1 | Cloud Height (m) 2 | Cloud Height (m) 3 | Cloud Height (m) 4 | Cloud Height (m) 5 | Cloud Height (m) 6 | Dew Point (C) | Humidity Fraction | Precipitation One Hour (mm) | Pressure Altimeter (mbar) | Pressure Sea Level (mbar) | Pressure Station (mbar) | Snow Depth (cm) | Temperature (C) | Visibility (km) | Weather Code 1 | Weather Code 1/ Description | Weather Code 2 | Weather Code 2/ Description | Weather Code 3 | Weather Code 3/ Description | Weather Code 4 | Weather Code 4/ Description | Weather Code 5 | Weather Code 5/ Description | Weather Code 6 | Weather Code 6/ Description | Weather Code Most Severe / Icon Code | Weather Code Most Severe | Weather Code Most Severe / Description | Wind Direction (degrees) | Wind Gust (m/s) | Wind Speed (m/s) |
\n",
"\n",
"\n",
"type | int | int | int | int | int | int | int | int | int | real | real | real | real | real | int | int | int | real | real | real | int | int | int | real | real | real | real | int | int | int | real | real | int | enum | int | enum | int | enum | int | enum | int | enum | int | enum | int | int | enum | int | real | real |
\n",
"mins | 2013.0 | 1.0 | 1.0 | 0.0 | 2013.0 | 1.0 | 1.0 | 0.0 | 0.0 | 61.0 | 0.0 | 0.0 | 0.25 | 0.5 | NaN | NaN | NaN | 60.96 | 213.36 | 365.76 | NaN | NaN | NaN | -26.700000000000003 | 0.12510000000000002 | 0.0 | 983.2949000000001 | NaN | NaN | NaN | -15.600000000000001 | 0.001 | 1.0 | | 1.0 | | 1.0 | | 1.0 | | 1.0 | | 3.0 | | 0.0 | 1.0 | | 10.0 | 7.2 | 0.0 |
\n",
"mean | 2013.5 | 6.5260273972602745 | 15.72054794520548 | 11.500000000000004 | 2013.5005707762557 | 6.525114155251141 | 15.72134703196347 | 11.500114155251142 | 0.0 | 1306.3119584569736 | 0.4167424905220181 | 0.3612073490813649 | 0.8724453840732911 | 0.9630456852791879 | 0.0 | 0.0 | 0.0 | 1293.9822681953192 | 1643.7390016566796 | 2084.8938637563456 | 0.0 | 0.0 | 0.0 | 4.313046467655992 | 0.5967363891594567 | 1.3799301075268817 | 1017.8258144055944 | 0.0 | 0.0 | 0.0 | 12.578909070073914 | 14.391442968202009 | 4.84251968503937 | | 3.6586768935762226 | | 2.8466076696165192 | | 2.0114942528735633 | | 4.125 | | 3.0 | | 1.3784817351598173 | 4.84251968503937 | | 194.69525681985743 | 9.422169480726348 | 2.4103288784874057 |
\n",
"maxs | 2014.0 | 12.0 | 31.0 | 23.0 | 2015.0 | 12.0 | 31.0 | 23.0 | 0.0 | 3657.6000000000004 | 1.0 | 1.0 | 1.0 | 1.0 | NaN | NaN | NaN | 3657.5999 | 3657.5999 | 3657.5999 | NaN | NaN | NaN | 24.400000000000002 | 1.0 | 26.924 | 1042.2113 | NaN | NaN | NaN | 36.1 | 16.0934 | 60.0 | | 60.0 | | 36.0 | | 27.0 | | 27.0 | | 3.0 | | 16.0 | 60.0 | | 360.0 | 20.580000000000002 | 10.8 |
\n",
"sigma | 0.500014270017262 | 3.447949723847773 | 8.796498048523272 | 6.922384111875021 | 0.50058441171579 | 3.447824054577647 | 8.795614888684717 | 6.922301652025526 | 0.0 | 995.3398569657211 | 0.4627208309925301 | 0.42770569708047684 | 0.19715569036704708 | 0.08610155981044185 | -0.0 | -0.0 | -0.0 | 962.7430958537232 | 916.7386134899587 | 887.2158475113932 | -0.0 | -0.0 | -0.0 | 10.973128209713666 | 0.18579201186573496 | 2.5621512917896463 | 7.464516971789659 | -0.0 | -0.0 | -0.0 | 10.039673953091574 | 3.6989362303340494 | 5.704865769828319 | | 6.133862539123368 | | 5.805532863642112 | | 3.1234084426128437 | | 6.15223536610881 | | 0.0 | | 4.073860627017756 | 5.704865769828319 | | 106.3500000314393 | 1.8151187111524154 | 1.614697905241178 |
\n",
"zeros | 0 | 0 | 0 | 730 | 0 | 0 | 0 | 730 | 17455 | 0 | 8758 | 8758 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 268 | 0 | 501 | 0 | 0 | 0 | 0 | 269 | 0 | 0 | | 0 | | 0 | | 0 | | 0 | | 0 | | 14980 | 0 | | 0 | 0 | 2768 |
\n",
"missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 65 | 10780 | 375 | 375 | 14682 | 16535 | 17520 | 17520 | 17520 | 9103 | 14683 | 16535 | 17520 | 17520 | 17520 | 67 | 67 | 15660 | 360 | 17520 | 17520 | 17520 | 67 | 412 | 14980 | 14980 | 16477 | 16477 | 17181 | 17181 | 17433 | 17433 | 17504 | 17504 | 17518 | 17518 | 0 | 14980 | 14980 | 9382 | 14381 | 1283 |
\n",
"0 | 2013.0 | 1.0 | 1.0 | 0.0 | 2013.0 | 1.0 | 1.0 | 5.0 | 0.0 | 2895.6000000000004 | 1.0 | 0.9 | 1.0 | nan | nan | nan | nan | 2895.5999 | 3352.8 | nan | nan | nan | nan | -5.0 | 0.5447000000000001 | nan | 1013.0917000000001 | nan | nan | nan | 3.3000000000000003 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | nan | 2.57 |
\n",
"1 | 2013.0 | 1.0 | 1.0 | 1.0 | 2013.0 | 1.0 | 1.0 | 6.0 | 0.0 | 3048.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 3048.0 | nan | nan | nan | nan | nan | -4.4 | 0.5463 | nan | 1012.0759 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | 260.0 | 9.77 | 4.63 |
\n",
"2 | 2013.0 | 1.0 | 1.0 | 2.0 | 2013.0 | 1.0 | 1.0 | 7.0 | 0.0 | 1828.8000000000002 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1828.7999 | nan | nan | nan | nan | nan | -3.3000000000000003 | 0.619 | nan | 1012.4145000000001 | nan | nan | nan | 3.3000000000000003 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | 7.72 | 1.54 |
\n",
"3 | 2013.0 | 1.0 | 1.0 | 3.0 | 2013.0 | 1.0 | 1.0 | 8.0 | 0.0 | 1463.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1463.04 | nan | nan | nan | nan | nan | -2.8000000000000003 | 0.6159 | nan | 1012.4145000000001 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | nan | 3.09 |
\n",
"4 | 2013.0 | 1.0 | 1.0 | 4.0 | 2013.0 | 1.0 | 1.0 | 9.0 | 0.0 | 1402.1000000000001 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1402.0800000000002 | nan | nan | nan | nan | nan | -2.8000000000000003 | 0.6159 | nan | 1012.7531 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | 260.0 | nan | 4.12 |
\n",
"5 | 2013.0 | 1.0 | 1.0 | 5.0 | 2013.0 | 1.0 | 1.0 | 10.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -2.8000000000000003 | 0.6159 | nan | 1012.4145000000001 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | nan | 3.09 |
\n",
"6 | 2013.0 | 1.0 | 1.0 | 6.0 | 2013.0 | 1.0 | 1.0 | 11.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -3.3000000000000003 | 0.5934 | nan | 1012.0759 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | 9.26 | 3.09 |
\n",
"7 | 2013.0 | 1.0 | 1.0 | 7.0 | 2013.0 | 1.0 | 1.0 | 12.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -3.3000000000000003 | 0.5934 | nan | 1012.4145000000001 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | 260.0 | 9.26 | 4.63 |
\n",
"8 | 2013.0 | 1.0 | 1.0 | 8.0 | 2013.0 | 1.0 | 1.0 | 13.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -2.8000000000000003 | 0.6425000000000001 | nan | 1012.4145000000001 | nan | nan | nan | 3.3000000000000003 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | 260.0 | nan | 3.09 |
\n",
"9 | 2013.0 | 1.0 | 1.0 | 9.0 | 2013.0 | 1.0 | 1.0 | 14.0 | 0.0 | 1524.0 | 1.0 | 0.9 | 1.0 | nan | nan | nan | nan | 1524.0 | 3657.5999 | nan | nan | nan | nan | -2.8000000000000003 | 0.6159 | nan | 1012.4145000000001 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | 9.26 | 3.09 |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# ----------\n",
"# 5- Now lets add some weather\n",
"# Load weather data: the 2013 and 2014 hourly files, imported as one frame\n",
"wthr1 = h2o.import_file(path=list(map(mylocate, [\n",
"    \"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv\",\n",
"    \"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv\",\n",
"])))\n",
"# Peek at the data\n",
"wthr1.describe()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rows:17520\n",
"Cols:9\n",
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
" | Year Local | Month Local | Day Local | Hour Local | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 |
\n",
"\n",
"\n",
"type | int | int | int | int | real | real | real | real | enum |
\n",
"mins | 2013.0 | 1.0 | 1.0 | 0.0 | -26.700000000000003 | 0.12510000000000002 | 0.0 | -15.600000000000001 | |
\n",
"mean | 2013.5 | 6.5260273972602745 | 15.72054794520548 | 11.500000000000004 | 4.313046467655992 | 0.5967363891594567 | 1.3799301075268817 | 12.578909070073914 | |
\n",
"maxs | 2014.0 | 12.0 | 31.0 | 23.0 | 24.400000000000002 | 1.0 | 26.924 | 36.1 | |
\n",
"sigma | 0.500014270017262 | 3.447949723847773 | 8.796498048523272 | 6.922384111875021 | 10.973128209713666 | 0.18579201186573496 | 2.5621512917896463 | 10.039673953091574 | |
\n",
"zeros | 0 | 0 | 0 | 730 | 268 | 0 | 501 | 269 | |
\n",
"missing | 0 | 0 | 0 | 0 | 67 | 67 | 15660 | 67 | 14980 |
\n",
"0 | 2013.0 | 1.0 | 1.0 | 0.0 | -5.0 | 0.5447000000000001 | nan | 3.3000000000000003 | |
\n",
"1 | 2013.0 | 1.0 | 1.0 | 1.0 | -4.4 | 0.5463 | nan | 3.9000000000000004 | |
\n",
"2 | 2013.0 | 1.0 | 1.0 | 2.0 | -3.3000000000000003 | 0.619 | nan | 3.3000000000000003 | |
\n",
"3 | 2013.0 | 1.0 | 1.0 | 3.0 | -2.8000000000000003 | 0.6159 | nan | 3.9000000000000004 | |
\n",
"4 | 2013.0 | 1.0 | 1.0 | 4.0 | -2.8000000000000003 | 0.6159 | nan | 3.9000000000000004 | |
\n",
"5 | 2013.0 | 1.0 | 1.0 | 5.0 | -2.8000000000000003 | 0.6159 | nan | 3.9000000000000004 | |
\n",
"6 | 2013.0 | 1.0 | 1.0 | 6.0 | -3.3000000000000003 | 0.5934 | nan | 3.9000000000000004 | |
\n",
"7 | 2013.0 | 1.0 | 1.0 | 7.0 | -3.3000000000000003 | 0.5934 | nan | 3.9000000000000004 | |
\n",
"8 | 2013.0 | 1.0 | 1.0 | 8.0 | -2.8000000000000003 | 0.6425000000000001 | nan | 3.3000000000000003 | |
\n",
"9 | 2013.0 | 1.0 | 1.0 | 9.0 | -2.8000000000000003 | 0.6159 | nan | 3.9000000000000004 | |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Lots of columns in there! The plan: convert to time-since-epoch so the\n",
"# weather can be 'joined' with the bike data, and keep only the weather info\n",
"# that might affect cyclists - rain, snow, temperature. Alas, the \"snow\"\n",
"# column is all NA's, so it is left out. Dew point and humidity come along\n",
"# just in case. Everything else is dropped.\n",
"wthr2 = wthr1[[\n",
"    \"Year Local\", \"Month Local\", \"Day Local\", \"Hour Local\",\n",
"    \"Dew Point (C)\", \"Humidity Fraction\",\n",
"    \"Precipitation One Hour (mm)\", \"Temperature (C)\",\n",
"    \"Weather Code 1/ Description\",\n",
"]]\n",
"\n",
"# Shorten the two unwieldy column names, locating each column by name.\n",
"wthr2.set_name(wthr2.names.index(\"Precipitation One Hour (mm)\"), \"Rain (mm)\")\n",
"wthr2.set_name(wthr2.names.index(\"Weather Code 1/ Description\"), \"WC1\")\n",
"wthr2.describe()\n",
"# Much better! "
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Filter down to the weather at Noon\n",
"wthr3 = wthr2[ wthr2[\"Hour Local\"]==12 ]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rows:730\n",
"Cols:11\n",
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
" | Year Local | Month Local | Day Local | Hour Local | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | msec | Days |
\n",
"\n",
"\n",
"type | int | int | int | int | real | real | real | real | enum | int | int |
\n",
"mins | 2013.0 | 1.0 | 1.0 | 12.0 | -26.700000000000003 | 0.1723 | 0.0 | -13.9 | | 1357070400000.0 | 15706.0 |
\n",
"mean | 2013.5 | 6.526027397260274 | 15.72054794520548 | 12.0 | 4.230123796423659 | 0.539728198074278 | 1.5312571428571429 | 14.068775790921595 | | 1388560852602.7397 | 16070.5 |
\n",
"maxs | 2014.0 | 12.0 | 31.0 | 12.0 | 23.3 | 1.0 | 12.446 | 34.4 | | 1420056000000.0 | 16435.0 |
\n",
"sigma | 0.5003428180039172 | 3.450215293068149 | 8.802278027009615 | 0.0 | 11.106296472475226 | 0.17994502792324327 | 2.3606424861490587 | 10.398985514891212 | | 18219740080.410755 | 210.87713642466474 |
\n",
"zeros | 0 | 0 | 0 | 0 | 14 | 0 | 15 | 7 | | 0 | 0 |
\n",
"missing | 0 | 0 | 0 | 0 | 3 | 3 | 660 | 3 | 620 | 0 | 0 |
\n",
"0 | 2013.0 | 1.0 | 1.0 | 12.0 | -3.3000000000000003 | 0.5934 | nan | 3.9000000000000004 | | 1357070400000.0 | 15706.0 |
\n",
"1 | 2013.0 | 1.0 | 2.0 | 12.0 | -11.700000000000001 | 0.4806 | nan | -2.2 | | 1357156800000.0 | 15707.0 |
\n",
"2 | 2013.0 | 1.0 | 3.0 | 12.0 | -10.600000000000001 | 0.5248 | nan | -2.2 | | 1357243200000.0 | 15708.0 |
\n",
"3 | 2013.0 | 1.0 | 4.0 | 12.0 | -7.2 | 0.49760000000000004 | nan | 2.2 | | 1357329600000.0 | 15709.0 |
\n",
"4 | 2013.0 | 1.0 | 5.0 | 12.0 | -7.2 | 0.42600000000000005 | nan | 4.4 | | 1357416000000.0 | 15710.0 |
\n",
"5 | 2013.0 | 1.0 | 6.0 | 12.0 | -1.7000000000000002 | 0.6451 | nan | 4.4 | haze | 1357502400000.0 | 15711.0 |
\n",
"6 | 2013.0 | 1.0 | 7.0 | 12.0 | -6.1000000000000005 | 0.41190000000000004 | nan | 6.1000000000000005 | | 1357588800000.0 | 15712.0 |
\n",
"7 | 2013.0 | 1.0 | 8.0 | 12.0 | -1.7000000000000002 | 0.5314 | nan | 7.2 | | 1357675200000.0 | 15713.0 |
\n",
"8 | 2013.0 | 1.0 | 9.0 | 12.0 | 0.6000000000000001 | 0.56 | nan | 8.9 | haze | 1357761600000.0 | 15714.0 |
\n",
"9 | 2013.0 | 1.0 | 10.0 | 12.0 | -6.1000000000000005 | 0.3952 | nan | 6.7 | | 1357848000000.0 | 15715.0 |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Lets now get Days since the epoch... we'll convert year/month/day into Epoch\n",
"# time, and then back to Epoch days. Need zero-based month and days, but have\n",
"# 1-based.\n",
"wthr3[\"msec\"] = h2o.H2OFrame.mktime(year=wthr3[\"Year Local\"], month=wthr3[\"Month Local\"]-1, day=wthr3[\"Day Local\"]-1, hour=wthr3[\"Hour Local\"])\n",
"secsPerDay=1000*60*60*24\n",
"wthr3[\"Days\"] = (wthr3[\"msec\"]/secsPerDay).floor()\n",
"wthr3.describe()\n",
"# msec looks sane (numbers like 1.3e12 are in the correct range for msec since\n",
"# 1970). Epoch Days matches closely with the epoch day numbers from the\n",
"# CitiBike dataset. "
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Lets drop off the extra time columns to make a easy-to-handle dataset.\n",
"wthr4 = wthr3.drop(\"Year Local\").drop(\"Month Local\").drop(\"Day Local\").drop(\"Hour Local\").drop(\"msec\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Also, most rain numbers are missing - lets assume those are zero rain days\n",
"rain = wthr4[\"Rain (mm)\"]\n",
"rain[ rain.isna() ] = 0\n",
"wthr4[\"Rain (mm)\"] = rain"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Merge Daily Weather with Bikes-Per-Day\n",
"Rows:138795\n",
"Cols:10\n",
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
" | Days | start station name | bikes | Month | DayOfWeek | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 |
\n",
"\n",
"\n",
"type | int | enum | int | enum | enum | real | real | real | real | enum |
\n",
"mins | 15887.0 | | 1.0 | | | -26.700000000000003 | 0.1723 | 0.0 | -13.9 | |
\n",
"mean | 16099.488166000216 | | 74.98502107424619 | | | 5.451762870514826 | 0.5321604665675 | 0.08503267408768331 | 15.614519464499507 | |
\n",
"maxs | 16313.0 | | 668.0 | | | 23.3 | 1.0 | 8.382 | 34.4 | |
\n",
"sigma | 123.39632568805678 | | 64.73186265524505 | | | 11.723905010415397 | 0.1784104767702021 | 0.5764961942157182 | 10.928653577314824 | |
\n",
"zeros | 0 | | 0 | | | 1956 | 0 | 130793 | 1567 | |
\n",
"missing | 0 | 0 | 0 | 0 | 0 | 980 | 980 | 0 | 980 | 118772 |
\n",
"0 | 15887.0 | 1 Ave & E 15 St | 74.0 | 7 | Mon | 21.700000000000003 | 0.9354 | 4.572 | 22.8 | rain |
\n",
"1 | 15887.0 | 1 Ave & E 18 St | 51.0 | 7 | Mon | 21.700000000000003 | 0.9354 | 4.572 | 22.8 | rain |
\n",
"2 | 15887.0 | 1 Ave & E 30 St | 66.0 | 7 | Mon | 21.700000000000003 | 0.9354 | 4.572 | 22.8 | rain |
\n",
"3 | 15887.0 | 1 Ave & E 44 St | 56.0 | 7 | Mon | 21.700000000000003 | 0.9354 | 4.572 | 22.8 | rain |
\n",
"4 | 15887.0 | 10 Ave & W 28 St | 51.0 | 7 | Mon | 21.700000000000003 | 0.9354 | 4.572 | 22.8 | rain |
\n",
"5 | 15887.0 | 11 Ave & W 27 St | 65.0 | 7 | Mon | 21.700000000000003 | 0.9354 | 4.572 | 22.8 | rain |
\n",
"6 | 15887.0 | 11 Ave & W 41 St | 53.0 | 7 | Mon | 21.700000000000003 | 0.9354 | 4.572 | 22.8 | rain |
\n",
"7 | 15887.0 | 12 Ave & W 40 St | 36.0 | 7 | Mon | 21.700000000000003 | 0.9354 | 4.572 | 22.8 | rain |
\n",
"8 | 15887.0 | 2 Ave & E 31 St | 96.0 | 7 | Mon | 21.700000000000003 | 0.9354 | 4.572 | 22.8 | rain |
\n",
"9 | 15887.0 | 2 Ave & E 58 St | 103.0 | 7 | Mon | 21.700000000000003 | 0.9354 | 4.572 | 22.8 | rain |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
" Days | start station name | bikes | Month | DayOfWeek | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 |
\n",
"\n",
"\n",
" 15887 | 1 Ave & E 15 St | 74 | 7 | Mon | 21.7 | 0.9354 | 4.572 | 22.8 | rain |
\n",
" 15887 | 1 Ave & E 18 St | 51 | 7 | Mon | 21.7 | 0.9354 | 4.572 | 22.8 | rain |
\n",
" 15887 | 1 Ave & E 30 St | 66 | 7 | Mon | 21.7 | 0.9354 | 4.572 | 22.8 | rain |
\n",
" 15887 | 1 Ave & E 44 St | 56 | 7 | Mon | 21.7 | 0.9354 | 4.572 | 22.8 | rain |
\n",
" 15887 | 10 Ave & W 28 St | 51 | 7 | Mon | 21.7 | 0.9354 | 4.572 | 22.8 | rain |
\n",
" 15887 | 11 Ave & W 27 St | 65 | 7 | Mon | 21.7 | 0.9354 | 4.572 | 22.8 | rain |
\n",
" 15887 | 11 Ave & W 41 St | 53 | 7 | Mon | 21.7 | 0.9354 | 4.572 | 22.8 | rain |
\n",
" 15887 | 12 Ave & W 40 St | 36 | 7 | Mon | 21.7 | 0.9354 | 4.572 | 22.8 | rain |
\n",
" 15887 | 2 Ave & E 31 St | 96 | 7 | Mon | 21.7 | 0.9354 | 4.572 | 22.8 | rain |
\n",
" 15887 | 2 Ave & E 58 St | 103 | 7 | Mon | 21.7 | 0.9354 | 4.572 | 22.8 | rain |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# ----------\n",
"# 6 - Join the weather data-per-day to the bike-starts-per-day\n",
"print(\"Merge Daily Weather with Bikes-Per-Day\")\n",
"bpd_with_weather = bpd.merge(wthr4,all_x=True,all_y=False)\n",
"bpd_with_weather.describe()\n",
"bpd_with_weather.show()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training data has 10 columns and 83276 rows, test has 41631 rows, holdout has 13888\n",
"gbm Model Build progress: |███████████████████████████████████████████████| 100%\n",
"drf Model Build progress: |███████████████████████████████████████████████| 100%\n",
"glm Model Build progress: |███████████████████████████████████████████████| 100%\n",
"deeplearning Model Build progress: |██████████████████████████████████████| 100%\n"
]
},
{
"data": {
"text/html": [
"Model | \n",
"mse TRAIN | \n",
"mse TEST | \n",
"mse HOLDOUT | \n",
"Model Training Time (s) |
\n",
"GBM | \n",
"120.3078530 | \n",
"291.8993667 | \n",
"285.0585033 | \n",
"19.141 |
\n",
"DRF | \n",
"117.6329195 | \n",
"388.7562018 | \n",
"385.6955657 | \n",
"88.24 |
\n",
"GLM | \n",
"862.8161359 | \n",
"879.2555091 | \n",
"875.8816961 | \n",
"0.449 |
\n",
"DL | \n",
"238.1676790 | \n",
"340.7971229 | \n",
"351.0403636 | \n",
"63.611 |
"
],
"text/plain": [
"Model mse TRAIN mse TEST mse HOLDOUT Model Training Time (s)\n",
"------- ----------- ---------- ------------- -------------------------\n",
"GBM 120.308 291.899 285.059 19.141\n",
"DRF 117.633 388.756 385.696 88.24\n",
"GLM 862.816 879.256 875.882 0.449\n",
"DL 238.168 340.797 351.04 63.611"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 7 - Test/Train split again, model build again, this time with weather\n",
"split_fit_predict(bpd_with_weather)"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}