{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"import h2o\n",
"import time\n",
"from h2o.estimators.glm import H2OGeneralizedLinearEstimator\n",
"from h2o.estimators.gbm import H2OGradientBoostingEstimator\n",
"from h2o.estimators.random_forest import H2ORandomForestEstimator\n",
"from h2o.estimators.deeplearning import H2ODeepLearningEstimator"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checking whether there is an H2O instance running at http://localhost:54321. connected.\n"
]
},
{
"data": {
"text/html": [
"
H2O cluster uptime: | \n",
"08 secs |
\n",
"H2O cluster version: | \n",
"3.11.0.99999 |
\n",
"H2O cluster version age: | \n",
"1 minute |
\n",
"H2O cluster name: | \n",
"pasha |
\n",
"H2O cluster total nodes: | \n",
"1 |
\n",
"H2O cluster free memory: | \n",
"3.556 Gb |
\n",
"H2O cluster total cores: | \n",
"8 |
\n",
"H2O cluster allowed cores: | \n",
"8 |
\n",
"H2O cluster status: | \n",
"accepting new members, healthy |
\n",
"H2O connection url: | \n",
"http://localhost:54321 |
\n",
"H2O connection proxy: | \n",
"None |
\n",
"Python version: | \n",
"3.5.2 final |
"
],
"text/plain": [
"-------------------------- ------------------------------\n",
"H2O cluster uptime: 08 secs\n",
"H2O cluster version: 3.11.0.99999\n",
"H2O cluster version age: 1 minute\n",
"H2O cluster name: pasha\n",
"H2O cluster total nodes: 1\n",
"H2O cluster free memory: 3.556 Gb\n",
"H2O cluster total cores: 8\n",
"H2O cluster allowed cores: 8\n",
"H2O cluster status: accepting new members, healthy\n",
"H2O connection url: http://localhost:54321\n",
"H2O connection proxy:\n",
"Python version: 3.5.2 final\n",
"-------------------------- ------------------------------"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Explore a typical Data Science workflow with H2O and Python\n",
"#\n",
"# Goal: assist the manager of CitiBike of NYC to load-balance the bicycles\n",
"# across the CitiBike network of stations, by predicting the number of bike\n",
"# trips taken from the station every day. Use 10 million rows of historical\n",
"# data, and eventually add weather data.\n",
"\n",
"\n",
"# Connect to a cluster\n",
"h2o.init()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.\n",
"\n",
"# Set this to True if you want to fetch the data directly from S3.\n",
"# This is useful if your cluster is running in EC2.\n",
"data_source_is_s3 = False\n",
"\n",
"def mylocate(s):\n",
" if data_source_is_s3:\n",
" return \"s3n://h2o-public-test-data/\" + s\n",
" else:\n",
" return _locate(s)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Import and Parse bike data\n",
"Parse progress: |█████████████████████████████████████████████████████████| 100%\n"
]
}
],
"source": [
"# Pick either the big or the small demo.\n",
"# Big data is 10M rows\n",
"small_test = [mylocate(\"bigdata/laptop/citibike-nyc/2013-10.csv\")]\n",
"big_test = [mylocate(\"bigdata/laptop/citibike-nyc/2013-07.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2013-08.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2013-09.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2013-10.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2013-11.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2013-12.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2014-01.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2014-02.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2014-03.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2014-04.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2014-05.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2014-06.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2014-07.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/2014-08.csv\")]\n",
"\n",
"# ----------\n",
"\n",
"# 1- Load data - 1 row per bicycle trip. Has columns showing the start and end\n",
"# station, trip duration and trip start time and day. The larger dataset\n",
"# totals about 10 million rows\n",
"print(\"Import and Parse bike data\")\n",
"data = h2o.import_file(path=small_test)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rows:1037712\n",
"Cols:16\n",
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
" | tripduration | starttime | stoptime | start station id | start station name | start station latitude | start station longitude | end station id | end station name | end station latitude | end station longitude | bikeid | usertype | birth year | gender | Days |
\n",
"\n",
"\n",
"type | int | time | time | int | enum | real | real | int | enum | real | real | int | enum | int | int | int |
\n",
"mins | 60.0 | 1380585668000.0 | 1380585883000.0 | 72.0 | | 40.680342423 | -74.01713445 | 72.0 | | 40.680342423 | -74.01713445 | 14529.0 | | 1899.0 | 0.0 | 15979.0 |
\n",
"mean | 825.6147543827192 | 1381888516917.714 | 1381889342532.4746 | 443.7142126139049 | | 40.73451885864454 | -73.99113288482197 | 443.20742171238254 | | 40.73428478848875 | -73.99127029824423 | 17644.071645119242 | | 1975.7783948601839 | 1.123755916863252 | 15993.476745956474 |
\n",
"maxs | 1259480.0 | 1383263997000.0 | 1383393310000.0 | 3002.0 | | 40.770513 | -73.9500479759 | 3002.0 | | 40.770513 | -73.9500479759 | 20757.0 | | 1997.0 | 2.0 | 16009.0 |
\n",
"sigma | 2000.3732322961862 | 778871729.1323168 | 778847387.5037588 | 354.43432507453724 | | 0.01957340730530415 | 0.012316123410581171 | 357.39821705755827 | | 0.019557845811587957 | 0.012385581196537298 | 1717.6811213447866 | | 11.131490623834942 | 0.544380593291009 | 9.014533519116712 |
\n",
"zeros | 0 | 0 | 0 | 0 | | 0 | 0 | 0 | | 0 | 0 | 0 | | 0 | 97498 | 0 |
\n",
"missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 97445 | 0 | 0 |
\n",
"0 | 326.0 | 2013-10-01 00:01:08 | 2013-10-01 00:06:34 | 239.0 | Willoughby St & Fleet St | 40.69196566 | -73.9813018 | 366.0 | Clinton Ave & Myrtle Ave | 40.693261 | -73.968896 | 16052.0 | Subscriber | 1982.0 | 1.0 | 15979.0 |
\n",
"1 | 729.0 | 2013-10-01 00:01:21 | 2013-10-01 00:13:30 | 322.0 | Clinton St & Tillary St | 40.696191999999996 | -73.991218 | 398.0 | Atlantic Ave & Furman St | 40.69165183 | -73.99997859999999 | 19412.0 | Customer | nan | 0.0 | 15979.0 |
\n",
"2 | 520.0 | 2013-10-01 00:01:24 | 2013-10-01 00:10:04 | 174.0 | E 25 St & 1 Ave | 40.7381765 | -73.97738662 | 403.0 | E 2 St & 2 Ave | 40.72502876 | -73.99069656 | 19645.0 | Subscriber | 1984.0 | 1.0 | 15979.0 |
\n",
"3 | 281.0 | 2013-10-01 00:01:25 | 2013-10-01 00:06:06 | 430.0 | York St & Jay St | 40.7014851 | -73.98656928 | 323.0 | Lawrence St & Willoughby St | 40.69236178 | -73.98631746 | 16992.0 | Subscriber | 1985.0 | 1.0 | 15979.0 |
\n",
"4 | 196.0 | 2013-10-01 00:01:27 | 2013-10-01 00:04:43 | 403.0 | E 2 St & 2 Ave | 40.72502876 | -73.99069656 | 401.0 | Allen St & Rivington St | 40.72019576 | -73.98997825000001 | 15690.0 | Subscriber | 1986.0 | 1.0 | 15979.0 |
\n",
"5 | 1948.0 | 2013-10-01 00:01:48 | 2013-10-01 00:34:16 | 369.0 | Washington Pl & 6 Ave | 40.73224119 | -74.00026394 | 307.0 | Canal St & Rutgers St | 40.714274870000004 | -73.98990025 | 19846.0 | Subscriber | 1977.0 | 1.0 | 15979.0 |
\n",
"6 | 1327.0 | 2013-10-01 00:01:48 | 2013-10-01 00:23:55 | 254.0 | W 11 St & 6 Ave | 40.73532427 | -73.99800419 | 539.0 | Metropolitan Ave & Bedford Ave | 40.71534825 | -73.96024116 | 14563.0 | Subscriber | 1986.0 | 2.0 | 15979.0 |
\n",
"7 | 1146.0 | 2013-10-01 00:01:57 | 2013-10-01 00:21:03 | 490.0 | 8 Ave & W 33 St | 40.751551 | -73.993934 | 438.0 | St Marks Pl & 1 Ave | 40.727791260000004 | -73.98564945 | 16793.0 | Subscriber | 1959.0 | 1.0 | 15979.0 |
\n",
"8 | 380.0 | 2013-10-01 00:01:58 | 2013-10-01 00:08:18 | 468.0 | Broadway & W 55 St | 40.7652654 | -73.98192338 | 385.0 | E 55 St & 2 Ave | 40.757973220000004 | -73.96603308 | 16600.0 | Customer | nan | 0.0 | 15979.0 |
\n",
"9 | 682.0 | 2013-10-01 00:02:05 | 2013-10-01 00:13:27 | 300.0 | Shevchenko Pl & E 6 St | 40.728145 | -73.990214 | 519.0 | Pershing Square N | 40.75188406 | -73.97770164 | 15204.0 | Subscriber | 1992.0 | 1.0 | 15979.0 |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# ----------\n",
"\n",
"# 2- light data munging: group the bike starts per-day, converting the 10M rows\n",
"# of trips to about 140,000 station&day combos - predicting the number of trip\n",
"# starts per-station-per-day.\n",
"\n",
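"# Convert start time (msec since the Epoch) to: Days since the Epoch.\n",
"# NOTE: despite its name, secsPerDay holds the number of milliseconds per day.\n",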
"startime = data[\"starttime\"]\n",
"secsPerDay = 1000 * 3600 * 24\n",
"data[\"Days\"] = (startime.asnumeric() / secsPerDay).floor()\n",
"data.describe()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
" Days | start station name | bikes |
\n",
"\n",
"\n",
" 15979 | 1 Ave & E 15 St | 173 |
\n",
" 15979 | 1 Ave & E 18 St | 118 |
\n",
" 15979 | 1 Ave & E 30 St | 152 |
\n",
" 15979 | 10 Ave & W 28 St | 115 |
\n",
" 15979 | 11 Ave & W 27 St | 210 |
\n",
" 15979 | 11 Ave & W 41 St | 106 |
\n",
" 15979 | 12 Ave & W 40 St | 144 |
\n",
" 15979 | 2 Ave & E 31 St | 206 |
\n",
" 15979 | 2 Ave & E 58 St | 105 |
\n",
" 15979 | 3 Ave & Schermerhorn St | 15 |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rows:10131\n",
"Cols:3\n",
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
" | Days | start station name | bikes |
\n",
"\n",
"\n",
"type | int | enum | int |
\n",
"mins | 15979.0 | | 1.0 |
\n",
"mean | 15993.953311617806 | | 102.42937518507551 |
\n",
"maxs | 16009.0 | | 603.0 |
\n",
"sigma | 8.950698111468864 | | 74.05933443246006 |
\n",
"zeros | 0 | | 0 |
\n",
"missing | 0 | 0 | 0 |
\n",
"0 | 15979.0 | 1 Ave & E 15 St | 173.0 |
\n",
"1 | 15979.0 | 1 Ave & E 18 St | 118.0 |
\n",
"2 | 15979.0 | 1 Ave & E 30 St | 152.0 |
\n",
"3 | 15979.0 | 10 Ave & W 28 St | 115.0 |
\n",
"4 | 15979.0 | 11 Ave & W 27 St | 210.0 |
\n",
"5 | 15979.0 | 11 Ave & W 41 St | 106.0 |
\n",
"6 | 15979.0 | 12 Ave & W 40 St | 144.0 |
\n",
"7 | 15979.0 | 2 Ave & E 31 St | 206.0 |
\n",
"8 | 15979.0 | 2 Ave & E 58 St | 105.0 |
\n",
"9 | 15979.0 | 3 Ave & Schermerhorn St | 15.0 |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"[10131, 3]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Now do a monster Group-By. Count bike starts per-station per-day. On the big\n",
"# dataset this ends up with about 340 stations times 400 days (140,000 rows);\n",
"# the small demo yields far fewer. This count is what we want to predict.\n",
"grouped = data.group_by([\"Days\",\"start station name\"])\n",
"bpd = grouped.count().get_frame() # Compute bikes-per-day\n",
"bpd.set_name(2,\"bikes\")\n",
"bpd.show()\n",
"bpd.describe()\n",
"bpd.dim"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Quantiles of bikes-per-day\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
" Probs | bikesQuantiles |
\n",
"\n",
"\n",
" 0.01 | 5 |
\n",
" 0.1 | 20 |
\n",
" 0.25 | 45 |
\n",
" 0.333 | 60 |
\n",
" 0.5 | 91 |
\n",
" 0.667 | 121 |
\n",
" 0.75 | 141 |
\n",
" 0.9 | 197 |
\n",
" 0.99 | 340.4 |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Quantiles: the data is fairly unbalanced; some station/day combos are wildly\n",
"# more popular than others.\n",
"print(\"Quantiles of bikes-per-day\")\n",
"bpd[\"bikes\"].quantile().show()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Bikes-Per-Day\n",
"Rows:10131\n",
"Cols:5\n",
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
" | Days | start station name | bikes | Month | DayOfWeek |
\n",
"\n",
"\n",
"type | int | enum | int | enum | enum |
\n",
"mins | 15979.0 | | 1.0 | | |
\n",
"mean | 15993.953311617806 | | 102.42937518507551 | | |
\n",
"maxs | 16009.0 | | 603.0 | | |
\n",
"sigma | 8.950698111468864 | | 74.05933443246006 | | |
\n",
"zeros | 0 | | 0 | | |
\n",
"missing | 0 | 0 | 0 | 0 | 0 |
\n",
"0 | 15979.0 | 1 Ave & E 15 St | 173.0 | 10 | Tue |
\n",
"1 | 15979.0 | 1 Ave & E 18 St | 118.0 | 10 | Tue |
\n",
"2 | 15979.0 | 1 Ave & E 30 St | 152.0 | 10 | Tue |
\n",
"3 | 15979.0 | 10 Ave & W 28 St | 115.0 | 10 | Tue |
\n",
"4 | 15979.0 | 11 Ave & W 27 St | 210.0 | 10 | Tue |
\n",
"5 | 15979.0 | 11 Ave & W 41 St | 106.0 | 10 | Tue |
\n",
"6 | 15979.0 | 12 Ave & W 40 St | 144.0 | 10 | Tue |
\n",
"7 | 15979.0 | 2 Ave & E 31 St | 206.0 | 10 | Tue |
\n",
"8 | 15979.0 | 2 Ave & E 58 St | 105.0 | 10 | Tue |
\n",
"9 | 15979.0 | 3 Ave & Schermerhorn St | 15.0 | 10 | Tue |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# A little feature engineering\n",
"# Add in month-of-year (seasonality; fewer bike rides in winter than summer)\n",
"secs = bpd[\"Days\"]*secsPerDay\n",
"bpd[\"Month\"] = secs.month().asfactor()\n",
"# Add in day-of-week (work-week; more bike rides on Sunday than Monday)\n",
"bpd[\"DayOfWeek\"] = secs.dayOfWeek()\n",
"print(\"Bikes-Per-Day\")\n",
"bpd.describe()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# ----------\n",
"# 3- Fit a model on train; using test as validation\n",
"\n",
"# Function for doing classic test/train/holdout split\n",
"def split_fit_predict(data):\n",
" global gbm0,drf0,glm0,dl0\n",
" # Classic Test/Train split\n",
" r = data['Days'].runif() # Random UNIForm numbers, one per row\n",
" train = data[ r < 0.6]\n",
" test = data[(0.6 <= r) & (r < 0.9)]\n",
" hold = data[ 0.9 <= r ]\n",
" print(\"Training data has\",train.ncol,\"columns and\",train.nrow,\"rows, test has\",test.nrow,\"rows, holdout has\",hold.nrow)\n",
" bike_names_x = data.names\n",
" bike_names_x.remove(\"bikes\")\n",
" \n",
" # Run GBM\n",
" s = time.time()\n",
" \n",
" gbm0 = H2OGradientBoostingEstimator(ntrees=500, # 500 works well\n",
" max_depth=6,\n",
" learn_rate=0.1)\n",
" \n",
"\n",
" gbm0.train(x =bike_names_x,\n",
" y =\"bikes\",\n",
" training_frame =train,\n",
" validation_frame=test)\n",
"\n",
" gbm_elapsed = time.time() - s\n",
"\n",
" # Run DRF\n",
" s = time.time()\n",
" \n",
" drf0 = H2ORandomForestEstimator(ntrees=250, max_depth=30)\n",
"\n",
" drf0.train(x =bike_names_x,\n",
" y =\"bikes\",\n",
" training_frame =train,\n",
" validation_frame=test)\n",
" \n",
" drf_elapsed = time.time() - s \n",
" \n",
" \n",
" # Run GLM\n",
" if \"WC1\" in bike_names_x: bike_names_x.remove(\"WC1\")\n",
" s = time.time()\n",
"\n",
" glm0 = H2OGeneralizedLinearEstimator(Lambda=[1e-5], family=\"poisson\")\n",
" \n",
" glm0.train(x =bike_names_x,\n",
" y =\"bikes\",\n",
" training_frame =train,\n",
" validation_frame=test)\n",
"\n",
" glm_elapsed = time.time() - s\n",
" \n",
" # Run DL\n",
" s = time.time()\n",
"\n",
" dl0 = H2ODeepLearningEstimator(hidden=[50,50,50,50], epochs=50)\n",
" \n",
" dl0.train(x =bike_names_x,\n",
" y =\"bikes\",\n",
" training_frame =train,\n",
" validation_frame=test)\n",
" \n",
" dl_elapsed = time.time() - s\n",
" \n",
" # ----------\n",
" # 4- Score on holdout set & report\n",
" train_mse_gbm = gbm0.model_performance(train).mse()\n",
" test_mse_gbm = gbm0.model_performance(test ).mse()\n",
" hold_mse_gbm = gbm0.model_performance(hold ).mse()\n",
"# print \"GBM mse TRAIN=\",train_mse_gbm,\", mse TEST=\",test_mse_gbm,\", mse HOLDOUT=\",hold_mse_gbm\n",
" \n",
" train_mse_drf = drf0.model_performance(train).mse()\n",
" test_mse_drf = drf0.model_performance(test ).mse()\n",
" hold_mse_drf = drf0.model_performance(hold ).mse()\n",
"# print \"DRF mse TRAIN=\",train_mse_drf,\", mse TEST=\",test_mse_drf,\", mse HOLDOUT=\",hold_mse_drf\n",
" \n",
" train_mse_glm = glm0.model_performance(train).mse()\n",
" test_mse_glm = glm0.model_performance(test ).mse()\n",
" hold_mse_glm = glm0.model_performance(hold ).mse()\n",
"# print \"GLM mse TRAIN=\",train_mse_glm,\", mse TEST=\",test_mse_glm,\", mse HOLDOUT=\",hold_mse_glm\n",
" \n",
" train_mse_dl = dl0.model_performance(train).mse()\n",
" test_mse_dl = dl0.model_performance(test ).mse()\n",
" hold_mse_dl = dl0.model_performance(hold ).mse()\n",
"# print \" DL mse TRAIN=\",train_mse_dl,\", mse TEST=\",test_mse_dl,\", mse HOLDOUT=\",hold_mse_dl\n",
" \n",
" # make a pretty HTML table printout of the results\n",
"\n",
" header = [\"Model\", \"mse TRAIN\", \"mse TEST\", \"mse HOLDOUT\", \"Model Training Time (s)\"]\n",
" table = [\n",
" [\"GBM\", train_mse_gbm, test_mse_gbm, hold_mse_gbm, round(gbm_elapsed,3)],\n",
" [\"DRF\", train_mse_drf, test_mse_drf, hold_mse_drf, round(drf_elapsed,3)],\n",
" [\"GLM\", train_mse_glm, test_mse_glm, hold_mse_glm, round(glm_elapsed,3)],\n",
" [\"DL \", train_mse_dl, test_mse_dl, hold_mse_dl , round(dl_elapsed,3) ],\n",
" ]\n",
" h2o.display.H2ODisplay(table,header)\n",
" # --------------"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training data has 5 columns and 6180 rows, test has 2947 rows, holdout has 1004\n",
"gbm Model Build progress: |███████████████████████████████████████████████| 100%\n",
"drf Model Build progress: |███████████████████████████████████████████████| 100%\n",
"glm Model Build progress: |███████████████████████████████████████████████| 100%\n",
"deeplearning Model Build progress: |██████████████████████████████████████| 100%\n"
]
},
{
"data": {
"text/html": [
"Model | \n",
"mse TRAIN | \n",
"mse TEST | \n",
"mse HOLDOUT | \n",
"Model Training Time (s) |
\n",
"GBM | \n",
"0.8948171 | \n",
"386.7584398 | \n",
"428.7237120 | \n",
"7.759 |
\n",
"DRF | \n",
"526.3541524 | \n",
"921.4867812 | \n",
"916.5091361 | \n",
"8.673 |
\n",
"GLM | \n",
"689.6647078 | \n",
"757.4271445 | \n",
"726.9764530 | \n",
"0.522 |
\n",
"DL | \n",
"307.5692122 | \n",
"459.6025357 | \n",
"509.2822086 | \n",
"8.619 |
"
],
"text/plain": [
"Model mse TRAIN mse TEST mse HOLDOUT Model Training Time (s)\n",
"------- ----------- ---------- ------------- -------------------------\n",
"GBM 0.894817 386.758 428.724 7.759\n",
"DRF 526.354 921.487 916.509 8.673\n",
"GLM 689.665 757.427 726.976 0.522\n",
"DL 307.569 459.603 509.282 8.619"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Split the data (into test & train), fit some models and predict on the holdout data\n",
"split_fit_predict(bpd)\n",
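"# The table printed above reports MSE per model, not r^2. The r^2 figures\n",
"# quoted here (0.91 for GBM, 0.71 for GLM) are not shown in that table; read as\n",
"# stated, they mean that given just the station, the month, and the\n",
"# day-of-week we can predict about 90% of the variance of the bike-trip-starts."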
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Parse progress: |█████████████████████████████████████████████████████████| 100%\n",
"Rows:17520\n",
"Cols:50\n",
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
" | Year Local | Month Local | Day Local | Hour Local | Year UTC | Month UTC | Day UTC | Hour UTC | Cavok Reported | Cloud Ceiling (m) | Cloud Cover Fraction | Cloud Cover Fraction 1 | Cloud Cover Fraction 2 | Cloud Cover Fraction 3 | Cloud Cover Fraction 4 | Cloud Cover Fraction 5 | Cloud Cover Fraction 6 | Cloud Height (m) 1 | Cloud Height (m) 2 | Cloud Height (m) 3 | Cloud Height (m) 4 | Cloud Height (m) 5 | Cloud Height (m) 6 | Dew Point (C) | Humidity Fraction | Precipitation One Hour (mm) | Pressure Altimeter (mbar) | Pressure Sea Level (mbar) | Pressure Station (mbar) | Snow Depth (cm) | Temperature (C) | Visibility (km) | Weather Code 1 | Weather Code 1/ Description | Weather Code 2 | Weather Code 2/ Description | Weather Code 3 | Weather Code 3/ Description | Weather Code 4 | Weather Code 4/ Description | Weather Code 5 | Weather Code 5/ Description | Weather Code 6 | Weather Code 6/ Description | Weather Code Most Severe / Icon Code | Weather Code Most Severe | Weather Code Most Severe / Description | Wind Direction (degrees) | Wind Gust (m/s) | Wind Speed (m/s) |
\n",
"\n",
"\n",
"type | int | int | int | int | int | int | int | int | int | real | real | real | real | real | int | int | int | real | real | real | int | int | int | real | real | real | real | int | int | int | real | real | int | enum | int | enum | int | enum | int | enum | int | enum | int | enum | int | int | enum | int | real | real |
\n",
"mins | 2013.0 | 1.0 | 1.0 | 0.0 | 2013.0 | 1.0 | 1.0 | 0.0 | 0.0 | 61.0 | 0.0 | 0.0 | 0.25 | 0.5 | NaN | NaN | NaN | 60.96 | 213.36 | 365.76 | NaN | NaN | NaN | -26.700000000000003 | 0.12510000000000002 | 0.0 | 983.2949000000001 | NaN | NaN | NaN | -15.600000000000001 | 0.001 | 1.0 | | 1.0 | | 1.0 | | 1.0 | | 1.0 | | 3.0 | | 0.0 | 1.0 | | 10.0 | 7.2 | 0.0 |
\n",
"mean | 2013.5 | 6.5260273972602745 | 15.72054794520548 | 11.500000000000004 | 2013.5005707762557 | 6.525114155251141 | 15.72134703196347 | 11.500114155251142 | 0.0 | 1306.3119584569736 | 0.4167424905220181 | 0.3612073490813649 | 0.8724453840732911 | 0.9630456852791879 | 0.0 | 0.0 | 0.0 | 1293.9822681953192 | 1643.7390016566796 | 2084.8938637563456 | 0.0 | 0.0 | 0.0 | 4.313046467655992 | 0.5967363891594567 | 1.3799301075268817 | 1017.8258144055944 | 0.0 | 0.0 | 0.0 | 12.578909070073914 | 14.391442968202009 | 4.84251968503937 | | 3.6586768935762226 | | 2.8466076696165192 | | 2.0114942528735633 | | 4.125 | | 3.0 | | 1.3784817351598173 | 4.84251968503937 | | 194.69525681985743 | 9.422169480726348 | 2.4103288784874057 |
\n",
"maxs | 2014.0 | 12.0 | 31.0 | 23.0 | 2015.0 | 12.0 | 31.0 | 23.0 | 0.0 | 3657.6000000000004 | 1.0 | 1.0 | 1.0 | 1.0 | NaN | NaN | NaN | 3657.5999 | 3657.5999 | 3657.5999 | NaN | NaN | NaN | 24.400000000000002 | 1.0 | 26.924 | 1042.2113 | NaN | NaN | NaN | 36.1 | 16.0934 | 60.0 | | 60.0 | | 36.0 | | 27.0 | | 27.0 | | 3.0 | | 16.0 | 60.0 | | 360.0 | 20.580000000000002 | 10.8 |
\n",
"sigma | 0.500014270017262 | 3.447949723847773 | 8.796498048523272 | 6.922384111875021 | 0.50058441171579 | 3.447824054577647 | 8.795614888684717 | 6.922301652025526 | 0.0 | 995.3398569657211 | 0.4627208309925301 | 0.42770569708047684 | 0.19715569036704708 | 0.08610155981044185 | -0.0 | -0.0 | -0.0 | 962.7430958537232 | 916.7386134899587 | 887.2158475113932 | -0.0 | -0.0 | -0.0 | 10.973128209713666 | 0.18579201186573496 | 2.5621512917896463 | 7.464516971789659 | -0.0 | -0.0 | -0.0 | 10.039673953091574 | 3.6989362303340494 | 5.704865769828319 | | 6.133862539123368 | | 5.805532863642112 | | 3.1234084426128437 | | 6.15223536610881 | | 0.0 | | 4.073860627017756 | 5.704865769828319 | | 106.3500000314393 | 1.8151187111524154 | 1.614697905241178 |
\n",
"zeros | 0 | 0 | 0 | 730 | 0 | 0 | 0 | 730 | 17455 | 0 | 8758 | 8758 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 268 | 0 | 501 | 0 | 0 | 0 | 0 | 269 | 0 | 0 | | 0 | | 0 | | 0 | | 0 | | 0 | | 14980 | 0 | | 0 | 0 | 2768 |
\n",
"missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 65 | 10780 | 375 | 375 | 14682 | 16535 | 17520 | 17520 | 17520 | 9103 | 14683 | 16535 | 17520 | 17520 | 17520 | 67 | 67 | 15660 | 360 | 17520 | 17520 | 17520 | 67 | 412 | 14980 | 14980 | 16477 | 16477 | 17181 | 17181 | 17433 | 17433 | 17504 | 17504 | 17518 | 17518 | 0 | 14980 | 14980 | 9382 | 14381 | 1283 |
\n",
"0 | 2013.0 | 1.0 | 1.0 | 0.0 | 2013.0 | 1.0 | 1.0 | 5.0 | 0.0 | 2895.6000000000004 | 1.0 | 0.9 | 1.0 | nan | nan | nan | nan | 2895.5999 | 3352.8 | nan | nan | nan | nan | -5.0 | 0.5447000000000001 | nan | 1013.0917000000001 | nan | nan | nan | 3.3000000000000003 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | nan | 2.57 |
\n",
"1 | 2013.0 | 1.0 | 1.0 | 1.0 | 2013.0 | 1.0 | 1.0 | 6.0 | 0.0 | 3048.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 3048.0 | nan | nan | nan | nan | nan | -4.4 | 0.5463 | nan | 1012.0759 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | 260.0 | 9.77 | 4.63 |
\n",
"2 | 2013.0 | 1.0 | 1.0 | 2.0 | 2013.0 | 1.0 | 1.0 | 7.0 | 0.0 | 1828.8000000000002 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1828.7999 | nan | nan | nan | nan | nan | -3.3000000000000003 | 0.619 | nan | 1012.4145000000001 | nan | nan | nan | 3.3000000000000003 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | 7.72 | 1.54 |
\n",
"3 | 2013.0 | 1.0 | 1.0 | 3.0 | 2013.0 | 1.0 | 1.0 | 8.0 | 0.0 | 1463.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1463.04 | nan | nan | nan | nan | nan | -2.8000000000000003 | 0.6159 | nan | 1012.4145000000001 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | nan | 3.09 |
\n",
"4 | 2013.0 | 1.0 | 1.0 | 4.0 | 2013.0 | 1.0 | 1.0 | 9.0 | 0.0 | 1402.1000000000001 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1402.0800000000002 | nan | nan | nan | nan | nan | -2.8000000000000003 | 0.6159 | nan | 1012.7531 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | 260.0 | nan | 4.12 |
\n",
"5 | 2013.0 | 1.0 | 1.0 | 5.0 | 2013.0 | 1.0 | 1.0 | 10.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -2.8000000000000003 | 0.6159 | nan | 1012.4145000000001 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | nan | 3.09 |
\n",
"6 | 2013.0 | 1.0 | 1.0 | 6.0 | 2013.0 | 1.0 | 1.0 | 11.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -3.3000000000000003 | 0.5934 | nan | 1012.0759 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | 9.26 | 3.09 |
\n",
"7 | 2013.0 | 1.0 | 1.0 | 7.0 | 2013.0 | 1.0 | 1.0 | 12.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -3.3000000000000003 | 0.5934 | nan | 1012.4145000000001 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | 260.0 | 9.26 | 4.63 |
\n",
"8 | 2013.0 | 1.0 | 1.0 | 8.0 | 2013.0 | 1.0 | 1.0 | 13.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -2.8000000000000003 | 0.6425000000000001 | nan | 1012.4145000000001 | nan | nan | nan | 3.3000000000000003 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | 260.0 | nan | 3.09 |
\n",
"9 | 2013.0 | 1.0 | 1.0 | 9.0 | 2013.0 | 1.0 | 1.0 | 14.0 | 0.0 | 1524.0 | 1.0 | 0.9 | 1.0 | nan | nan | nan | nan | 1524.0 | 3657.5999 | nan | nan | nan | nan | -2.8000000000000003 | 0.6159 | nan | 1012.4145000000001 | nan | nan | nan | 3.9000000000000004 | 16.0934 | nan | | nan | | nan | | nan | | nan | | nan | | 0.0 | nan | | nan | 9.26 | 3.09 |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# ----------\n",
"# 5- Now let's add some weather\n",
"# Load weather data\n",
"wthr1 = h2o.import_file(path=[mylocate(\"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv\"),\n",
" mylocate(\"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv\")])\n",
"# Peek at the data\n",
"wthr1.describe()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rows:17520\n",
"Cols:9\n",
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
" | Year Local | Month Local | Day Local | Hour Local | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 |
\n",
"\n",
"\n",
"type | int | int | int | int | real | real | real | real | enum |
\n",
"mins | 2013.0 | 1.0 | 1.0 | 0.0 | -26.700000000000003 | 0.12510000000000002 | 0.0 | -15.600000000000001 | |
\n",
"mean | 2013.5 | 6.5260273972602745 | 15.72054794520548 | 11.500000000000004 | 4.313046467655992 | 0.5967363891594567 | 1.3799301075268817 | 12.578909070073914 | |
\n",
"maxs | 2014.0 | 12.0 | 31.0 | 23.0 | 24.400000000000002 | 1.0 | 26.924 | 36.1 | |
\n",
"sigma | 0.500014270017262 | 3.447949723847773 | 8.796498048523272 | 6.922384111875021 | 10.973128209713666 | 0.18579201186573496 | 2.5621512917896463 | 10.039673953091574 | |
\n",
"zeros | 0 | 0 | 0 | 730 | 268 | 0 | 501 | 269 | |
\n",
"missing | 0 | 0 | 0 | 0 | 67 | 67 | 15660 | 67 | 14980 |
\n",
"0 | 2013.0 | 1.0 | 1.0 | 0.0 | -5.0 | 0.5447000000000001 | nan | 3.3000000000000003 | |
\n",
"1 | 2013.0 | 1.0 | 1.0 | 1.0 | -4.4 | 0.5463 | nan | 3.9000000000000004 | |
\n",
"2 | 2013.0 | 1.0 | 1.0 | 2.0 | -3.3000000000000003 | 0.619 | nan | 3.3000000000000003 | |
\n",
"3 | 2013.0 | 1.0 | 1.0 | 3.0 | -2.8000000000000003 | 0.6159 | nan | 3.9000000000000004 | |
\n",
"4 | 2013.0 | 1.0 | 1.0 | 4.0 | -2.8000000000000003 | 0.6159 | nan | 3.9000000000000004 | |
\n",
"5 | 2013.0 | 1.0 | 1.0 | 5.0 | -2.8000000000000003 | 0.6159 | nan | 3.9000000000000004 | |
\n",
"6 | 2013.0 | 1.0 | 1.0 | 6.0 | -3.3000000000000003 | 0.5934 | nan | 3.9000000000000004 | |
\n",
"7 | 2013.0 | 1.0 | 1.0 | 7.0 | -3.3000000000000003 | 0.5934 | nan | 3.9000000000000004 | |
\n",
"8 | 2013.0 | 1.0 | 1.0 | 8.0 | -2.8000000000000003 | 0.6425000000000001 | nan | 3.3000000000000003 | |
\n",
"9 | 2013.0 | 1.0 | 1.0 | 9.0 | -2.8000000000000003 | 0.6159 | nan | 3.9000000000000004 | |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Lots of columns in there! Let's plan on converting to time-since-epoch to do\n",
"# a 'join' with the bike data, plus gather weather info that might affect\n",
"# cyclists - rain, snow, temperature. Alas, drop the \"snow\" column since it's\n",
"# all NAs. Also add in dew point and humidity just in case. Slice out just\n",
"# the columns of interest and drop the rest.\n",
"wthr2 = wthr1[[\"Year Local\",\"Month Local\",\"Day Local\",\"Hour Local\",\"Dew Point (C)\",\"Humidity Fraction\",\"Precipitation One Hour (mm)\",\"Temperature (C)\",\"Weather Code 1/ Description\"]]\n",
"\n",
"wthr2.set_name(wthr2.names.index(\"Precipitation One Hour (mm)\"), \"Rain (mm)\")\n",
"wthr2.set_name(wthr2.names.index(\"Weather Code 1/ Description\"), \"WC1\")\n",
"wthr2.describe()\n",
"# Much better! "
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Filter down to the weather at Noon\n",
"wthr3 = wthr2[ wthr2[\"Hour Local\"]==12 ]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rows:730\n",
"Cols:11\n",
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
" | Year Local | Month Local | Day Local | Hour Local | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | msec | Days |
\n",
"\n",
"\n",
"type | int | int | int | int | real | real | real | real | enum | int | int |
\n",
"mins | 2013.0 | 1.0 | 1.0 | 12.0 | -26.700000000000003 | 0.1723 | 0.0 | -13.9 | | 1357070400000.0 | 15706.0 |
\n",
"mean | 2013.5 | 6.526027397260274 | 15.72054794520548 | 12.0 | 4.230123796423659 | 0.539728198074278 | 1.5312571428571429 | 14.068775790921595 | | 1388560852602.7397 | 16070.5 |
\n",
"maxs | 2014.0 | 12.0 | 31.0 | 12.0 | 23.3 | 1.0 | 12.446 | 34.4 | | 1420056000000.0 | 16435.0 |
\n",
"sigma | 0.5003428180039172 | 3.450215293068149 | 8.802278027009615 | 0.0 | 11.106296472475226 | 0.17994502792324327 | 2.3606424861490587 | 10.398985514891212 | | 18219740080.410755 | 210.87713642466474 |
\n",
"zeros | 0 | 0 | 0 | 0 | 14 | 0 | 15 | 7 | | 0 | 0 |
\n",
"missing | 0 | 0 | 0 | 0 | 3 | 3 | 660 | 3 | 620 | 0 | 0 |
\n",
"0 | 2013.0 | 1.0 | 1.0 | 12.0 | -3.3000000000000003 | 0.5934 | nan | 3.9000000000000004 | | 1357070400000.0 | 15706.0 |
\n",
"1 | 2013.0 | 1.0 | 2.0 | 12.0 | -11.700000000000001 | 0.4806 | nan | -2.2 | | 1357156800000.0 | 15707.0 |
\n",
"2 | 2013.0 | 1.0 | 3.0 | 12.0 | -10.600000000000001 | 0.5248 | nan | -2.2 | | 1357243200000.0 | 15708.0 |
\n",
"3 | 2013.0 | 1.0 | 4.0 | 12.0 | -7.2 | 0.49760000000000004 | nan | 2.2 | | 1357329600000.0 | 15709.0 |
\n",
"4 | 2013.0 | 1.0 | 5.0 | 12.0 | -7.2 | 0.42600000000000005 | nan | 4.4 | | 1357416000000.0 | 15710.0 |
\n",
"5 | 2013.0 | 1.0 | 6.0 | 12.0 | -1.7000000000000002 | 0.6451 | nan | 4.4 | haze | 1357502400000.0 | 15711.0 |
\n",
"6 | 2013.0 | 1.0 | 7.0 | 12.0 | -6.1000000000000005 | 0.41190000000000004 | nan | 6.1000000000000005 | | 1357588800000.0 | 15712.0 |
\n",
"7 | 2013.0 | 1.0 | 8.0 | 12.0 | -1.7000000000000002 | 0.5314 | nan | 7.2 | | 1357675200000.0 | 15713.0 |
\n",
"8 | 2013.0 | 1.0 | 9.0 | 12.0 | 0.6000000000000001 | 0.56 | nan | 8.9 | haze | 1357761600000.0 | 15714.0 |
\n",
"9 | 2013.0 | 1.0 | 10.0 | 12.0 | -6.1000000000000005 | 0.3952 | nan | 6.7 | | 1357848000000.0 | 15715.0 |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Lets now get Days since the epoch... we'll convert year/month/day into Epoch\n",
"# time, and then back to Epoch days. Need zero-based month and days, but have\n",
"# 1-based.\n",
"wthr3[\"msec\"] = h2o.H2OFrame.mktime(year=wthr3[\"Year Local\"], month=wthr3[\"Month Local\"]-1, day=wthr3[\"Day Local\"]-1, hour=wthr3[\"Hour Local\"])\n",
"secsPerDay=1000*60*60*24\n",
"wthr3[\"Days\"] = (wthr3[\"msec\"]/secsPerDay).floor()\n",
"wthr3.describe()\n",
"# msec looks sane (numbers like 1.3e12 are in the correct range for msec since\n",
"# 1970). Epoch Days matches closely with the epoch day numbers from the\n",
"# CitiBike dataset. "
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Lets drop off the extra time columns to make a easy-to-handle dataset.\n",
"wthr4 = wthr3.drop(\"Year Local\").drop(\"Month Local\").drop(\"Day Local\").drop(\"Hour Local\").drop(\"msec\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Also, most rain numbers are missing - lets assume those are zero rain days\n",
"rain = wthr4[\"Rain (mm)\"]\n",
"rain[ rain.isna() ] = 0\n",
"wthr4[\"Rain (mm)\"] = rain"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Merge Daily Weather with Bikes-Per-Day\n",
"Rows:10131\n",
"Cols:10\n",
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
" | Days | start station name | bikes | Month | DayOfWeek | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 |
\n",
"\n",
"\n",
"type | int | enum | int | enum | enum | real | real | real | real | enum |
\n",
"mins | 15979.0 | | 1.0 | | | -2.2 | 0.34850000000000003 | 0.0 | 9.4 | |
\n",
"mean | 15993.953311617806 | | 102.42937518507551 | | | 7.60732405488106 | 0.5564958839206396 | 0.008198400947586611 | 16.937094067712962 | |
\n",
"maxs | 16009.0 | | 603.0 | | | 19.400000000000002 | 0.8718 | 0.254 | 26.1 | |
\n",
"sigma | 8.950698111468864 | | 74.05933443246006 | | | 6.516386487040385 | 0.14811201086649933 | 0.04489297266255909 | 4.362687300129602 | |
\n",
"zeros | 0 | | 0 | | | 0 | 0 | 9804 | 0 | |
\n",
"missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8816 |
\n",
"0 | 15979.0 | 1 Ave & E 15 St | 173.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
\n",
"1 | 15979.0 | 1 Ave & E 18 St | 118.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
\n",
"2 | 15979.0 | 1 Ave & E 30 St | 152.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
\n",
"3 | 15979.0 | 10 Ave & W 28 St | 115.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
\n",
"4 | 15979.0 | 11 Ave & W 27 St | 210.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
\n",
"5 | 15979.0 | 11 Ave & W 41 St | 106.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
\n",
"6 | 15979.0 | 12 Ave & W 40 St | 144.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
\n",
"7 | 15979.0 | 2 Ave & E 31 St | 206.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
\n",
"8 | 15979.0 | 2 Ave & E 58 St | 105.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
\n",
"9 | 15979.0 | 3 Ave & Schermerhorn St | 15.0 | 10 | Tue | 10.600000000000001 | 0.4315 | 0.0 | 23.900000000000002 | |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
" Days | start station name | bikes | Month | DayOfWeek | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 |
\n",
"\n",
"\n",
" 15979 | 1 Ave & E 15 St | 173 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
\n",
" 15979 | 1 Ave & E 18 St | 118 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
\n",
" 15979 | 1 Ave & E 30 St | 152 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
\n",
" 15979 | 10 Ave & W 28 St | 115 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
\n",
" 15979 | 11 Ave & W 27 St | 210 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
\n",
" 15979 | 11 Ave & W 41 St | 106 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
\n",
" 15979 | 12 Ave & W 40 St | 144 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
\n",
" 15979 | 2 Ave & E 31 St | 206 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
\n",
" 15979 | 2 Ave & E 58 St | 105 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
\n",
" 15979 | 3 Ave & Schermerhorn St | 15 | 10 | Tue | 10.6 | 0.4315 | 0 | 23.9 | |
\n",
"\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# ----------\n",
"# 6 - Join the weather data-per-day to the bike-starts-per-day\n",
"print(\"Merge Daily Weather with Bikes-Per-Day\")\n",
"bpd_with_weather = bpd.merge(wthr4,all_x=True,all_y=False)\n",
"bpd_with_weather.describe()\n",
"bpd_with_weather.show()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training data has 10 columns and 6066 rows, test has 3044 rows, holdout has 1021\n",
"gbm Model Build progress: |███████████████████████████████████████████████| 100%\n",
"drf Model Build progress: |███████████████████████████████████████████████| 100%\n",
"glm Model Build progress: |███████████████████████████████████████████████| 100%\n",
"deeplearning Model Build progress: |██████████████████████████████████████| 100%\n"
]
},
{
"data": {
"text/html": [
"Model | \n",
"mse TRAIN | \n",
"mse TEST | \n",
"mse HOLDOUT | \n",
"Model Training Time (s) |
\n",
"GBM | \n",
"0.2159977 | \n",
"393.0248269 | \n",
"404.2520310 | \n",
"8.679 |
\n",
"DRF | \n",
"804.2152039 | \n",
"1703.1540562 | \n",
"1782.0854925 | \n",
"6.573 |
\n",
"GLM | \n",
"620.8814844 | \n",
"735.9622856 | \n",
"789.7891737 | \n",
"0.241 |
\n",
"DL | \n",
"213.8582644 | \n",
"454.7871732 | \n",
"476.5995571 | \n",
"7.518 |
"
],
"text/plain": [
"Model mse TRAIN mse TEST mse HOLDOUT Model Training Time (s)\n",
"------- ----------- ---------- ------------- -------------------------\n",
"GBM 0.215998 393.025 404.252 8.679\n",
"DRF 804.215 1703.15 1782.09 6.573\n",
"GLM 620.881 735.962 789.789 0.241\n",
"DL 213.858 454.787 476.6 7.518"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# 7 - Test/Train split again, model build again, this time with weather\n",
"split_fit_predict(bpd_with_weather)"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}