{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import h2o\n", "import time" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
H2O cluster uptime: 1 hours 58 minutes 9 seconds 765 milliseconds
H2O cluster version: 3.1.0.99999
H2O cluster name: spencer
H2O cluster total nodes: 1
H2O cluster total memory: 14.22 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321
" ], "text/plain": [ "-------------------------- ---------------------------------------------\n", "H2O cluster uptime: 1 hours 58 minutes 9 seconds 765 milliseconds\n", "H2O cluster version: 3.1.0.99999\n", "H2O cluster name: spencer\n", "H2O cluster total nodes: 1\n", "H2O cluster total memory: 14.22 GB\n", "H2O cluster total cores: 8\n", "H2O cluster allowed cores: 8\n", "H2O cluster healthy: True\n", "H2O Connection ip: 127.0.0.1\n", "H2O Connection port: 54321\n", "-------------------------- ---------------------------------------------" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Explore a typical Data Science workflow with H2O and Python\n", "#\n", "# Goal: assist the manager of CitiBike of NYC to load-balance the bicycles\n", "# across the CitiBike network of stations, by predicting the number of bike\n", "# trips taken from the station every day. Use 10 million rows of historical\n", "# data, and eventually add weather data.\n", "\n", "\n", "# Connect to a cluster\n", "h2o.init()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Set this to True if you want to fetch the data directly from S3.\n", "# This is useful if your cluster is running in EC2.\n", "data_source_is_s3 = False\n", "\n", "def mylocate(s):\n", " if data_source_is_s3:\n", " return \"s3n://h2o-public-test-data/\" + s\n", " else:\n", " return h2o.locate(s)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Import and Parse bike data\n", "\n", "Parse Progress: [##################################################] 100%\n", "\n", "Parsed 10,407,546 rows and 15 cols:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
File1/Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-07.csv
File2/Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-08.csv
File3/Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-09.csv
File4/Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-10.csv
File5/Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-11.csv
File6/Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-12.csv
File7/Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-01.csv
File8/Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-02.csv
File9/Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-03.csv
File10/Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-04.csv
File11/Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-05.csv
File12/Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-06.csv
File13/Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-07.csv
File14/Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-08.csv
" ], "text/plain": [ "------ ---------------------------------------------------------------------\n", "File1 /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-07.csv\n", "File2 /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-08.csv\n", "File3 /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-09.csv\n", "File4 /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-10.csv\n", "File5 /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-11.csv\n", "File6 /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-12.csv\n", "File7 /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-01.csv\n", "File8 /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-02.csv\n", "File9 /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-03.csv\n", "File10 /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-04.csv\n", "File11 /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-05.csv\n", "File12 /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-06.csv\n", "File13 /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-07.csv\n", "File14 /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-08.csv\n", "------ ---------------------------------------------------------------------" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Pick either the big or the small demo.\n", "# Big data is 10M rows\n", "small_test = [mylocate(\"bigdata/laptop/citibike-nyc/2013-10.csv\")]\n", "big_test = [mylocate(\"bigdata/laptop/citibike-nyc/2013-07.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2013-08.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2013-09.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2013-10.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2013-11.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2013-12.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-01.csv\"),\n", " 
mylocate(\"bigdata/laptop/citibike-nyc/2014-02.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-03.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-04.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-05.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-06.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-07.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/2014-08.csv\")]\n", "\n", "# ----------\n", "\n", "# 1- Load data - 1 row per bicycle trip. Has columns showing the start and end\n", "# station, trip duration and trip start time and day. The larger dataset\n", "# totals about 10 million rows\n", "print \"Import and Parse bike data\"\n", "data = h2o.import_frame(path=big_test)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows: 10,407,546 Cols: 16\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers1171.5298117 9.1 KB0.0015500536
C11-Byte Integers4786.25 10.0 MB1.7289143
C1N1-Byte Integers (w/o NAs)4786.25 10.0 MB1.7289143
C1S1-Byte Fractions83910.970188 17.5 MB3.042758
C22-Byte Integers261634.20502 108.8 MB18.8909
C2S2-Byte Fractions3144.1056485 12.9 MB2.2460942
C44-Byte Integers2142.7981172 17.9 MB3.1005228
C4S4-Byte Fractions3895.086297 32.4 MB5.625424
C864-bit Integers6808.891213 113.5 MB19.704786
C8D64-bit Reals152319.913704 253.0 MB43.930134
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ -------- -----------------\n", "C0L Constant Integers 117 1.52981 9.1 KB 0.00155005\n", "C1 1-Byte Integers 478 6.25 10.0 MB 1.72891\n", "C1N 1-Byte Integers (w/o NAs) 478 6.25 10.0 MB 1.72891\n", "C1S 1-Byte Fractions 839 10.9702 17.5 MB 3.04276\n", "C2 2-Byte Integers 2616 34.205 108.8 MB 18.8909\n", "C2S 2-Byte Fractions 314 4.10565 12.9 MB 2.24609\n", "C4 4-Byte Integers 214 2.79812 17.9 MB 3.10052\n", "C4S 4-Byte Fractions 389 5.0863 32.4 MB 5.62542\n", "C8 64-bit Integers 680 8.89121 113.5 MB 19.7048\n", "C8D 64-bit Reals 1523 19.9137 253.0 MB 43.9301" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.37:54321 575.9 MB10407546.0478.07648.0
mean 575.9 MB10407546.0478.07648.0
min 575.9 MB10407546.0478.07648.0
max 575.9 MB10407546.0478.07648.0
stddev 0 B0.00.00.0
total 575.9 MB10407546.0478.07648.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- -------- ---------------- ----------------------------- ------------------\n", "172.16.2.37:54321 575.9 MB 1.04075e+07 478 7648\n", "mean 575.9 MB 1.04075e+07 478 7648\n", "min 575.9 MB 1.04075e+07 478 7648\n", "max 575.9 MB 1.04075e+07 478 7648\n", "stddev 0 B 0 0 0\n", "total 575.9 MB 1.04075e+07 478 7648" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
tripdurationstarttimestoptimestart station idstart station namestart station latitudestart station longitudeend station idend station nameend station latitudeend station longitudebikeidusertypebirth yeargenderDays
typeinttimetimeintenumrealrealintenumrealrealintenumintintint
mins60.01.372662e+121.372662242e+1272.00.040.680342423-74.0171344572.00.040.680342423-74.0171344514529.00.01899.00.015887.0
maxs6250750.01.409554787e+121.409563605e+123002.0339.040.771522-73.95004797593002.0339.040.771522-73.950047975921689.01.01998.02.016314.0
sigma2985.1054053211806578171.711806555707.8355.755989765103.2103042270.01971005087360.0123453320185360.070380844103.2050912060.01973095786330.01243118615981938.805178840.32480738750611.1327849050.563019777794136.647269305
zero_count000056836000551670001247534012485170
missing_count0000000000000124764400
" ], "text/plain": [ " tripduration starttime stoptime start station id start station name start station latitude start station longitude end station id end station name end station latitude end station longitude bikeid usertype birth year gender Days\n", "------------- -------------- --------------- --------------- ------------------ -------------------- ------------------------ ------------------------- ---------------- ------------------ ---------------------- ----------------------- ------------- -------------- ------------ -------------- -------------\n", "type int time time int enum real real int enum real real int enum int int int\n", "mins 60.0 1.372662e+12 1.372662242e+12 72.0 0.0 40.680342423 -74.01713445 72.0 0.0 40.680342423 -74.01713445 14529.0 0.0 1899.0 0.0 15887.0\n", "maxs 6250750.0 1.409554787e+12 1.409563605e+12 3002.0 339.0 40.771522 -73.9500479759 3002.0 339.0 40.771522 -73.9500479759 21689.0 1.0 1998.0 2.0 16314.0\n", "sigma 2985.10540532 11806578171.7 11806555707.8 355.755989765 103.210304227 0.0197100508736 0.0123453320185 360.070380844 103.205091206 0.0197309578633 0.0124311861598 1938.80517884 0.324807387506 11.132784905 0.563019777794 136.647269305\n", "zero_count 0 0 0 0 56836 0 0 0 55167 0 0 0 1247534 0 1248517 0\n", "missing_count 0 0 0 0 0 0 0 0 0 0 0 0 0 1247644 0 0" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# ----------\n", "\n", "# 2- light data munging: group the bike starts per-day, converting the 10M rows\n", "# of trips to about 140,000 station&day combos - predicting the number of trip\n", "# starts per-station-per-day.\n", "\n", "# Convert start time to: Day since the Epoch\n", "startime = data[\"starttime\"]\n", "secsPerDay=1000*60*60*24\n", "data[\"Days\"] = (startime/secsPerDay).floor()\n", "data.describe()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "First 10 rows and first 3 columns: 
\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Daysstart station namebikes
16234Concord St & Bridge St15
16106Cumberland St & Lafayette Ave6
15978DeKalb Ave & Hudson Ave36
16088Allen St & Hester St55
15945Allen St & Rivington St140
16251Clinton St & Grand St79
16123Clinton St & Joralemon St6
15995Clinton St & Tillary St22
16313Greenwich St & N Moore St74
16185Hancock St & Bedford Ave14
" ], "text/plain": [ " Days start station name bikes\n", "------ ----------------------------- -------\n", " 16234 Concord St & Bridge St 15\n", " 16106 Cumberland St & Lafayette Ave 6\n", " 15978 DeKalb Ave & Hudson Ave 36\n", " 16088 Allen St & Hester St 55\n", " 15945 Allen St & Rivington St 140\n", " 16251 Clinton St & Grand St 79\n", " 16123 Clinton St & Joralemon St 6\n", " 15995 Clinton St & Tillary St 22\n", " 16313 Greenwich St & N Moore St 74\n", " 16185 Hancock St & Bedford Ave 14" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Rows: 139,261 Cols: 3\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C22-Byte Integers96100.0 822.4 KB100.0
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ --------------- ------- ------------------ -------- -----------------\n", "C2 2-Byte Integers 96 100 822.4 KB 100" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.37:54321 822.4 KB139261.032.096.0
mean 822.4 KB139261.032.096.0
min 822.4 KB139261.032.096.0
max 822.4 KB139261.032.096.0
stddev 0 B0.00.00.0
total 822.4 KB139261.032.096.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- -------- ---------------- ----------------------------- ------------------\n", "172.16.2.37:54321 822.4 KB 139261 32 96\n", "mean 822.4 KB 139261 32 96\n", "min 822.4 KB 139261 32 96\n", "max 822.4 KB 139261 32 96\n", "stddev 0 B 0 0 0\n", "total 822.4 KB 139261 32 96" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Daysstart station namebikes
typeintenumint
mins15887.00.01.0
maxs16314.0339.0680.0
sigma123.63513389798.5029573264.1243887565
zero_count04280
missing_count000
" ], "text/plain": [ " Days start station name bikes\n", "------------- ------------- -------------------- -------------\n", "type int enum int\n", "mins 15887.0 0.0 1.0\n", "maxs 16314.0 339.0 680.0\n", "sigma 123.635133897 98.50295732 64.1243887565\n", "zero_count 0 428 0\n", "missing_count 0 0 0" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "[139261, 3]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Now do a monster Group-By. Count bike starts per-station per-day. Ends up\n", "# with about 340 stations times 400 days (140,000 rows). This is what we want\n", "# to predict.\n", "group_by_cols = [\"Days\",\"start station name\"]\n", "aggregates = {\"bikes\": [\"count\", 0, \"all\"]}\n", "bpd = data.group_by(cols=group_by_cols, aggregates=aggregates) # Compute bikes-per-day\n", "bpd.show()\n", "bpd.describe()\n", "bpd.dim()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Quantiles of bikes-per-day\n", "First 9 rows and first 2 columns: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
ProbsbikesQuantiles
0.012
0.111
0.2526
0.33335
0.558
0.66789
0.75107
0.9157
0.99291
" ], "text/plain": [ " Probs bikesQuantiles\n", "------- ----------------\n", " 0.01 2\n", " 0.1 11\n", " 0.25 26\n", " 0.333 35\n", " 0.5 58\n", " 0.667 89\n", " 0.75 107\n", " 0.9 157\n", " 0.99 291" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Quantiles: the data is fairly unbalanced; some station/day combos are wildly\n", "# more popular than others.\n", "print \"Quantiles of bikes-per-day\"\n", "bpd[\"bikes\"].quantile().show()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Bikes-Per-Day\n", "Rows: 139,261 Cols: 5\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C1N1-Byte Integers (w/o NAs)6440.0 276.2 KB25.145071
C22-Byte Integers9660.000004 822.4 KB74.85493
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ -------- -----------------\n", "C1N 1-Byte Integers (w/o NAs) 64 40 276.2 KB 25.1451\n", "C2 2-Byte Integers 96 60 822.4 KB 74.8549" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.37:54321 1.1 MB139261.032.0160.0
mean 1.1 MB139261.032.0160.0
min 1.1 MB139261.032.0160.0
max 1.1 MB139261.032.0160.0
stddev 0 B0.00.00.0
total 1.1 MB139261.032.0160.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------ ---------------- ----------------------------- ------------------\n", "172.16.2.37:54321 1.1 MB 139261 32 160\n", "mean 1.1 MB 139261 32 160\n", "min 1.1 MB 139261 32 160\n", "max 1.1 MB 139261 32 160\n", "stddev 0 B 0 0 0\n", "total 1.1 MB 139261 32 160" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Daysstart station namebikesMonthDayOfWeek
typeintenumintenumenum
mins15887.00.01.00.00.0
maxs16314.0339.0680.011.06.0
sigma123.63513389798.5029573264.12438875653.203731002162.00302100015
zero_count04280994919880
missing_count00000
" ], "text/plain": [ " Days start station name bikes Month DayOfWeek\n", "------------- ------------- -------------------- ------------- ------------- -------------\n", "type int enum int enum enum\n", "mins 15887.0 0.0 1.0 0.0 0.0\n", "maxs 16314.0 339.0 680.0 11.0 6.0\n", "sigma 123.635133897 98.50295732 64.1243887565 3.20373100216 2.00302100015\n", "zero_count 0 428 0 9949 19880\n", "missing_count 0 0 0 0 0" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# A little feature engineering\n", "# Add in month-of-year (seasonality; fewer bike rides in winter than summer)\n", "secs = bpd[\"Days\"]*secsPerDay\n", "bpd[\"Month\"] = secs.month().asfactor()\n", "# Add in day-of-week (work-week; more bike rides on Sunday than Monday)\n", "bpd[\"DayOfWeek\"] = secs.dayOfWeek()\n", "print \"Bikes-Per-Day\"\n", "bpd.describe()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# ----------\n", "# 3- Fit a model on train; using test as validation\n", "\n", "# Function for doing classic test/train/holdout split\n", "def split_fit_predict(data):\n", " global gbm0,drf0,glm0,dl0\n", " # Classic Test/Train split\n", " r = data['Days'].runif() # Random UNIForm numbers, one per row\n", " train = data[ r < 0.6]\n", " test = data[(0.6 <= r) & (r < 0.9)]\n", " hold = data[ 0.9 <= r ]\n", " print \"Training data has\",train.ncol(),\"columns and\",train.nrow(),\"rows, test has\",test.nrow(),\"rows, holdout has\",hold.nrow()\n", " \n", " # Run GBM\n", " s = time.time()\n", " gbm0 = h2o.gbm(x =train.drop(\"bikes\"),\n", " y =train [\"bikes\"],\n", " validation_x=test .drop(\"bikes\"),\n", " validation_y=test [\"bikes\"],\n", " ntrees=500, # 500 works well\n", " max_depth=6,\n", " learn_rate=0.1)\n", " gbm_elapsed = time.time() - s\n", "\n", " # Run DRF\n", " s = time.time()\n", " drf0 = h2o.random_forest(x =train.drop(\"bikes\"),\n", " y =train [\"bikes\"],\n", " validation_x=test .drop(\"bikes\"),\n", "
validation_y=test [\"bikes\"],\n", " ntrees=250,\n", " max_depth=30)\n", " drf_elapsed = time.time() - s \n", " \n", " \n", " # Run GLM\n", " s = time.time()\n", " glm0 = h2o.glm(x =train.drop(\"bikes\"),\n", " y =train [\"bikes\"],\n", " validation_x=test .drop(\"bikes\"),\n", " validation_y=test [\"bikes\"],\n", " Lambda=[1e-5],\n", " family=\"poisson\")\n", " glm_elapsed = time.time() - s\n", " \n", " # Run DL\n", " s = time.time()\n", " dl0 = h2o.deeplearning(x =train.drop(\"bikes\"),\n", " y =train [\"bikes\"],\n", " validation_x=test .drop(\"bikes\"),\n", " validation_y=test [\"bikes\"],\n", " hidden=[50,50,50,50],\n", " epochs=50)\n", " dl_elapsed = time.time() - s\n", " \n", " # ----------\n", " # 4- Score on holdout set & report\n", " train_r2_gbm = gbm0.model_performance(train).r2()\n", " test_r2_gbm = gbm0.model_performance(test ).r2()\n", " hold_r2_gbm = gbm0.model_performance(hold ).r2()\n", "# print \"GBM R2 TRAIN=\",train_r2_gbm,\", R2 TEST=\",test_r2_gbm,\", R2 HOLDOUT=\",hold_r2_gbm\n", " \n", " train_r2_drf = drf0.model_performance(train).r2()\n", " test_r2_drf = drf0.model_performance(test ).r2()\n", " hold_r2_drf = drf0.model_performance(hold ).r2()\n", "# print \"DRF R2 TRAIN=\",train_r2_drf,\", R2 TEST=\",test_r2_drf,\", R2 HOLDOUT=\",hold_r2_drf\n", " \n", " train_r2_glm = glm0.model_performance(train).r2()\n", " test_r2_glm = glm0.model_performance(test ).r2()\n", " hold_r2_glm = glm0.model_performance(hold ).r2()\n", "# print \"GLM R2 TRAIN=\",train_r2_glm,\", R2 TEST=\",test_r2_glm,\", R2 HOLDOUT=\",hold_r2_glm\n", " \n", " train_r2_dl = dl0.model_performance(train).r2()\n", " test_r2_dl = dl0.model_performance(test ).r2()\n", " hold_r2_dl = dl0.model_performance(hold ).r2()\n", "# print \" DL R2 TRAIN=\",train_r2_dl,\", R2 TEST=\",test_r2_dl,\", R2 HOLDOUT=\",hold_r2_dl\n", " \n", " # make a pretty HTML table printout of the results\n", "\n", " header = [\"Model\", \"R2 TRAIN\", \"R2 TEST\", \"R2 HOLDOUT\", \"Model Training Time 
(s)\"]\n", " table = [\n", " [\"GBM\", train_r2_gbm, test_r2_gbm, hold_r2_gbm, round(gbm_elapsed,3)],\n", " [\"DRF\", train_r2_drf, test_r2_drf, hold_r2_drf, round(drf_elapsed,3)],\n", " [\"GLM\", train_r2_glm, test_r2_glm, hold_r2_glm, round(glm_elapsed,3)],\n", " [\"DL \", train_r2_dl, test_r2_dl, hold_r2_dl , round(dl_elapsed,3) ],\n", " ]\n", " h2o.H2ODisplay(table,header)\n", " # --------------" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data has 5 columns and 83754 rows, test has 41608 rows, holdout has 13899\n", "\n", "gbm Model Build Progress: [##################################################] 100%\n", "\n", "drf Model Build Progress: [##################################################] 100%\n", "\n", "glm Model Build Progress: [##################################################] 100%\n", "\n", "deeplearning Model Build Progress: [##################################################] 100%\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
ModelR2 TRAINR2 TESTR2 HOLDOUTModel Training Time (s)
GBM0.9675963065470.9211635523130.92451128396321.313
DRF0.8494496925690.8195921855290.82433135911727.972
GLM0.7840899763970.7841836894670.7887819382950.446
DL 0.903679866310.880163582770.88213960494854.633
" ], "text/plain": [ "Model R2 TRAIN R2 TEST R2 HOLDOUT Model Training Time (s)\n", "------- ---------- --------- ------------ -------------------------\n", "GBM 0.967596 0.921164 0.924511 21.313\n", "DRF 0.84945 0.819592 0.824331 27.972\n", "GLM 0.78409 0.784184 0.788782 0.446\n", "DL 0.90368 0.880164 0.88214 54.633" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Split the data (into test & train), fit some models and predict on the holdout data\n", "split_fit_predict(bpd)\n", "# Here we see a holdout r^2 of about 0.92 for GBM, and 0.79 for GLM (the split is\n", "# random, so exact values vary from run to run). This means given just the day,\n", "# the station, the month, and the day-of-week we can predict roughly 92% of the\n", "# variance of the bike-trip-starts." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Parse Progress: [##################################################] 100%\n", "\n", "Parsed 17,520 rows and 50 cols:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "
File1/Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv
File2/Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv
" ], "text/plain": [ "----- ----------------------------------------------------------------------------------------------\n", "File1 /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv\n", "File2 /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv\n", "----- ----------------------------------------------------------------------------------------------" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Rows: 17,520 Cols: 50\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers1076.294118 8.4 KB0.7889721
C0DConstant Reals43625.647058 34.1 KB3.2148771
CXISparse Integers171.0 1.5 KB0.13991351
C11-Byte Integers34620.352942 197.4 KB18.634672
C1N1-Byte Integers (w/o NAs)21412.588236 122.3 KB11.544063
C1S1-Byte Fractions21412.588236 125.3 KB11.822968
C2S2-Byte Fractions19611.529412 214.5 KB20.242111
C4S4-Byte Fractions17010.0 356.1 KB33.612423
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ -------- -----------------\n", "C0L Constant Integers 107 6.29412 8.4 KB 0.788972\n", "C0D Constant Reals 436 25.6471 34.1 KB 3.21488\n", "CXI Sparse Integers 17 1 1.5 KB 0.139914\n", "C1 1-Byte Integers 346 20.3529 197.4 KB 18.6347\n", "C1N 1-Byte Integers (w/o NAs) 214 12.5882 122.3 KB 11.5441\n", "C1S 1-Byte Fractions 214 12.5882 125.3 KB 11.823\n", "C2S 2-Byte Fractions 196 11.5294 214.5 KB 20.2421\n", "C4S 4-Byte Fractions 170 10 356.1 KB 33.6124" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.37:54321 1.0 MB17520.034.01700.0
mean 1.0 MB17520.034.01700.0
min 1.0 MB17520.034.01700.0
max 1.0 MB17520.034.01700.0
stddev 0 B0.00.00.0
total 1.0 MB17520.034.01700.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------ ---------------- ----------------------------- ------------------\n", "172.16.2.37:54321 1.0 MB 17520 34 1700\n", "mean 1.0 MB 17520 34 1700\n", "min 1.0 MB 17520 34 1700\n", "max 1.0 MB 17520 34 1700\n", "stddev 0 B 0 0 0\n", "total 1.0 MB 17520 34 1700" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", 
"\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Year LocalMonth LocalDay LocalHour LocalYear UTCMonth UTCDay UTCHour UTCCavok ReportedCloud Ceiling (m)Cloud Cover FractionCloud Cover Fraction 1Cloud Cover Fraction 2Cloud Cover Fraction 3Cloud Cover Fraction 4Cloud Cover Fraction 5Cloud Cover Fraction 6Cloud Height (m) 1Cloud Height (m) 2Cloud Height (m) 3Cloud Height (m) 4Cloud Height (m) 5Cloud Height (m) 6Dew Point (C)Humidity FractionPrecipitation One Hour (mm)Pressure Altimeter (mbar)Pressure Sea Level (mbar)Pressure Station (mbar)Snow Depth (cm)Temperature (C)Visibility (km)Weather Code 1Weather Code 1/ DescriptionWeather Code 2Weather Code 2/ DescriptionWeather Code 3Weather Code 3/ DescriptionWeather Code 4Weather Code 4/ DescriptionWeather Code 5Weather Code 5/ DescriptionWeather Code 6Weather Code 6/ DescriptionWeather Code Most Severe / Icon CodeWeather Code Most SevereWeather Code Most Severe / DescriptionWind Direction (degrees)Wind Gust (m/s)Wind Speed (m/s)
typeintintintintintintintintintrealrealrealrealrealintintintrealrealrealintintintrealrealrealrealintintintrealrealintenumintenumintenumintenumintenumintenumintintenumintrealreal
mins2013.01.01.00.02013.01.01.00.00.061.00.00.00.250.5NaNNaNNaN60.96213.36365.76NaNNaNNaN-26.70.12510.0983.2949NaNNaNNaN-15.60.0011.00.01.00.01.00.01.00.01.00.03.00.00.01.00.010.07.20.0
maxs2014.012.031.023.02015.012.031.023.00.03657.61.01.01.01.0NaNNaNNaN3657.59993657.59993657.5999NaNNaNNaN24.41.026.9241042.2113NaNNaNNaN36.116.093460.011.060.010.036.07.027.04.027.02.03.00.016.060.011.0360.020.5810.8
sigma0.5000142700173.447949723858.796498048526.922384111880.5005844117163.447824054588.795614888686.922301652030.0995.3398569660.4627208309930.427705697080.1971556903670.0861015598104NaNNaNNaN962.743095854916.73861349887.215847511NaNNaNNaN10.97312820970.1857920118662.562151291797.46451697179NaNNaNNaN10.03967395313.698936230335.704865769832.478147086636.133862539121.839762353355.805532863641.289675536983.123408442611.2821643696.152235366110.602079728940.00.04.073860627025.704865769832.47814708663106.3500000311.815118711151.61469790524
zero_count00073000073017455087588758000000000002680501000026900170300130200120214980017002768
missing_count00000000651078037537514682165351752017520175209103146831653517520175201752067671566036017520175201752067412149801498016477164771718117181174331743317504175041751817518014980149809382143811283
" ], "text/plain": [ " Year Local Month Local Day Local Hour Local Year UTC Month UTC Day UTC Hour UTC Cavok Reported Cloud Ceiling (m) Cloud Cover Fraction Cloud Cover Fraction 1 Cloud Cover Fraction 2 Cloud Cover Fraction 3 Cloud Cover Fraction 4 Cloud Cover Fraction 5 Cloud Cover Fraction 6 Cloud Height (m) 1 Cloud Height (m) 2 Cloud Height (m) 3 Cloud Height (m) 4 Cloud Height (m) 5 Cloud Height (m) 6 Dew Point (C) Humidity Fraction Precipitation One Hour (mm) Pressure Altimeter (mbar) Pressure Sea Level (mbar) Pressure Station (mbar) Snow Depth (cm) Temperature (C) Visibility (km) Weather Code 1 Weather Code 1/ Description Weather Code 2 Weather Code 2/ Description Weather Code 3 Weather Code 3/ Description Weather Code 4 Weather Code 4/ Description Weather Code 5 Weather Code 5/ Description Weather Code 6 Weather Code 6/ Description Weather Code Most Severe / Icon Code Weather Code Most Severe Weather Code Most Severe / Description Wind Direction (degrees) Wind Gust (m/s) Wind Speed (m/s)\n", "------------- -------------- ------------- ------------- ------------- -------------- ------------- ------------- ------------- ---------------- ------------------- ---------------------- ------------------------ ------------------------ ------------------------ ------------------------ ------------------------ ------------------------ -------------------- -------------------- -------------------- -------------------- -------------------- -------------------- --------------- ------------------- ----------------------------- --------------------------- --------------------------- ------------------------- ----------------- ----------------- ----------------- ---------------- ----------------------------- ---------------- ----------------------------- ---------------- ----------------------------- ---------------- ----------------------------- ---------------- ----------------------------- ---------------- ----------------------------- 
-------------------------------------- -------------------------- ---------------------------------------- -------------------------- ----------------- ------------------\n", "type int int int int int int int int int real real real real real int int int real real real int int int real real real real int int int real real int enum int enum int enum int enum int enum int enum int int enum int real real\n", "mins 2013.0 1.0 1.0 0.0 2013.0 1.0 1.0 0.0 0.0 61.0 0.0 0.0 0.25 0.5 NaN NaN NaN 60.96 213.36 365.76 NaN NaN NaN -26.7 0.1251 0.0 983.2949 NaN NaN NaN -15.6 0.001 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 3.0 0.0 0.0 1.0 0.0 10.0 7.2 0.0\n", "maxs 2014.0 12.0 31.0 23.0 2015.0 12.0 31.0 23.0 0.0 3657.6 1.0 1.0 1.0 1.0 NaN NaN NaN 3657.5999 3657.5999 3657.5999 NaN NaN NaN 24.4 1.0 26.924 1042.2113 NaN NaN NaN 36.1 16.0934 60.0 11.0 60.0 10.0 36.0 7.0 27.0 4.0 27.0 2.0 3.0 0.0 16.0 60.0 11.0 360.0 20.58 10.8\n", "sigma 0.500014270017 3.44794972385 8.79649804852 6.92238411188 0.500584411716 3.44782405458 8.79561488868 6.92230165203 0.0 995.339856966 0.462720830993 0.42770569708 0.197155690367 0.0861015598104 NaN NaN NaN 962.743095854 916.73861349 887.215847511 NaN NaN NaN 10.9731282097 0.185792011866 2.56215129179 7.46451697179 NaN NaN NaN 10.0396739531 3.69893623033 5.70486576983 2.47814708663 6.13386253912 1.83976235335 5.80553286364 1.28967553698 3.12340844261 1.282164369 6.15223536611 0.60207972894 0.0 0.0 4.07386062702 5.70486576983 2.47814708663 106.350000031 1.81511871115 1.61469790524\n", "zero_count 0 0 0 730 0 0 0 730 17455 0 8758 8758 0 0 0 0 0 0 0 0 0 0 0 268 0 501 0 0 0 0 269 0 0 17 0 30 0 13 0 20 0 12 0 2 14980 0 17 0 0 2768\n", "missing_count 0 0 0 0 0 0 0 0 65 10780 375 375 14682 16535 17520 17520 17520 9103 14683 16535 17520 17520 17520 67 67 15660 360 17520 17520 17520 67 412 14980 14980 16477 16477 17181 17181 17433 17433 17504 17504 17518 17518 0 14980 14980 9382 14381 1283" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# 
----------\n", "# 5- Now lets add some weather\n", "# Load weather data\n", "wthr1 = h2o.import_frame(path=[mylocate(\"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv\"),\n", " mylocate(\"bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv\")])\n", "# Peek at the data\n", "wthr1.describe()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows: 17,520 Cols: 9\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers4615.0326805 3.6 KB1.780005
C11-Byte Integers3411.111112 19.4 KB9.592678
C1N1-Byte Integers (w/o NAs)9029.411766 51.5 KB25.494701
C1S1-Byte Fractions4213.725491 24.0 KB11.894592
C2S2-Byte Fractions9430.718956 103.4 KB51.238026
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ -------- -----------------\n", "C0L Constant Integers 46 15.0327 3.6 KB 1.78001\n", "C1 1-Byte Integers 34 11.1111 19.4 KB 9.59268\n", "C1N 1-Byte Integers (w/o NAs) 90 29.4118 51.5 KB 25.4947\n", "C1S 1-Byte Fractions 42 13.7255 24.0 KB 11.8946\n", "C2S 2-Byte Fractions 94 30.719 103.4 KB 51.238" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.37:54321 201.9 KB17520.034.0306.0
mean 201.9 KB17520.034.0306.0
min 201.9 KB17520.034.0306.0
max 201.9 KB17520.034.0306.0
stddev 0 B0.00.00.0
total 201.9 KB17520.034.0306.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- -------- ---------------- ----------------------------- ------------------\n", "172.16.2.37:54321 201.9 KB 17520 34 306\n", "mean 201.9 KB 17520 34 306\n", "min 201.9 KB 17520 34 306\n", "max 201.9 KB 17520 34 306\n", "stddev 0 B 0 0 0\n", "total 201.9 KB 17520 34 306" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Year LocalMonth LocalDay LocalHour LocalDew Point (C)Humidity FractionRain (mm)Temperature (C)WC1
typeintintintintrealrealrealrealenum
mins2013.01.01.00.0-26.70.12510.0-15.60.0
maxs2014.012.031.023.024.41.026.92436.111.0
sigma0.5000142700173.447949723858.796498048526.9223841118810.97312820970.1857920118662.5621512917910.03967395312.47814708663
zero_count000730268050126917
missing_count00006767156606714980
" ], "text/plain": [ " Year Local Month Local Day Local Hour Local Dew Point (C) Humidity Fraction Rain (mm) Temperature (C) WC1\n", "------------- -------------- ------------- ------------- ------------- --------------- ------------------- ------------- ----------------- -------------\n", "type int int int int real real real real enum\n", "mins 2013.0 1.0 1.0 0.0 -26.7 0.1251 0.0 -15.6 0.0\n", "maxs 2014.0 12.0 31.0 23.0 24.4 1.0 26.924 36.1 11.0\n", "sigma 0.500014270017 3.44794972385 8.79649804852 6.92238411188 10.9731282097 0.185792011866 2.56215129179 10.0396739531 2.47814708663\n", "zero_count 0 0 0 730 268 0 501 269 17\n", "missing_count 0 0 0 0 67 67 15660 67 14980" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Lots of columns in there! Lets plan on converting to time-since-epoch to do\n", "# a 'join' with the bike data, plus gather weather info that might affect\n", "# cyclists - rain, snow, temperature. Alas, drop the \"snow\" column since it's\n", "# all NA's. Also add in dew point and humidity just in case. Slice out just\n", "# the columns of interest and drop the rest.\n", "wthr2 = wthr1[[\"Year Local\",\"Month Local\",\"Day Local\",\"Hour Local\",\"Dew Point (C)\",\"Humidity Fraction\",\"Precipitation One Hour (mm)\",\"Temperature (C)\",\"Weather Code 1/ Description\"]]\n", "\n", "wthr2.setName(wthr2.index(\"Precipitation One Hour (mm)\"), \"Rain (mm)\")\n", "wthr2.setName(wthr2.index(\"Weather Code 1/ Description\"), \"WC1\")\n", "wthr2.describe()\n", "# Much better! " ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Filter down to the weather at Noon\n", "wthr3 = wthr2[ wthr2[\"Hour Local\"]==12 ]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows: 730 Cols: 11\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers8021.390373 6.3 KB11.955688
C0DConstant Reals133.4759357 1.0 KB1.9427994
C11-Byte Integers308.021391 2.6 KB5.0176535
C1N1-Byte Integers (w/o NAs)5614.973262 4.9 KB9.375875
C1S1-Byte Fractions349.090909 3.5 KB6.698922
C2S2-Byte Fractions349.090909 4.2 KB8.062618
C8D64-bit Reals12733.95722 29.8 KB56.946445
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ ------- -----------------\n", "C0L Constant Integers 80 21.3904 6.3 KB 11.9557\n", "C0D Constant Reals 13 3.47594 1.0 KB 1.9428\n", "C1 1-Byte Integers 30 8.02139 2.6 KB 5.01765\n", "C1N 1-Byte Integers (w/o NAs) 56 14.9733 4.9 KB 9.37588\n", "C1S 1-Byte Fractions 34 9.09091 3.5 KB 6.69892\n", "C2S 2-Byte Fractions 34 9.09091 4.2 KB 8.06262\n", "C8D 64-bit Reals 127 33.9572 29.8 KB 56.9464" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.37:54321 52.3 KB730.034.0374.0
mean 52.3 KB730.034.0374.0
min 52.3 KB730.034.0374.0
max 52.3 KB730.034.0374.0
stddev 0 B0.00.00.0
total 52.3 KB730.034.0374.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------- ---------------- ----------------------------- ------------------\n", "172.16.2.37:54321 52.3 KB 730 34 374\n", "mean 52.3 KB 730 34 374\n", "min 52.3 KB 730 34 374\n", "max 52.3 KB 730 34 374\n", "stddev 0 B 0 0 0\n", "total 52.3 KB 730 34 374" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Year LocalMonth LocalDay LocalHour LocalDew Point (C)Humidity FractionRain (mm)Temperature (C)WC1msecDays
typeintintintintrealrealrealrealenumintint
mins2013.01.01.012.0-26.70.17230.0-13.90.01.3570704e+1215706.0
maxs2014.012.031.012.023.31.012.44634.410.01.420056e+1216435.0
sigma0.5003428180043.450215293078.802278027010.011.10629647250.1799450279232.3606424861510.39898551492.7467472612318219740080.4210.877136425
zero_count0000140157100
missing_count000033660362000
" ], "text/plain": [ " Year Local Month Local Day Local Hour Local Dew Point (C) Humidity Fraction Rain (mm) Temperature (C) WC1 msec Days\n", "------------- -------------- ------------- ------------- ------------ --------------- ------------------- ------------- ----------------- ------------- ------------- -------------\n", "type int int int int real real real real enum int int\n", "mins 2013.0 1.0 1.0 12.0 -26.7 0.1723 0.0 -13.9 0.0 1.3570704e+12 15706.0\n", "maxs 2014.0 12.0 31.0 12.0 23.3 1.0 12.446 34.4 10.0 1.420056e+12 16435.0\n", "sigma 0.500342818004 3.45021529307 8.80227802701 0.0 11.1062964725 0.179945027923 2.36064248615 10.3989855149 2.74674726123 18219740080.4 210.877136425\n", "zero_count 0 0 0 0 14 0 15 7 1 0 0\n", "missing_count 0 0 0 0 3 3 660 3 620 0 0" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Lets now get Days since the epoch... we'll convert year/month/day into Epoch\n", "# time, and then back to Epoch days. Need zero-based month and days, but have\n", "# 1-based.\n", "wthr3[\"msec\"] = h2o.H2OFrame.mktime(year=wthr3[\"Year Local\"], month=wthr3[\"Month Local\"]-1, day=wthr3[\"Day Local\"]-1, hour=wthr3[\"Hour Local\"])\n", "secsPerDay=1000*60*60*24\n", "wthr3[\"Days\"] = (wthr3[\"msec\"]/secsPerDay).floor()\n", "wthr3.describe()\n", "# msec looks sane (numbers like 1.3e12 are in the correct range for msec since\n", "# 1970). Epoch Days matches closely with the epoch day numbers from the\n", "# CitiBike dataset. 
" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Let's drop the extra time columns to make an easy-to-handle dataset.\n", "wthr4 = wthr3.drop(\"Year Local\").drop(\"Month Local\").drop(\"Day Local\").drop(\"Hour Local\").drop(\"msec\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Also, most rain numbers are missing - let's assume those are zero rain days.\n", "rain = wthr4[\"Rain (mm)\"]\n", "rain[ rain.isna() ] = 0\n", "# Assign the filled column back so wthr4 itself carries the zeros into the merge;\n", "# filling only the sliced-out column left the NAs in the joined frame.\n", "wthr4[\"Rain (mm)\"] = rain" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Merge Daily Weather with Bikes-Per-Day\n", "Rows: 139,261 Cols: 10\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C11-Byte Integers3210.0 138.1 KB2.4677303
C1N1-Byte Integers (w/o NAs)6420.0 276.2 KB4.9354606
C22-Byte Integers9630.000002 822.4 KB14.692484
C8D64-bit Reals12840.0 4.3 MB77.90433
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ -------- -----------------\n", "C1 1-Byte Integers 32 10 138.1 KB 2.46773\n", "C1N 1-Byte Integers (w/o NAs) 64 20 276.2 KB 4.93546\n", "C2 2-Byte Integers 96 30 822.4 KB 14.6925\n", "C8D 64-bit Reals 128 40 4.3 MB 77.9043" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.37:54321 5.5 MB139261.032.0320.0
mean 5.5 MB139261.032.0320.0
min 5.5 MB139261.032.0320.0
max 5.5 MB139261.032.0320.0
stddev 0 B0.00.00.0
total 5.5 MB139261.032.0320.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------ ---------------- ----------------------------- ------------------\n", "172.16.2.37:54321 5.5 MB 139261 32 320\n", "mean 5.5 MB 139261 32 320\n", "min 5.5 MB 139261 32 320\n", "max 5.5 MB 139261 32 320\n", "stddev 0 B 0 0 0\n", "total 5.5 MB 139261 32 320" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Daysstart station namebikesMonthDayOfWeekHumidity FractionRain (mm)Temperature (C)WC1Dew Point (C)
typeintenumintenumenumrealrealrealenumreal
mins15887.00.01.00.00.00.17230.0-13.90.0-26.7
maxs16314.0339.0680.011.06.01.08.38234.410.023.3
sigma123.63513389798.5029573264.12438875653.203731002162.003021000150.1784089386641.7677196081310.94545119612.96270960911.7308194576
zero_count042809949198800290915983241954
missing_count00000981128246981119130981
" ], "text/plain": [ " Days start station name bikes Month DayOfWeek Humidity Fraction Rain (mm) Temperature (C) WC1 Dew Point (C)\n", "------------- ------------- -------------------- ------------- ------------- ------------- ------------------- ------------- ----------------- ----------- ---------------\n", "type int enum int enum enum real real real enum real\n", "mins 15887.0 0.0 1.0 0.0 0.0 0.1723 0.0 -13.9 0.0 -26.7\n", "maxs 16314.0 339.0 680.0 11.0 6.0 1.0 8.382 34.4 10.0 23.3\n", "sigma 123.635133897 98.50295732 64.1243887565 3.20373100216 2.00302100015 0.178408938664 1.76771960813 10.9454511961 2.962709609 11.7308194576\n", "zero_count 0 428 0 9949 19880 0 2909 1598 324 1954\n", "missing_count 0 0 0 0 0 981 128246 981 119130 981" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "First 10 rows and first 10 columns: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Daysstart station namebikesMonthDayOfWeekHumidity FractionRain (mm)Temperature (C)WC1Dew Point (C)
16234Concord St & Bridge St156Thu0.9340.50820.0mist18.900000000000002
16106Cumberland St & Lafayette Ave62Tue0.92280000000000010.0mist-1.1
15978DeKalb Ave & Hudson Ave369Sun0.4688000000000000520.08.3
16088Allen St & Hester St551Fri1.01.01.0
15945Allen St & Rivington St1408Tue0.56810.028.3light rain18.900000000000002
16251Clinton St & Grand St796Sun0.527500000000000127.20000000000000316.7
16123Clinton St & Joralemon St62Fri0.31419.4-6.7
15995Clinton St & Tillary St2210Wed0.676520.614.4
16313Greenwich St & N Moore St748Sat0.628728.90000000000000221.1
16185Hancock St & Bedford Ave144Thu0.209215.0-7.2
" ], "text/plain": [ " Days start station name bikes Month DayOfWeek Humidity Fraction Rain (mm) Temperature (C) WC1 Dew Point (C)\n", "------ ----------------------------- ------- ------- ----------- ------------------- ----------- ----------------- ---------- ---------------\n", " 16234 Concord St & Bridge St 15 6 Thu 0.934 0.508 20 mist 18.9\n", " 16106 Cumberland St & Lafayette Ave 6 2 Tue 0.9228 0 mist -1.1\n", " 15978 DeKalb Ave & Hudson Ave 36 9 Sun 0.4688 20 8.3\n", " 16088 Allen St & Hester St 55 1 Fri 1 1 1\n", " 15945 Allen St & Rivington St 140 8 Tue 0.5681 0.0 28.3 light rain 18.9\n", " 16251 Clinton St & Grand St 79 6 Sun 0.5275 27.2 16.7\n", " 16123 Clinton St & Joralemon St 6 2 Fri 0.3141 9.4 -6.7\n", " 15995 Clinton St & Tillary St 22 10 Wed 0.6765 20.6 14.4\n", " 16313 Greenwich St & N Moore St 74 8 Sat 0.6287 28.9 21.1\n", " 16185 Hancock St & Bedford Ave 14 4 Thu 0.2092 15 -7.2" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# ----------\n", "# 6 - Join the weather data-per-day to the bike-starts-per-day\n", "print \"Merge Daily Weather with Bikes-Per-Day\"\n", "bpd_with_weather = bpd.merge(wthr4,allLeft=True,allRite=False)\n", "bpd_with_weather.describe()\n", "bpd_with_weather.show()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data has 10 columns and 83357 rows, test has 41881 rows, holdout has 14023\n", "\n", "gbm Model Build Progress: [##################################################] 100%\n", "\n", "drf Model Build Progress: [##################################################] 100%\n", "\n", "glm Model Build Progress: [##################################################] 100%\n", "\n", "deeplearning Model Build Progress: [##################################################] 100%\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
ModelR2 TRAINR2 TESTR2 HOLDOUTModel Training Time (s)
GBM0.9666467969490.9250593276150.92448937749228.364
DRF0.8963565800510.8462216072460.849226966424146.965
GLM0.900334018950.8740899442880.8886669837740.245
DL 0.9498035236190.9218817173440.92363067534860.212
" ], "text/plain": [ "Model R2 TRAIN R2 TEST R2 HOLDOUT Model Training Time (s)\n", "------- ---------- --------- ------------ -------------------------\n", "GBM 0.966647 0.925059 0.924489 28.364\n", "DRF 0.896357 0.846222 0.849227 146.965\n", "GLM 0.900334 0.87409 0.888667 0.245\n", "DL 0.949804 0.921882 0.923631 60.212" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# 7 - Test/Train split again, model build again, this time with weather\n", "split_fit_predict(bpd_with_weather)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }