{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import h2o" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
H2O cluster uptime: 17 seconds 548 milliseconds
H2O cluster version: 3.1.0.99999
H2O cluster name: anqi_fu
H2O cluster total nodes: 1
H2O cluster total memory: 1.78 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321
" ], "text/plain": [ "-------------------------- ---------------------------\n", "H2O cluster uptime: 17 seconds 548 milliseconds\n", "H2O cluster version: 3.1.0.99999\n", "H2O cluster name: anqi_fu\n", "H2O cluster total nodes: 1\n", "H2O cluster total memory: 1.78 GB\n", "H2O cluster total cores: 8\n", "H2O cluster allowed cores: 8\n", "H2O cluster healthy: True\n", "H2O Connection ip: 127.0.0.1\n", "H2O Connection port: 54321\n", "-------------------------- ---------------------------" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Connect to a cluster\n", "h2o.init()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Import and Parse weather data\n", "\n", "Parse Progress: [##################################################] 100%\n", "Imported /Users/anqi_fu/Documents/workspace/h2o-3/smalldata/chicago/chicagoAllWeather.csv . Parsed 5,162 rows and 7 cols\n", "Rows: 5,162 Cols: 7\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C1N1-Byte Integers (w/o NAs)228.57143 10.2 KB11.221008
C1S1-Byte Fractions457.14286 20.5 KB22.510675
CStrString114.285715 60.3 KB66.26832
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ ------- -----------------\n", "C1N 1-Byte Integers (w/o NAs) 2 28.5714 10.2 KB 11.221\n", "C1S 1-Byte Fractions 4 57.1429 20.5 KB 22.5107\n", "CStr String 1 14.2857 60.3 KB 66.2683" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.17:54321 91.0 KB5162.01.07.0
mean 91.0 KB5162.01.07.0
min 91.0 KB5162.01.07.0
max 91.0 KB5162.01.07.0
stddev 0 B0.00.00.0
total 91.0 KB5162.01.07.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------- ---------------- ----------------------------- ------------------\n", "172.16.2.17:54321 91.0 KB 5162 1 7\n", "mean 91.0 KB 5162 1 7\n", "min 91.0 KB 5162 1 7\n", "max 91.0 KB 5162 1 7\n", "stddev 0 B 0 0 0\n", "total 91.0 KB 5162 1 7" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
datemonthdayyearmaxTempmeanTempminTemp
typestringintintintintintint
minsNaN1.01.02001.0-2.0-9.0-18.0
maxsNaN12.031.02015.0103.093.082.0
sigmaNaN3.469051716948.798951739974.077340905721.482977723719.930239926619.0207297123
zero_count00000216
missing_count0000131313
" ], "text/plain": [ " date month day year maxTemp meanTemp minTemp\n", "------------- ------ ------------- ------------- ------------ ------------- ------------- -------------\n", "type string int int int int int int\n", "mins NaN 1.0 1.0 2001.0 -2.0 -9.0 -18.0\n", "maxs NaN 12.0 31.0 2015.0 103.0 93.0 82.0\n", "sigma NaN 3.46905171694 8.79895173997 4.0773409057 21.4829777237 19.9302399266 19.0207297123\n", "zero_count 0 0 0 0 0 2 16\n", "missing_count 0 0 0 0 13 13 13" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Import and Parse census data\n", "\n", "Parse Progress: [##################################################] 100%\n", "Imported /Users/anqi_fu/Documents/workspace/h2o-3/smalldata/chicago/chicagoCensus.csv . Parsed 79 rows and 9 cols\n", "Rows: 79 Cols: 9\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C11-Byte Integers222.222223 294 B9.312638
C1S1-Byte Fractions111.111112 163 B5.1631293
C2S2-Byte Fractions444.444447 968 B30.662022
C44-Byte Integers111.111112 384 B12.163446
CStrString111.111112 1.3 KB42.698765
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ---------------- ------- ------------------ ------ -----------------\n", "C1 1-Byte Integers 2 22.2222 294 B 9.31264\n", "C1S 1-Byte Fractions 1 11.1111 163 B 5.16313\n", "C2S 2-Byte Fractions 4 44.4444 968 B 30.662\n", "C4 4-Byte Integers 1 11.1111 384 B 12.1634\n", "CStr String 1 11.1111 1.3 KB 42.6988" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.17:54321 3.1 KB79.01.09.0
mean 3.1 KB79.01.09.0
min 3.1 KB79.01.09.0
max 3.1 KB79.01.09.0
stddev 0 B0.00.00.0
total 3.1 KB79.01.09.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------ ---------------- ----------------------------- ------------------\n", "172.16.2.17:54321 3.1 KB 79 1 9\n", "mean 3.1 KB 79 1 9\n", "min 3.1 KB 79 1 9\n", "max 3.1 KB 79 1 9\n", "stddev 0 B 0 0 0\n", "total 3.1 KB 79 1 9" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Community Area NumberCOMMUNITY AREA NAMEPERCENT OF HOUSING CROWDEDPERCENT HOUSEHOLDS BELOW POVERTYPERCENT AGED 16 UNEMPLOYEDPERCENT AGED 25 WITHOUT HIGH SCHOOL DIPLOMAPERCENT AGED UNDER 18 OR OVER 64PER CAPITA INCOME HARDSHIP INDEX
typeintstringrealrealrealrealrealintint
mins1.0NaN0.33.34.72.513.58201.01.0
maxs77.0NaN15.856.535.954.851.588669.098.0
sigma22.3718573212NaN3.6589814413511.4572309137.4994967086111.74651435117.2844210849415196.405541328.6905556516
zero_count000000000
missing_count201111112
" ], "text/plain": [ " Community Area Number COMMUNITY AREA NAME PERCENT OF HOUSING CROWDED PERCENT HOUSEHOLDS BELOW POVERTY PERCENT AGED 16 UNEMPLOYED PERCENT AGED 25 WITHOUT HIGH SCHOOL DIPLOMA PERCENT AGED UNDER 18 OR OVER 64 PER CAPITA INCOME HARDSHIP INDEX\n", "------------- ----------------------- --------------------- ---------------------------- ---------------------------------- ----------------------------- ---------------------------------------------- ---------------------------------- -------------------- ----------------\n", "type int string real real real real real int int\n", "mins 1.0 NaN 0.3 3.3 4.7 2.5 13.5 8201.0 1.0\n", "maxs 77.0 NaN 15.8 56.5 35.9 54.8 51.5 88669.0 98.0\n", "sigma 22.3718573212 NaN 3.65898144135 11.457230913 7.49949670861 11.7465143511 7.28442108494 15196.4055413 28.6905556516\n", "zero_count 0 0 0 0 0 0 0 0 0\n", "missing_count 2 0 1 1 1 1 1 1 2" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Import and Parse crimes data\n", "\n", "Parse Progress: [##################################################] 100%\n", "Imported /Users/anqi_fu/Documents/workspace/h2o-3/smalldata/chicago/chicagoCrimes10k.csv.zip . Parsed 9,999 rows and 22 cols\n", "Rows: 9,999 Cols: 22\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers44.5454545 320 B0.03695244
C11-Byte Integers3236.363636 80.2 KB9.488462
C1N1-Byte Integers (w/o NAs)89.090909 20.1 KB2.3721156
C22-Byte Integers1618.181818 79.2 KB9.362824
C44-Byte Integers1213.636364 118.0 KB13.950008
CStrString89.090909 391.1 KB46.252445
C8D64-bit Reals89.090909 156.8 KB18.537191
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ -------- -----------------\n", "C0L Constant Integers 4 4.54545 320 B 0.0369524\n", "C1 1-Byte Integers 32 36.3636 80.2 KB 9.48846\n", "C1N 1-Byte Integers (w/o NAs) 8 9.09091 20.1 KB 2.37212\n", "C2 2-Byte Integers 16 18.1818 79.2 KB 9.36282\n", "C4 4-Byte Integers 12 13.6364 118.0 KB 13.95\n", "CStr String 8 9.09091 391.1 KB 46.2524\n", "C8D 64-bit Reals 8 9.09091 156.8 KB 18.5372" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.17:54321 845.7 KB9999.04.088.0
mean 845.7 KB9999.04.088.0
min 845.7 KB9999.04.088.0
max 845.7 KB9999.04.088.0
stddev 0 B0.00.00.0
total 845.7 KB9999.04.088.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- -------- ---------------- ----------------------------- ------------------\n", "172.16.2.17:54321 845.7 KB 9999 4 88\n", "mean 845.7 KB 9999 4 88\n", "min 845.7 KB 9999 4 88\n", "max 845.7 KB 9999 4 88\n", "stddev 0 B 0 0 0\n", "total 845.7 KB 9999 4 88" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
IDCase NumberDateBlockIUCRPrimary TypeDescriptionLocation DescriptionArrestDomesticBeatDistrictWardCommunity AreaFBI CodeX CoordinateY CoordinateYearUpdated OnLatitudeLongitudeLocation
typeintstringstringenumintenumenumenumenumenumintintintintintintintintenumrealrealenum
mins21735.0NaNNaN0.0110.00.00.00.00.00.0111.01.01.01.02.01100317.01814255.02015.00.041.64507243-87.9064638880.0
maxs9962898.0NaNNaN6517.05131.026.0198.090.01.01.02535.025.050.077.026.01205069.01951533.02015.032.042.022646183-87.5247732868603.0
sigma396787.564221NaNNaN1915.88517194927.7514355839.1624173594460.105938202925.59639724630.4550835155880.35934414686695.760298756.9454749330113.649566114421.27487622237.5742385791116496.449368131274.01631990.010.08244643450.08601865793590.06003579706532469.64729385
zero_count0003011933197071847600000000603001
missing_count000041900600016200255716216200162162162
" ], "text/plain": [ " ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location\n", "------------- ------------- ------------- ------ ------------- ------------- -------------- ------------- ---------------------- -------------- ------------- ------------ ------------- ------------- ---------------- ------------- -------------- -------------- ------ ------------- --------------- --------------- -------------\n", "type int string string enum int enum enum enum enum enum int int int int int int int int enum real real enum\n", "mins 21735.0 NaN NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 2015.0 0.0 41.64507243 -87.906463888 0.0\n", "maxs 9962898.0 NaN NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 2015.0 32.0 42.022646183 -87.524773286 8603.0\n", "sigma 396787.564221 NaN NaN 1915.88517194 927.751435583 9.16241735944 60.1059382029 25.5963972463 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 10.0824464345 0.0860186579359 0.0600357970653 2469.64729385\n", "zero_count 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1\n", "missing_count 0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "weather_path = h2o.locate(\"smalldata/chicago/chicagoAllWeather.csv\")\n", "census_path = h2o.locate(\"smalldata/chicago/chicagoCensus.csv\")\n", "crimes_path = h2o.locate(\"smalldata/chicago/chicagoCrimes10k.csv.zip\")\n", "\n", "print \"Import and Parse weather data\"\n", "weather = h2o.import_frame(path=weather_path)\n", "weather.drop(\"date\")\n", "weather.describe()\n", "\n", "print \"Import and Parse census data\"\n", "census = h2o.import_frame(path=census_path)\n", "census.describe()\n", "\n", 
"print \"Import and Parse crimes data\"\n", "crimes = h2o.import_frame(path=crimes_path)\n", "crimes.describe()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows: 9,999 Cols: 27\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers98.333334 720 B0.10067465
C11-Byte Integers3229.62963 80.2 KB11.489216
C1N1-Byte Integers (w/o NAs)2321.296297 57.9 KB8.29671
C22-Byte Integers1614.814815 79.2 KB11.337085
C44-Byte Integers1211.111112 118.0 KB16.891531
C864-bit Integers43.7037036 78.4 KB11.222987
CStrString43.7037036 127.2 KB18.215822
C8D64-bit Reals87.4074073 156.8 KB22.445974
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ -------- -----------------\n", "C0L Constant Integers 9 8.33333 720 B 0.100675\n", "C1 1-Byte Integers 32 29.6296 80.2 KB 11.4892\n", "C1N 1-Byte Integers (w/o NAs) 23 21.2963 57.9 KB 8.29671\n", "C2 2-Byte Integers 16 14.8148 79.2 KB 11.3371\n", "C4 4-Byte Integers 12 11.1111 118.0 KB 16.8915\n", "C8 64-bit Integers 4 3.7037 78.4 KB 11.223\n", "CStr String 4 3.7037 127.2 KB 18.2158\n", "C8D 64-bit Reals 8 7.40741 156.8 KB 22.446" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.17:54321 698.4 KB9999.04.0108.0
mean 698.4 KB9999.04.0108.0
min 698.4 KB9999.04.0108.0
max 698.4 KB9999.04.0108.0
stddev 0 B0.00.00.0
total 698.4 KB9999.04.0108.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- -------- ---------------- ----------------------------- ------------------\n", "172.16.2.17:54321 698.4 KB 9999 4 108\n", "mean 698.4 KB 9999 4 108\n", "min 698.4 KB 9999 4 108\n", "max 698.4 KB 9999 4 108\n", "stddev 0 B 0 0 0\n", "total 698.4 KB 9999 4 108" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
IDCase NumberDateBlockIUCRPrimary TypeDescriptionLocation DescriptionArrestDomesticBeatDistrictWardCommunity AreaFBI CodeX CoordinateY CoordinateYearUpdated OnLatitudeLongitudeLocationDayMonthWeekNumWeekDayHourOfDay
typeintstringintenumintenumenumenumenumenumintintintintintintintintenumrealrealenumintintintenumint
mins21735.0NaN1.42203063e+120.0110.00.00.00.00.00.0111.01.01.01.02.01100317.01814255.03915.00.041.64507243-87.9064638880.01.02.04.00.00.0
maxs9962898.0NaN1.42346782e+126517.05131.026.0198.090.01.01.02535.025.050.077.026.01205069.01951533.03915.032.042.022646183-87.5247732868603.031.03.06.06.023.0
sigma396787.564221NaN433879245.1881915.88517194927.7514355839.1624173594460.105938202925.59639724630.4550835155880.35934414686695.760298756.9454749330113.649566114421.27487622237.5742385791116496.449368131274.01631990.010.08244643450.08601865793590.06003579706532469.6472938511.18010433580.4934924067870.7389298304091.932840564326.47321735807
zero_count00030119331970718476000000006030010001038374
missing_count00004190060001620025571621620016216216200000
" ], "text/plain": [ " ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay\n", "------------- ------------- ------------- -------------- ------------- ------------- -------------- ------------- ---------------------- -------------- ------------- ------------ ------------- ------------- ---------------- ------------- -------------- -------------- ------ ------------- --------------- --------------- ------------- ------------- -------------- -------------- ------------- -------------\n", "type int string int enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int\n", "mins 21735.0 NaN 1.42203063e+12 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.0 0.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0\n", "maxs 9962898.0 NaN 1.42346782e+12 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.0 32.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0\n", "sigma 396787.564221 NaN 433879245.188 1915.88517194 927.751435583 9.16241735944 60.1059382029 25.5963972463 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 10.0824464345 0.0860186579359 0.0600357970653 2469.64729385 11.1801043358 0.493492406787 0.738929830409 1.93284056432 6.47321735807\n", "zero_count 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374\n", "missing_count 0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Rows: 9,999 Cols: 28\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers1311.607142 1.0 KB0.16332634
CBSBits43.5714288 1.5 KB0.2404352
C11-Byte Integers3228.57143 80.2 KB12.9040365
C1N1-Byte Integers (w/o NAs)2320.535715 57.9 KB9.318395
C22-Byte Integers1614.285715 79.2 KB12.733171
C44-Byte Integers1210.714286 118.0 KB18.97161
CStrString43.5714288 127.2 KB20.458979
C8D64-bit Reals87.1428576 156.8 KB25.210047
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ -------- -----------------\n", "C0L Constant Integers 13 11.6071 1.0 KB 0.163326\n", "CBS Bits 4 3.57143 1.5 KB 0.240435\n", "C1 1-Byte Integers 32 28.5714 80.2 KB 12.904\n", "C1N 1-Byte Integers (w/o NAs) 23 20.5357 57.9 KB 9.3184\n", "C2 2-Byte Integers 16 14.2857 79.2 KB 12.7332\n", "C4 4-Byte Integers 12 10.7143 118.0 KB 18.9716\n", "CStr String 4 3.57143 127.2 KB 20.459\n", "C8D 64-bit Reals 8 7.14286 156.8 KB 25.21" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.17:54321 621.8 KB9999.04.0112.0
mean 621.8 KB9999.04.0112.0
min 621.8 KB9999.04.0112.0
max 621.8 KB9999.04.0112.0
stddev 0 B0.00.00.0
total 621.8 KB9999.04.0112.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- -------- ---------------- ----------------------------- ------------------\n", "172.16.2.17:54321 621.8 KB 9999 4 112\n", "mean 621.8 KB 9999 4 112\n", "min 621.8 KB 9999 4 112\n", "max 621.8 KB 9999 4 112\n", "stddev 0 B 0 0 0\n", "total 621.8 KB 9999 4 112" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
IDCase NumberBlockIUCRPrimary TypeDescriptionLocation DescriptionArrestDomesticBeatDistrictWardCommunity AreaFBI CodeX CoordinateY CoordinateYearUpdated OnLatitudeLongitudeLocationDayMonthWeekNumWeekDayHourOfDayWeekendSeason
typeintstringenumintenumenumenumenumenumintintintintintintintintenumrealrealenumintintintenumintintenum
mins21735.0NaN0.0110.00.00.00.00.00.0111.01.01.01.02.01100317.01814255.03915.00.041.64507243-87.9064638880.01.02.04.00.00.00.00.0
maxs9962898.0NaN6517.05131.026.0198.090.01.01.02535.025.050.077.026.01205069.01951533.03915.032.042.022646183-87.5247732868603.031.03.06.06.023.01.01.0
sigma396787.564221NaN1915.88517194927.7514355839.1624173594460.105938202925.59639724630.4550835155880.35934414686695.760298756.9454749330113.649566114421.27487622237.5742385791116496.449368131274.01631990.010.08244643450.08601865793590.06003579706532469.6472938511.18010433580.4934924067870.7389298304091.932840564326.473217358070.3658024340410.493492406787
zero_count003011933197071847600000000603001000103837484085805
missing_count000419006000162002557162162001621621620000000
" ], "text/plain": [ " ID Case Number Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay Weekend Season\n", "------------- ------------- ------------- ------------- ------------- -------------- ------------- ---------------------- -------------- ------------- ------------ ------------- ------------- ---------------- ------------- -------------- -------------- ------ ------------- --------------- --------------- ------------- ------------- -------------- -------------- ------------- ------------- -------------- --------------\n", "type int string enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int int enum\n", "mins 21735.0 NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.0 0.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0 0.0 0.0\n", "maxs 9962898.0 NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.0 32.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0 1.0 1.0\n", "sigma 396787.564221 NaN 1915.88517194 927.751435583 9.16241735944 60.1059382029 25.5963972463 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 10.0824464345 0.0860186579359 0.0600357970653 2469.64729385 11.1801043358 0.493492406787 0.738929830409 1.93284056432 6.47321735807 0.365802434041 0.493492406787\n", "zero_count 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374 8408 5805\n", "missing_count 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0 0 0" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def refine_date_col(data, col, pattern):\n", " data[col] = data[col].as_date(pattern)\n", " data[\"Day\"] = data[col].day()\n", " data[\"Month\"] = 
data[col].month() + 1 # Since H2O indexes from 0\n", " data[\"Year\"] = data[col].year() + 1900 # Start of epoch is 1900\n", " data[\"WeekNum\"] = data[col].week()\n", " data[\"WeekDay\"] = data[col].dayOfWeek()\n", " data[\"HourOfDay\"] = data[col].hour()\n", " \n", " data.describe() # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.\n", " \n", " # Create weekend and season cols\n", " # Spring = Mar, Apr, May. Summer = Jun, Jul, Aug. Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.\n", " # data[\"Weekend\"] = [1 if x in (\"Sun\", \"Sat\") else 0 for x in data[\"WeekDay\"]]\n", " data[\"Weekend\"] = h2o.ifelse(data[\"WeekDay\"] == \"Sun\" or data[\"WeekDay\"] == \"Sat\", 1, 0)[0]\n", " data[\"Season\"] = data[\"Month\"].cut([0, 2, 5, 7, 10, 12], [\"Winter\", \"Spring\", \"Summer\", \"Autumn\", \"Winter\"])\n", " \n", "refine_date_col(crimes, \"Date\", \"%m/%d/%Y %I:%M:%S %p\")\n", "crimes = crimes.drop(\"Date\")\n", "crimes.describe()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "ename": "EnvironmentError", "evalue": "h2o-py got an unexpected HTTP status code:\n 412 Precondition Failed (method = POST; url = http://localhost:54321/99/Rapids). 
\ndetailed error messages: water.DException$DistributedException: from /172.16.2.17:54321; by class water.rapids.ASTMerge$MergeSet$MakeHash; class water.exceptions.H2OIllegalArgumentException: unimplemented", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mEnvironmentError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mweather\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"day\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0m_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Day\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mweather\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"year\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0m_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Year\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mcrimes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmerge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcensus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mallLeft\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mallRite\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0mcrimes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmerge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mweather\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mallLeft\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mallRite\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/frame.pyc\u001b[0m in \u001b[0;36mmerge\u001b[0;34m(self, other, allLeft, allRite)\u001b[0m\n\u001b[1;32m 1022\u001b[0m \u001b[0mexpr2\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;34m\"(, \"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mexpr\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\" (del %\"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mlkey\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\" #0) (del %\"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mrkey\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\" #0) )\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1023\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1024\u001b[0;31m \u001b[0mh2o\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrapids\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpr2\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# merge in h2o\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1025\u001b[0m \u001b[0;31m# Make backing H2OVecs for the remote h2o vecs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1026\u001b[0m \u001b[0mj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh2o\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtmp_key\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Fetch the frame as JSON\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/h2o.pyc\u001b[0m in \u001b[0;36mrapids\u001b[0;34m(expr)\u001b[0m\n\u001b[1;32m 487\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mThe\u001b[0m \u001b[0mJSON\u001b[0m \u001b[0mresponse\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mRapids\u001b[0m \u001b[0mexecution\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 488\u001b[0m \"\"\"\n\u001b[0;32m--> 489\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mH2OConnection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpost_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Rapids\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mast\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murllib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquote\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0m_rest_version\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m99\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 490\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'error'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 491\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mEnvironmentError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"rapids expression not evaluated: {0}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'error'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc\u001b[0m in \u001b[0;36mpost_json\u001b[0;34m(url_suffix, file_upload_info, **kwargs)\u001b[0m\n\u001b[1;32m 360\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m__H2OCONN__\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 361\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"No h2o connection. 
Did you run `h2o.init()` ?\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 362\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m__H2OCONN__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_rest_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_suffix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"POST\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_upload_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 363\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_rest_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl_suffix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_upload_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc\u001b[0m in \u001b[0;36m_rest_json\u001b[0;34m(self, url_suffix, method, file_upload_info, **kwargs)\u001b[0m\n\u001b[1;32m 363\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_rest_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl_suffix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_upload_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 365\u001b[0;31m \u001b[0mraw_txt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_do_raw_rest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_suffix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mfile_upload_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 366\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_process_tables\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_txt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 367\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc\u001b[0m in \u001b[0;36m_do_raw_rest\u001b[0;34m(self, url_suffix, method, file_upload_info, **kwargs)\u001b[0m\n\u001b[1;32m 429\u001b[0m raise EnvironmentError((\"h2o-py got an unexpected HTTP status code:\\n {} {} (method = {}; url = {}). \\n\"+ \\\n\u001b[1;32m 430\u001b[0m \"detailed error messages: {}\")\n\u001b[0;32m--> 431\u001b[0;31m .format(http_result.status_code,http_result.reason,method,url,detailed_error_msgs))\n\u001b[0m\u001b[1;32m 432\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 433\u001b[0m \u001b[0;31m# TODO: is.logging? -> write to logs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mEnvironmentError\u001b[0m: h2o-py got an unexpected HTTP status code:\n 412 Precondition Failed (method = POST; url = http://localhost:54321/99/Rapids). 
\ndetailed error messages: water.DException$DistributedException: from /172.16.2.17:54321; by class water.rapids.ASTMerge$MergeSet$MakeHash; class water.exceptions.H2OIllegalArgumentException: unimplemented" ] } ], "source": [ "# Merge crimes data with weather and census\n", "census[\"Community Area Number\"]._name = \"Community Area\"\n", "weather[\"month\"]._name = \"Month\"\n", "weather[\"day\"] ._name = \"Day\"\n", "weather[\"year\"] ._name = \"Year\"\n", "\n", "# Capture the merge results: the following cells read the merged frame as `data`.\n", "# The weather merge is chained onto the census merge so `data` carries both.\n", "data = crimes.merge(census, allLeft=True, allRite=False)\n", "data = data.merge(weather, allLeft=True, allRite=False)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "ename": "NameError", "evalue": "name 'data' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Create test/train split\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdata_split\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh2o\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit_frame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mratios\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m0.8\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m0.2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mtrain\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata_split\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtest\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata_split\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'data' is not defined" ] } ], "source": [ "# Create test/train split (80% train, 20% test).\n", "# Python sequences are 0-indexed, so the two pieces are [0] and [1], not [1] and [2].\n", "data_split = h2o.split_frame(data, ratios = [0.8,0.2])\n", "train = data_split[0]\n", "test = data_split[1]\n", "\n", "# Simple GBM - Predict Arrest\n", "data_gbm = h2o.gbm(x =train.drop(\"Arrest\"),\n", " y =train [\"Arrest\"],\n", " validation_x =test .drop(\"Arrest\"),\n", " validation_y =test [\"Arrest\"],\n", " ntrees =10,\n", " max_depth =6,\n", " distribution =\"bernoulli\")\n", "\n", "# Simple Deep Learning\n", "data_dl = h2o.deeplearning(x =train.drop(\"Arrest\"),\n", " y =train [\"Arrest\"],\n", " validation_x =test .drop(\"Arrest\"),\n", " validation_y =test [\"Arrest\"],\n", " variable_importances=True,\n", " loss =\"Automatic\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "ename": "NameError", "evalue": "name 'data_gbm' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# GBM performance on train/test data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtrain_auc_gbm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata_gbm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_performance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mtest_auc_gbm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata_gbm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_performance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest\u001b[0m\u001b[0;34m)\u001b[0m 
\u001b[0;34m.\u001b[0m\u001b[0mauc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m# Deep Learning performance on train/test data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'data_gbm' is not defined" ] } ], "source": [ "# AUC of the GBM model on the train and test frames\n", "train_auc_gbm = data_gbm.model_performance(train).auc()\n", "test_auc_gbm = data_gbm.model_performance(test).auc()\n", "\n", "# AUC of the Deep Learning model on the same two frames\n", "train_auc_dl = data_dl.model_performance(train).auc()\n", "test_auc_dl = data_dl.model_performance(test).auc()\n", "\n", "# Show the scores as an HTML table, one row per model\n", "gbm_row = [\"GBM\", train_auc_gbm, test_auc_gbm]\n", "dl_row = [\"DL \", train_auc_dl, test_auc_dl]\n", "header = [\"Model\", \"AUC Train\", \"AUC Test\"]\n", "table = [gbm_row, dl_row]\n", "h2o.H2ODisplay(table, header)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Parse Progress: [##################################################] 100%\n", "Uploaded py634b18a9-7e84-40ca-b265-b2fe43e064aa into cluster with 2 rows and 10 cols\n", "Rows: 2 Cols: 16\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers743.75 560 B43.818466
C1N1-Byte Integers (w/o NAs)425.0 280 B21.909233
C22-Byte Integers212.5 144 B11.267606
C2S2-Byte Fractions16.25 88 B6.885759
CStrString212.5 206 B16.118937
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ ------ -----------------\n", "C0L Constant Integers 7 43.75 560 B 43.8185\n", "C1N 1-Byte Integers (w/o NAs) 4 25 280 B 21.9092\n", "C2 2-Byte Integers 2 12.5 144 B 11.2676\n", "C2S 2-Byte Fractions 1 6.25 88 B 6.88576\n", "CStr String 2 12.5 206 B 16.1189" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.17:54321 1.2 KB2.01.016.0
mean 1.2 KB2.01.016.0
min 1.2 KB2.01.016.0
max 1.2 KB2.01.016.0
stddev 0 B0.00.00.0
total 1.2 KB2.01.016.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------ ---------------- ----------------------------- ------------------\n", "172.16.2.17:54321 1.2 KB 2 1 16\n", "mean 1.2 KB 2 1 16\n", "min 1.2 KB 2 1 16\n", "max 1.2 KB 2 1 16\n", "stddev 0 B 0 0 0\n", "total 1.2 KB 2 1 16" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Location.DescriptionFBI.CodePrimary.TypeCommunity.AreaDistrictBeatDomesticIUCRDateWardDayMonthYearWeekNumWeekDayHourOfDay
typestringintstringintintintenumintintintintintintintenumint
minsNaN11.0NaN46.04.0422.00.01150.01.423465239e+127.08.03.03915.06.06.023.0
maxsNaN18.0NaN63.09.0923.00.01811.01.423467838e+1214.08.03.03915.06.06.023.0
sigmaNaN4.94974746831NaN12.02081528023.53553390593354.2604973740.0467.3975823641837770.52434.949747468310.00.00.00.00.00.0
zero_count0000002000000000
missing_count0000000000000000
" ], "text/plain": [ " Location.Description FBI.Code Primary.Type Community.Area District Beat Domestic IUCR Date Ward Day Month Year WeekNum WeekDay HourOfDay\n", "------------- ---------------------- ------------- -------------- ---------------- ------------- ------------- ---------- ------------- --------------- ------------- ----- ------- ------ --------- --------- -----------\n", "type string int string int int int enum int int int int int int int enum int\n", "mins NaN 11.0 NaN 46.0 4.0 422.0 0.0 1150.0 1.423465239e+12 7.0 8.0 3.0 3915.0 6.0 6.0 23.0\n", "maxs NaN 18.0 NaN 63.0 9.0 923.0 0.0 1811.0 1.423467838e+12 14.0 8.0 3.0 3915.0 6.0 6.0 23.0\n", "sigma NaN 4.94974746831 NaN 12.0208152802 3.53553390593 354.260497374 0.0 467.397582364 1837770.5243 4.94974746831 0.0 0.0 0.0 0.0 0.0 0.0\n", "zero_count 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0\n", "missing_count 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0" ] }, "metadata": {}, "output_type": "display_data" }, { "ename": "EnvironmentError", "evalue": "h2o-py got an unexpected HTTP status code:\n 412 Precondition Failed (method = POST; url = http://localhost:54321/99/Rapids). 
\ndetailed error messages: Data vector is constant!", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mEnvironmentError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;31m# Refine date column and merge with census data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 18\u001b[0;31m \u001b[0mrefine_date_col\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcrime_examples\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Date\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"%m/%d/%Y %I:%M:%S %p\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 19\u001b[0m \u001b[0mcrime_examples\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Date\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0mcrime_examples\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmerge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcensus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mallLeft\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mallRite\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36mrefine_date_col\u001b[0;34m(data, col, pattern)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;31m# data[\"Weekend\"] = h2o.ifelse(data[\"WeekDay\"] in (\"Sun\", \"Sat\"), 1, 0)[0]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Weekend\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mh2o\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mifelse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"WeekDay\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"Sun\"\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"WeekDay\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"Sat\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Season\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Month\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcut\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m7\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m12\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"Winter\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Spring\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Summer\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Autumn\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Winter\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mrefine_date_col\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcrimes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Date\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"%m/%d/%Y %I:%M:%S %p\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 
"\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/frame.pyc\u001b[0m in \u001b[0;36mcut\u001b[0;34m(self, breaks, labels, include_lowest, right, dig_lab)\u001b[0m\n\u001b[1;32m 1256\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1257\u001b[0m \u001b[0mexpr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"(cut '{}' {} {} {} {} #{}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbreaks_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"%TRUE\"\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minclude_lowest\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m\"%FALSE\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"%TRUE\"\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mright\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m\"%FALSE\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdig_lab\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1258\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh2o\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrapids\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1259\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mH2OVec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mExpr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mop\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mres\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"vec_ids\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"name\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mlength\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mres\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"num_rows\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1260\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/h2o.pyc\u001b[0m in \u001b[0;36mrapids\u001b[0;34m(expr)\u001b[0m\n\u001b[1;32m 487\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mThe\u001b[0m \u001b[0mJSON\u001b[0m \u001b[0mresponse\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mRapids\u001b[0m \u001b[0mexecution\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 488\u001b[0m \"\"\"\n\u001b[0;32m--> 489\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mH2OConnection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpost_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Rapids\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mast\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murllib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquote\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_rest_version\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m99\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 490\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'error'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 491\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mEnvironmentError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"rapids expression not evaluated: 
{0}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'error'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc\u001b[0m in \u001b[0;36mpost_json\u001b[0;34m(url_suffix, file_upload_info, **kwargs)\u001b[0m\n\u001b[1;32m 360\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m__H2OCONN__\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 361\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"No h2o connection. Did you run `h2o.init()` ?\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 362\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m__H2OCONN__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_rest_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_suffix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"POST\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_upload_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 363\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_rest_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl_suffix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_upload_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc\u001b[0m in \u001b[0;36m_rest_json\u001b[0;34m(self, url_suffix, method, file_upload_info, 
**kwargs)\u001b[0m\n\u001b[1;32m 363\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_rest_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl_suffix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_upload_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 365\u001b[0;31m \u001b[0mraw_txt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_do_raw_rest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_suffix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_upload_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 366\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_process_tables\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_txt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 367\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc\u001b[0m in \u001b[0;36m_do_raw_rest\u001b[0;34m(self, url_suffix, method, file_upload_info, **kwargs)\u001b[0m\n\u001b[1;32m 429\u001b[0m raise EnvironmentError((\"h2o-py got an unexpected HTTP status code:\\n {} {} (method = {}; url = {}). \\n\"+ \\\n\u001b[1;32m 430\u001b[0m \"detailed error messages: {}\")\n\u001b[0;32m--> 431\u001b[0;31m .format(http_result.status_code,http_result.reason,method,url,detailed_error_msgs))\n\u001b[0m\u001b[1;32m 432\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 433\u001b[0m \u001b[0;31m# TODO: is.logging? 
-> write to logs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mEnvironmentError\u001b[0m: h2o-py got an unexpected HTTP status code:\n 412 Precondition Failed (method = POST; url = http://localhost:54321/99/Rapids). \ndetailed error messages: Data vector is constant!" ] } ], "source": [ "# Create new H2OFrame of crime observations\n", "examples = {\n", " \"Date\": [\"02/08/2015 11:43:58 PM\", \"02/08/2015 11:00:39 PM\"],\n", " \"IUCR\": [1811, 1150],\n", " \"Primary.Type\": [\"NARCOTICS\", \"DECEPTIVE PRACTICE\"],\n", " \"Location.Description\": [\"STREET\", \"RESIDENCE\"],\n", " \"Domestic\": [\"false\", \"false\"],\n", " \"Beat\": [422, 923],\n", " \"District\": [4, 9],\n", " \"Ward\": [7, 14],\n", " \"Community.Area\": [46, 63],\n", " \"FBI.Code\": [18, 11]\n", " }\n", "\n", "crime_examples = h2o.H2OFrame(python_obj = examples)\n", "\n", "# Refine date column and merge with census data.\n", "# drop() and merge() hand back their result, so rebind crime_examples;\n", "# otherwise the prediction cell scores the un-dropped, un-merged frame.\n", "refine_date_col(crime_examples, \"Date\", \"%m/%d/%Y %I:%M:%S %p\")\n", "crime_examples = crime_examples.drop(\"Date\")\n", "crime_examples = crime_examples.merge(census, allLeft=True, allRite=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Predict probability of arrest from new observations\n", "gbm_pred = data_gbm.predict(crime_examples)\n", "dl_pred = data_dl .predict(crime_examples)\n", "\n", "# TODO: Replace with a pretty HTML table\n", "gbm_pred.describe()\n", "dl_pred.describe()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }