{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import h2o" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td>H2O cluster uptime: </td>\n", "<td>17 seconds 548 milliseconds </td></tr>\n", "<tr><td>H2O cluster version: </td>\n", "<td>3.1.0.99999</td></tr>\n", "<tr><td>H2O cluster name: </td>\n", "<td>anqi_fu</td></tr>\n", "<tr><td>H2O cluster total nodes: </td>\n", "<td>1</td></tr>\n", "<tr><td>H2O cluster total memory: </td>\n", "<td>1.78 GB</td></tr>\n", "<tr><td>H2O cluster total cores: </td>\n", "<td>8</td></tr>\n", "<tr><td>H2O cluster allowed cores: </td>\n", "<td>8</td></tr>\n", "<tr><td>H2O cluster healthy: </td>\n", "<td>True</td></tr>\n", "<tr><td>H2O Connection ip: </td>\n", "<td>127.0.0.1</td></tr>\n", "<tr><td>H2O Connection port: </td>\n", "<td>54321</td></tr></table></div>" ], "text/plain": [ "-------------------------- ---------------------------\n", "H2O cluster uptime: 17 seconds 548 milliseconds\n", "H2O cluster version: 3.1.0.99999\n", "H2O cluster name: anqi_fu\n", "H2O cluster total nodes: 1\n", "H2O cluster total memory: 1.78 GB\n", "H2O cluster total cores: 8\n", "H2O cluster allowed cores: 8\n", "H2O cluster healthy: True\n", "H2O Connection ip: 127.0.0.1\n", "H2O Connection port: 54321\n", "-------------------------- ---------------------------" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Connect to a cluster\n", "h2o.init()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Import and Parse weather data\n", "\n", "Parse Progress: [##################################################] 100%\n", "Imported /Users/anqi_fu/Documents/workspace/h2o-3/smalldata/chicago/chicagoAllWeather.csv . 
Parsed 5,162 rows and 7 cols\n", "Rows: 5,162 Cols: 7\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td>chunk_type</td>\n", "<td>chunk_name</td>\n", "<td>count</td>\n", "<td>count_percentage</td>\n", "<td>size</td>\n", "<td>size_percentage</td></tr>\n", "<tr><td>C1N</td>\n", "<td>1-Byte Integers (w/o NAs)</td>\n", "<td>2</td>\n", "<td>28.57143</td>\n", "<td> 10.2 KB</td>\n", "<td>11.221008</td></tr>\n", "<tr><td>C1S</td>\n", "<td>1-Byte Fractions</td>\n", "<td>4</td>\n", "<td>57.14286</td>\n", "<td> 20.5 KB</td>\n", "<td>22.510675</td></tr>\n", "<tr><td>CStr</td>\n", "<td>String</td>\n", "<td>1</td>\n", "<td>14.285715</td>\n", "<td> 60.3 KB</td>\n", "<td>66.26832</td></tr></table></div>" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ ------- -----------------\n", "C1N 1-Byte Integers (w/o NAs) 2 28.5714 10.2 KB 11.221\n", "C1S 1-Byte Fractions 4 57.1429 20.5 KB 22.5107\n", "CStr String 1 14.2857 60.3 KB 66.2683" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td></td>\n", "<td>size</td>\n", "<td>number_of_rows</td>\n", "<td>number_of_chunks_per_column</td>\n", "<td>number_of_chunks</td></tr>\n", "<tr><td>172.16.2.17:54321</td>\n", "<td> 91.0 KB</td>\n", "<td>5162.0</td>\n", "<td>1.0</td>\n", "<td>7.0</td></tr>\n", "<tr><td>mean</td>\n", "<td> 91.0 KB</td>\n", "<td>5162.0</td>\n", "<td>1.0</td>\n", "<td>7.0</td></tr>\n", "<tr><td>min</td>\n", "<td> 91.0 KB</td>\n", "<td>5162.0</td>\n", "<td>1.0</td>\n", "<td>7.0</td></tr>\n", "<tr><td>max</td>\n", "<td> 91.0 KB</td>\n", "<td>5162.0</td>\n", "<td>1.0</td>\n", "<td>7.0</td></tr>\n", "<tr><td>stddev</td>\n", "<td> 0 
B</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td></tr>\n", "<tr><td>total</td>\n", "<td> 91.0 KB</td>\n", "<td>5162.0</td>\n", "<td>1.0</td>\n", "<td>7.0</td></tr></table></div>" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------- ---------------- ----------------------------- ------------------\n", "172.16.2.17:54321 91.0 KB 5162 1 7\n", "mean 91.0 KB 5162 1 7\n", "min 91.0 KB 5162 1 7\n", "max 91.0 KB 5162 1 7\n", "stddev 0 B 0 0 0\n", "total 91.0 KB 5162 1 7" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td></td>\n", "<td>date</td>\n", "<td>month</td>\n", "<td>day</td>\n", "<td>year</td>\n", "<td>maxTemp</td>\n", "<td>meanTemp</td>\n", "<td>minTemp</td></tr>\n", "<tr><td>type</td>\n", "<td>string</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td></tr>\n", "<tr><td>mins</td>\n", "<td>NaN</td>\n", "<td>1.0</td>\n", "<td>1.0</td>\n", "<td>2001.0</td>\n", "<td>-2.0</td>\n", "<td>-9.0</td>\n", "<td>-18.0</td></tr>\n", "<tr><td>maxs</td>\n", "<td>NaN</td>\n", "<td>12.0</td>\n", "<td>31.0</td>\n", "<td>2015.0</td>\n", "<td>103.0</td>\n", "<td>93.0</td>\n", "<td>82.0</td></tr>\n", "<tr><td>sigma</td>\n", "<td>NaN</td>\n", "<td>3.46905171694</td>\n", "<td>8.79895173997</td>\n", "<td>4.0773409057</td>\n", "<td>21.4829777237</td>\n", "<td>19.9302399266</td>\n", "<td>19.0207297123</td></tr>\n", "<tr><td>zero_count</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>2</td>\n", "<td>16</td></tr>\n", "<tr><td>missing_count</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>13</td>\n", "<td>13</td>\n", "<td>13</td></tr></table></div>" ], "text/plain": [ " date month day year 
maxTemp meanTemp minTemp\n", "------------- ------ ------------- ------------- ------------ ------------- ------------- -------------\n", "type string int int int int int int\n", "mins NaN 1.0 1.0 2001.0 -2.0 -9.0 -18.0\n", "maxs NaN 12.0 31.0 2015.0 103.0 93.0 82.0\n", "sigma NaN 3.46905171694 8.79895173997 4.0773409057 21.4829777237 19.9302399266 19.0207297123\n", "zero_count 0 0 0 0 0 2 16\n", "missing_count 0 0 0 0 13 13 13" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Import and Parse census data\n", "\n", "Parse Progress: [##################################################] 100%\n", "Imported /Users/anqi_fu/Documents/workspace/h2o-3/smalldata/chicago/chicagoCensus.csv . Parsed 79 rows and 9 cols\n", "Rows: 79 Cols: 9\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td>chunk_type</td>\n", "<td>chunk_name</td>\n", "<td>count</td>\n", "<td>count_percentage</td>\n", "<td>size</td>\n", "<td>size_percentage</td></tr>\n", "<tr><td>C1</td>\n", "<td>1-Byte Integers</td>\n", "<td>2</td>\n", "<td>22.222223</td>\n", "<td> 294 B</td>\n", "<td>9.312638</td></tr>\n", "<tr><td>C1S</td>\n", "<td>1-Byte Fractions</td>\n", "<td>1</td>\n", "<td>11.111112</td>\n", "<td> 163 B</td>\n", "<td>5.1631293</td></tr>\n", "<tr><td>C2S</td>\n", "<td>2-Byte Fractions</td>\n", "<td>4</td>\n", "<td>44.444447</td>\n", "<td> 968 B</td>\n", "<td>30.662022</td></tr>\n", "<tr><td>C4</td>\n", "<td>4-Byte Integers</td>\n", "<td>1</td>\n", "<td>11.111112</td>\n", "<td> 384 B</td>\n", "<td>12.163446</td></tr>\n", "<tr><td>CStr</td>\n", "<td>String</td>\n", "<td>1</td>\n", "<td>11.111112</td>\n", "<td> 1.3 KB</td>\n", "<td>42.698765</td></tr></table></div>" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ---------------- ------- ------------------ ------ -----------------\n", "C1 1-Byte 
Integers 2 22.2222 294 B 9.31264\n", "C1S 1-Byte Fractions 1 11.1111 163 B 5.16313\n", "C2S 2-Byte Fractions 4 44.4444 968 B 30.662\n", "C4 4-Byte Integers 1 11.1111 384 B 12.1634\n", "CStr String 1 11.1111 1.3 KB 42.6988" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td></td>\n", "<td>size</td>\n", "<td>number_of_rows</td>\n", "<td>number_of_chunks_per_column</td>\n", "<td>number_of_chunks</td></tr>\n", "<tr><td>172.16.2.17:54321</td>\n", "<td> 3.1 KB</td>\n", "<td>79.0</td>\n", "<td>1.0</td>\n", "<td>9.0</td></tr>\n", "<tr><td>mean</td>\n", "<td> 3.1 KB</td>\n", "<td>79.0</td>\n", "<td>1.0</td>\n", "<td>9.0</td></tr>\n", "<tr><td>min</td>\n", "<td> 3.1 KB</td>\n", "<td>79.0</td>\n", "<td>1.0</td>\n", "<td>9.0</td></tr>\n", "<tr><td>max</td>\n", "<td> 3.1 KB</td>\n", "<td>79.0</td>\n", "<td>1.0</td>\n", "<td>9.0</td></tr>\n", "<tr><td>stddev</td>\n", "<td> 0 B</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td></tr>\n", "<tr><td>total</td>\n", "<td> 3.1 KB</td>\n", "<td>79.0</td>\n", "<td>1.0</td>\n", "<td>9.0</td></tr></table></div>" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------ ---------------- ----------------------------- ------------------\n", "172.16.2.17:54321 3.1 KB 79 1 9\n", "mean 3.1 KB 79 1 9\n", "min 3.1 KB 79 1 9\n", "max 3.1 KB 79 1 9\n", "stddev 0 B 0 0 0\n", "total 3.1 KB 79 1 9" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td></td>\n", "<td>Community Area Number</td>\n", "<td>COMMUNITY AREA NAME</td>\n", "<td>PERCENT OF HOUSING CROWDED</td>\n", "<td>PERCENT HOUSEHOLDS BELOW 
POVERTY</td>\n", "<td>PERCENT AGED 16 UNEMPLOYED</td>\n", "<td>PERCENT AGED 25 WITHOUT HIGH SCHOOL DIPLOMA</td>\n", "<td>PERCENT AGED UNDER 18 OR OVER 64</td>\n", "<td>PER CAPITA INCOME </td>\n", "<td>HARDSHIP INDEX</td></tr>\n", "<tr><td>type</td>\n", "<td>int</td>\n", "<td>string</td>\n", "<td>real</td>\n", "<td>real</td>\n", "<td>real</td>\n", "<td>real</td>\n", "<td>real</td>\n", "<td>int</td>\n", "<td>int</td></tr>\n", "<tr><td>mins</td>\n", "<td>1.0</td>\n", "<td>NaN</td>\n", "<td>0.3</td>\n", "<td>3.3</td>\n", "<td>4.7</td>\n", "<td>2.5</td>\n", "<td>13.5</td>\n", "<td>8201.0</td>\n", "<td>1.0</td></tr>\n", "<tr><td>maxs</td>\n", "<td>77.0</td>\n", "<td>NaN</td>\n", "<td>15.8</td>\n", "<td>56.5</td>\n", "<td>35.9</td>\n", "<td>54.8</td>\n", "<td>51.5</td>\n", "<td>88669.0</td>\n", "<td>98.0</td></tr>\n", "<tr><td>sigma</td>\n", "<td>22.3718573212</td>\n", "<td>NaN</td>\n", "<td>3.65898144135</td>\n", "<td>11.457230913</td>\n", "<td>7.49949670861</td>\n", "<td>11.7465143511</td>\n", "<td>7.28442108494</td>\n", "<td>15196.4055413</td>\n", "<td>28.6905556516</td></tr>\n", "<tr><td>zero_count</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td></tr>\n", "<tr><td>missing_count</td>\n", "<td>2</td>\n", "<td>0</td>\n", "<td>1</td>\n", "<td>1</td>\n", "<td>1</td>\n", "<td>1</td>\n", "<td>1</td>\n", "<td>1</td>\n", "<td>2</td></tr></table></div>" ], "text/plain": [ " Community Area Number COMMUNITY AREA NAME PERCENT OF HOUSING CROWDED PERCENT HOUSEHOLDS BELOW POVERTY PERCENT AGED 16 UNEMPLOYED PERCENT AGED 25 WITHOUT HIGH SCHOOL DIPLOMA PERCENT AGED UNDER 18 OR OVER 64 PER CAPITA INCOME HARDSHIP INDEX\n", "------------- ----------------------- --------------------- ---------------------------- ---------------------------------- ----------------------------- ---------------------------------------------- ---------------------------------- -------------------- 
----------------\n", "type int string real real real real real int int\n", "mins 1.0 NaN 0.3 3.3 4.7 2.5 13.5 8201.0 1.0\n", "maxs 77.0 NaN 15.8 56.5 35.9 54.8 51.5 88669.0 98.0\n", "sigma 22.3718573212 NaN 3.65898144135 11.457230913 7.49949670861 11.7465143511 7.28442108494 15196.4055413 28.6905556516\n", "zero_count 0 0 0 0 0 0 0 0 0\n", "missing_count 2 0 1 1 1 1 1 1 2" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Import and Parse crimes data\n", "\n", "Parse Progress: [##################################################] 100%\n", "Imported /Users/anqi_fu/Documents/workspace/h2o-3/smalldata/chicago/chicagoCrimes10k.csv.zip . Parsed 9,999 rows and 22 cols\n", "Rows: 9,999 Cols: 22\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td>chunk_type</td>\n", "<td>chunk_name</td>\n", "<td>count</td>\n", "<td>count_percentage</td>\n", "<td>size</td>\n", "<td>size_percentage</td></tr>\n", "<tr><td>C0L</td>\n", "<td>Constant Integers</td>\n", "<td>4</td>\n", "<td>4.5454545</td>\n", "<td> 320 B</td>\n", "<td>0.03695244</td></tr>\n", "<tr><td>C1</td>\n", "<td>1-Byte Integers</td>\n", "<td>32</td>\n", "<td>36.363636</td>\n", "<td> 80.2 KB</td>\n", "<td>9.488462</td></tr>\n", "<tr><td>C1N</td>\n", "<td>1-Byte Integers (w/o NAs)</td>\n", "<td>8</td>\n", "<td>9.090909</td>\n", "<td> 20.1 KB</td>\n", "<td>2.3721156</td></tr>\n", "<tr><td>C2</td>\n", "<td>2-Byte Integers</td>\n", "<td>16</td>\n", "<td>18.181818</td>\n", "<td> 79.2 KB</td>\n", "<td>9.362824</td></tr>\n", "<tr><td>C4</td>\n", "<td>4-Byte Integers</td>\n", "<td>12</td>\n", "<td>13.636364</td>\n", "<td> 118.0 KB</td>\n", "<td>13.950008</td></tr>\n", "<tr><td>CStr</td>\n", "<td>String</td>\n", "<td>8</td>\n", "<td>9.090909</td>\n", "<td> 391.1 KB</td>\n", "<td>46.252445</td></tr>\n", "<tr><td>C8D</td>\n", "<td>64-bit Reals</td>\n", "<td>8</td>\n", 
"<td>9.090909</td>\n", "<td> 156.8 KB</td>\n", "<td>18.537191</td></tr></table></div>" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ -------- -----------------\n", "C0L Constant Integers 4 4.54545 320 B 0.0369524\n", "C1 1-Byte Integers 32 36.3636 80.2 KB 9.48846\n", "C1N 1-Byte Integers (w/o NAs) 8 9.09091 20.1 KB 2.37212\n", "C2 2-Byte Integers 16 18.1818 79.2 KB 9.36282\n", "C4 4-Byte Integers 12 13.6364 118.0 KB 13.95\n", "CStr String 8 9.09091 391.1 KB 46.2524\n", "C8D 64-bit Reals 8 9.09091 156.8 KB 18.5372" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td></td>\n", "<td>size</td>\n", "<td>number_of_rows</td>\n", "<td>number_of_chunks_per_column</td>\n", "<td>number_of_chunks</td></tr>\n", "<tr><td>172.16.2.17:54321</td>\n", "<td> 845.7 KB</td>\n", "<td>9999.0</td>\n", "<td>4.0</td>\n", "<td>88.0</td></tr>\n", "<tr><td>mean</td>\n", "<td> 845.7 KB</td>\n", "<td>9999.0</td>\n", "<td>4.0</td>\n", "<td>88.0</td></tr>\n", "<tr><td>min</td>\n", "<td> 845.7 KB</td>\n", "<td>9999.0</td>\n", "<td>4.0</td>\n", "<td>88.0</td></tr>\n", "<tr><td>max</td>\n", "<td> 845.7 KB</td>\n", "<td>9999.0</td>\n", "<td>4.0</td>\n", "<td>88.0</td></tr>\n", "<tr><td>stddev</td>\n", "<td> 0 B</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td></tr>\n", "<tr><td>total</td>\n", "<td> 845.7 KB</td>\n", "<td>9999.0</td>\n", "<td>4.0</td>\n", "<td>88.0</td></tr></table></div>" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- -------- ---------------- ----------------------------- ------------------\n", "172.16.2.17:54321 845.7 KB 9999 4 88\n", "mean 845.7 KB 9999 4 88\n", "min 845.7 KB 9999 4 88\n", "max 845.7 KB 9999 
4 88\n", "stddev 0 B 0 0 0\n", "total 845.7 KB 9999 4 88" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td></td>\n", "<td>ID</td>\n", "<td>Case Number</td>\n", "<td>Date</td>\n", "<td>Block</td>\n", "<td>IUCR</td>\n", "<td>Primary Type</td>\n", "<td>Description</td>\n", "<td>Location Description</td>\n", "<td>Arrest</td>\n", "<td>Domestic</td>\n", "<td>Beat</td>\n", "<td>District</td>\n", "<td>Ward</td>\n", "<td>Community Area</td>\n", "<td>FBI Code</td>\n", "<td>X Coordinate</td>\n", "<td>Y Coordinate</td>\n", "<td>Year</td>\n", "<td>Updated On</td>\n", "<td>Latitude</td>\n", "<td>Longitude</td>\n", "<td>Location</td></tr>\n", "<tr><td>type</td>\n", "<td>int</td>\n", "<td>string</td>\n", "<td>string</td>\n", "<td>enum</td>\n", "<td>int</td>\n", "<td>enum</td>\n", "<td>enum</td>\n", "<td>enum</td>\n", "<td>enum</td>\n", "<td>enum</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>enum</td>\n", "<td>real</td>\n", "<td>real</td>\n", "<td>enum</td></tr>\n", "<tr><td>mins</td>\n", "<td>21735.0</td>\n", "<td>NaN</td>\n", "<td>NaN</td>\n", "<td>0.0</td>\n", "<td>110.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>111.0</td>\n", "<td>1.0</td>\n", "<td>1.0</td>\n", "<td>1.0</td>\n", "<td>2.0</td>\n", "<td>1100317.0</td>\n", "<td>1814255.0</td>\n", "<td>2015.0</td>\n", "<td>0.0</td>\n", "<td>41.64507243</td>\n", "<td>-87.906463888</td>\n", "<td>0.0</td></tr>\n", "<tr><td>maxs</td>\n", "<td>9962898.0</td>\n", "<td>NaN</td>\n", "<td>NaN</td>\n", "<td>6517.0</td>\n", "<td>5131.0</td>\n", "<td>26.0</td>\n", "<td>198.0</td>\n", "<td>90.0</td>\n", "<td>1.0</td>\n", "<td>1.0</td>\n", "<td>2535.0</td>\n", "<td>25.0</td>\n", 
"<td>50.0</td>\n", "<td>77.0</td>\n", "<td>26.0</td>\n", "<td>1205069.0</td>\n", "<td>1951533.0</td>\n", "<td>2015.0</td>\n", "<td>32.0</td>\n", "<td>42.022646183</td>\n", "<td>-87.524773286</td>\n", "<td>8603.0</td></tr>\n", "<tr><td>sigma</td>\n", "<td>396787.564221</td>\n", "<td>NaN</td>\n", "<td>NaN</td>\n", "<td>1915.88517194</td>\n", "<td>927.751435583</td>\n", "<td>9.16241735944</td>\n", "<td>60.1059382029</td>\n", "<td>25.5963972463</td>\n", "<td>0.455083515588</td>\n", "<td>0.35934414686</td>\n", "<td>695.76029875</td>\n", "<td>6.94547493301</td>\n", "<td>13.6495661144</td>\n", "<td>21.2748762223</td>\n", "<td>7.57423857911</td>\n", "<td>16496.4493681</td>\n", "<td>31274.0163199</td>\n", "<td>0.0</td>\n", "<td>10.0824464345</td>\n", "<td>0.0860186579359</td>\n", "<td>0.0600357970653</td>\n", "<td>2469.64729385</td></tr>\n", "<tr><td>zero_count</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>3</td>\n", "<td>0</td>\n", "<td>11</td>\n", "<td>933</td>\n", "<td>19</td>\n", "<td>7071</td>\n", "<td>8476</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>603</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>1</td></tr>\n", "<tr><td>missing_count</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>419</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>6</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>162</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>2557</td>\n", "<td>162</td>\n", "<td>162</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>162</td>\n", "<td>162</td>\n", "<td>162</td></tr></table></div>" ], "text/plain": [ " ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location\n", "------------- ------------- ------------- ------ ------------- ------------- -------------- ------------- 
---------------------- -------------- ------------- ------------ ------------- ------------- ---------------- ------------- -------------- -------------- ------ ------------- --------------- --------------- -------------\n", "type int string string enum int enum enum enum enum enum int int int int int int int int enum real real enum\n", "mins 21735.0 NaN NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 2015.0 0.0 41.64507243 -87.906463888 0.0\n", "maxs 9962898.0 NaN NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 2015.0 32.0 42.022646183 -87.524773286 8603.0\n", "sigma 396787.564221 NaN NaN 1915.88517194 927.751435583 9.16241735944 60.1059382029 25.5963972463 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 10.0824464345 0.0860186579359 0.0600357970653 2469.64729385\n", "zero_count 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1\n", "missing_count 0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Look up the three Chicago data sets (weather, census, crimes)\n", "weather_path = h2o.locate(\"smalldata/chicago/chicagoAllWeather.csv\")\n", "census_path = h2o.locate(\"smalldata/chicago/chicagoCensus.csv\")\n", "crimes_path = h2o.locate(\"smalldata/chicago/chicagoCrimes10k.csv.zip\")\n", "\n", "print(\"Import and Parse weather data\")\n", "weather = h2o.import_frame(path=weather_path)\n", "# drop() returns a new frame rather than modifying `weather` in place,\n", "# so the result has to be rebound for the date column to actually go away\n", "weather = weather.drop(\"date\")\n", "weather.describe()\n", "\n", "print(\"Import and Parse census data\")\n", "census = h2o.import_frame(path=census_path)\n", "census.describe()\n", "\n", "print(\"Import and Parse crimes data\")\n", "crimes = h2o.import_frame(path=crimes_path)\n", "crimes.describe()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows: 9,999 Cols: 27\n", "\n", "Chunk compression summary:\n", "\n" ] }, {
"data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td>chunk_type</td>\n", "<td>chunk_name</td>\n", "<td>count</td>\n", "<td>count_percentage</td>\n", "<td>size</td>\n", "<td>size_percentage</td></tr>\n", "<tr><td>C0L</td>\n", "<td>Constant Integers</td>\n", "<td>9</td>\n", "<td>8.333334</td>\n", "<td> 720 B</td>\n", "<td>0.10067465</td></tr>\n", "<tr><td>C1</td>\n", "<td>1-Byte Integers</td>\n", "<td>32</td>\n", "<td>29.62963</td>\n", "<td> 80.2 KB</td>\n", "<td>11.489216</td></tr>\n", "<tr><td>C1N</td>\n", "<td>1-Byte Integers (w/o NAs)</td>\n", "<td>23</td>\n", "<td>21.296297</td>\n", "<td> 57.9 KB</td>\n", "<td>8.29671</td></tr>\n", "<tr><td>C2</td>\n", "<td>2-Byte Integers</td>\n", "<td>16</td>\n", "<td>14.814815</td>\n", "<td> 79.2 KB</td>\n", "<td>11.337085</td></tr>\n", "<tr><td>C4</td>\n", "<td>4-Byte Integers</td>\n", "<td>12</td>\n", "<td>11.111112</td>\n", "<td> 118.0 KB</td>\n", "<td>16.891531</td></tr>\n", "<tr><td>C8</td>\n", "<td>64-bit Integers</td>\n", "<td>4</td>\n", "<td>3.7037036</td>\n", "<td> 78.4 KB</td>\n", "<td>11.222987</td></tr>\n", "<tr><td>CStr</td>\n", "<td>String</td>\n", "<td>4</td>\n", "<td>3.7037036</td>\n", "<td> 127.2 KB</td>\n", "<td>18.215822</td></tr>\n", "<tr><td>C8D</td>\n", "<td>64-bit Reals</td>\n", "<td>8</td>\n", "<td>7.4074073</td>\n", "<td> 156.8 KB</td>\n", "<td>22.445974</td></tr></table></div>" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ -------- -----------------\n", "C0L Constant Integers 9 8.33333 720 B 0.100675\n", "C1 1-Byte Integers 32 29.6296 80.2 KB 11.4892\n", "C1N 1-Byte Integers (w/o NAs) 23 21.2963 57.9 KB 8.29671\n", "C2 2-Byte Integers 16 14.8148 79.2 KB 11.3371\n", "C4 4-Byte Integers 12 11.1111 118.0 KB 16.8915\n", "C8 64-bit Integers 4 3.7037 78.4 KB 11.223\n", "CStr String 4 3.7037 127.2 KB 18.2158\n", "C8D 64-bit Reals 8 7.40741 156.8 KB 22.446" ] }, 
"metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td></td>\n", "<td>size</td>\n", "<td>number_of_rows</td>\n", "<td>number_of_chunks_per_column</td>\n", "<td>number_of_chunks</td></tr>\n", "<tr><td>172.16.2.17:54321</td>\n", "<td> 698.4 KB</td>\n", "<td>9999.0</td>\n", "<td>4.0</td>\n", "<td>108.0</td></tr>\n", "<tr><td>mean</td>\n", "<td> 698.4 KB</td>\n", "<td>9999.0</td>\n", "<td>4.0</td>\n", "<td>108.0</td></tr>\n", "<tr><td>min</td>\n", "<td> 698.4 KB</td>\n", "<td>9999.0</td>\n", "<td>4.0</td>\n", "<td>108.0</td></tr>\n", "<tr><td>max</td>\n", "<td> 698.4 KB</td>\n", "<td>9999.0</td>\n", "<td>4.0</td>\n", "<td>108.0</td></tr>\n", "<tr><td>stddev</td>\n", "<td> 0 B</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td></tr>\n", "<tr><td>total</td>\n", "<td> 698.4 KB</td>\n", "<td>9999.0</td>\n", "<td>4.0</td>\n", "<td>108.0</td></tr></table></div>" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- -------- ---------------- ----------------------------- ------------------\n", "172.16.2.17:54321 698.4 KB 9999 4 108\n", "mean 698.4 KB 9999 4 108\n", "min 698.4 KB 9999 4 108\n", "max 698.4 KB 9999 4 108\n", "stddev 0 B 0 0 0\n", "total 698.4 KB 9999 4 108" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td></td>\n", "<td>ID</td>\n", "<td>Case Number</td>\n", "<td>Date</td>\n", "<td>Block</td>\n", "<td>IUCR</td>\n", "<td>Primary Type</td>\n", "<td>Description</td>\n", "<td>Location Description</td>\n", "<td>Arrest</td>\n", "<td>Domestic</td>\n", "<td>Beat</td>\n", "<td>District</td>\n", "<td>Ward</td>\n", "<td>Community 
Area</td>\n", "<td>FBI Code</td>\n", "<td>X Coordinate</td>\n", "<td>Y Coordinate</td>\n", "<td>Year</td>\n", "<td>Updated On</td>\n", "<td>Latitude</td>\n", "<td>Longitude</td>\n", "<td>Location</td>\n", "<td>Day</td>\n", "<td>Month</td>\n", "<td>WeekNum</td>\n", "<td>WeekDay</td>\n", "<td>HourOfDay</td></tr>\n", "<tr><td>type</td>\n", "<td>int</td>\n", "<td>string</td>\n", "<td>int</td>\n", "<td>enum</td>\n", "<td>int</td>\n", "<td>enum</td>\n", "<td>enum</td>\n", "<td>enum</td>\n", "<td>enum</td>\n", "<td>enum</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>enum</td>\n", "<td>real</td>\n", "<td>real</td>\n", "<td>enum</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>enum</td>\n", "<td>int</td></tr>\n", "<tr><td>mins</td>\n", "<td>21735.0</td>\n", "<td>NaN</td>\n", "<td>1.42203063e+12</td>\n", "<td>0.0</td>\n", "<td>110.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>111.0</td>\n", "<td>1.0</td>\n", "<td>1.0</td>\n", "<td>1.0</td>\n", "<td>2.0</td>\n", "<td>1100317.0</td>\n", "<td>1814255.0</td>\n", "<td>3915.0</td>\n", "<td>0.0</td>\n", "<td>41.64507243</td>\n", "<td>-87.906463888</td>\n", "<td>0.0</td>\n", "<td>1.0</td>\n", "<td>2.0</td>\n", "<td>4.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td></tr>\n", "<tr><td>maxs</td>\n", "<td>9962898.0</td>\n", "<td>NaN</td>\n", "<td>1.42346782e+12</td>\n", "<td>6517.0</td>\n", "<td>5131.0</td>\n", "<td>26.0</td>\n", "<td>198.0</td>\n", "<td>90.0</td>\n", "<td>1.0</td>\n", "<td>1.0</td>\n", "<td>2535.0</td>\n", "<td>25.0</td>\n", "<td>50.0</td>\n", "<td>77.0</td>\n", "<td>26.0</td>\n", "<td>1205069.0</td>\n", "<td>1951533.0</td>\n", "<td>3915.0</td>\n", "<td>32.0</td>\n", "<td>42.022646183</td>\n", "<td>-87.524773286</td>\n", "<td>8603.0</td>\n", "<td>31.0</td>\n", "<td>3.0</td>\n", "<td>6.0</td>\n", "<td>6.0</td>\n", "<td>23.0</td></tr>\n", 
"<tr><td>sigma</td>\n", "<td>396787.564221</td>\n", "<td>NaN</td>\n", "<td>433879245.188</td>\n", "<td>1915.88517194</td>\n", "<td>927.751435583</td>\n", "<td>9.16241735944</td>\n", "<td>60.1059382029</td>\n", "<td>25.5963972463</td>\n", "<td>0.455083515588</td>\n", "<td>0.35934414686</td>\n", "<td>695.76029875</td>\n", "<td>6.94547493301</td>\n", "<td>13.6495661144</td>\n", "<td>21.2748762223</td>\n", "<td>7.57423857911</td>\n", "<td>16496.4493681</td>\n", "<td>31274.0163199</td>\n", "<td>0.0</td>\n", "<td>10.0824464345</td>\n", "<td>0.0860186579359</td>\n", "<td>0.0600357970653</td>\n", "<td>2469.64729385</td>\n", "<td>11.1801043358</td>\n", "<td>0.493492406787</td>\n", "<td>0.738929830409</td>\n", "<td>1.93284056432</td>\n", "<td>6.47321735807</td></tr>\n", "<tr><td>zero_count</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>3</td>\n", "<td>0</td>\n", "<td>11</td>\n", "<td>933</td>\n", "<td>19</td>\n", "<td>7071</td>\n", "<td>8476</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>603</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>1</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>1038</td>\n", "<td>374</td></tr>\n", "<tr><td>missing_count</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>419</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>6</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>162</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>2557</td>\n", "<td>162</td>\n", "<td>162</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>162</td>\n", "<td>162</td>\n", "<td>162</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td></tr></table></div>" ], "text/plain": [ " ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum 
WeekDay HourOfDay\n", "------------- ------------- ------------- -------------- ------------- ------------- -------------- ------------- ---------------------- -------------- ------------- ------------ ------------- ------------- ---------------- ------------- -------------- -------------- ------ ------------- --------------- --------------- ------------- ------------- -------------- -------------- ------------- -------------\n", "type int string int enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int\n", "mins 21735.0 NaN 1.42203063e+12 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.0 0.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0\n", "maxs 9962898.0 NaN 1.42346782e+12 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.0 32.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0\n", "sigma 396787.564221 NaN 433879245.188 1915.88517194 927.751435583 9.16241735944 60.1059382029 25.5963972463 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 10.0824464345 0.0860186579359 0.0600357970653 2469.64729385 11.1801043358 0.493492406787 0.738929830409 1.93284056432 6.47321735807\n", "zero_count 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374\n", "missing_count 0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Rows: 9,999 Cols: 28\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td>chunk_type</td>\n", "<td>chunk_name</td>\n", "<td>count</td>\n", "<td>count_percentage</td>\n", "<td>size</td>\n", "<td>size_percentage</td></tr>\n", "<tr><td>C0L</td>\n", "<td>Constant Integers</td>\n", "<td>13</td>\n", "<td>11.607142</td>\n", 
"<td> 1.0 KB</td>\n", "<td>0.16332634</td></tr>\n", "<tr><td>CBS</td>\n", "<td>Bits</td>\n", "<td>4</td>\n", "<td>3.5714288</td>\n", "<td> 1.5 KB</td>\n", "<td>0.2404352</td></tr>\n", "<tr><td>C1</td>\n", "<td>1-Byte Integers</td>\n", "<td>32</td>\n", "<td>28.57143</td>\n", "<td> 80.2 KB</td>\n", "<td>12.9040365</td></tr>\n", "<tr><td>C1N</td>\n", "<td>1-Byte Integers (w/o NAs)</td>\n", "<td>23</td>\n", "<td>20.535715</td>\n", "<td> 57.9 KB</td>\n", "<td>9.318395</td></tr>\n", "<tr><td>C2</td>\n", "<td>2-Byte Integers</td>\n", "<td>16</td>\n", "<td>14.285715</td>\n", "<td> 79.2 KB</td>\n", "<td>12.733171</td></tr>\n", "<tr><td>C4</td>\n", "<td>4-Byte Integers</td>\n", "<td>12</td>\n", "<td>10.714286</td>\n", "<td> 118.0 KB</td>\n", "<td>18.97161</td></tr>\n", "<tr><td>CStr</td>\n", "<td>String</td>\n", "<td>4</td>\n", "<td>3.5714288</td>\n", "<td> 127.2 KB</td>\n", "<td>20.458979</td></tr>\n", "<tr><td>C8D</td>\n", "<td>64-bit Reals</td>\n", "<td>8</td>\n", "<td>7.1428576</td>\n", "<td> 156.8 KB</td>\n", "<td>25.210047</td></tr></table></div>" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ -------- -----------------\n", "C0L Constant Integers 13 11.6071 1.0 KB 0.163326\n", "CBS Bits 4 3.57143 1.5 KB 0.240435\n", "C1 1-Byte Integers 32 28.5714 80.2 KB 12.904\n", "C1N 1-Byte Integers (w/o NAs) 23 20.5357 57.9 KB 9.3184\n", "C2 2-Byte Integers 16 14.2857 79.2 KB 12.7332\n", "C4 4-Byte Integers 12 10.7143 118.0 KB 18.9716\n", "CStr String 4 3.57143 127.2 KB 20.459\n", "C8D 64-bit Reals 8 7.14286 156.8 KB 25.21" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td></td>\n", "<td>size</td>\n", "<td>number_of_rows</td>\n", "<td>number_of_chunks_per_column</td>\n", 
"<td>number_of_chunks</td></tr>\n", "<tr><td>172.16.2.17:54321</td>\n", "<td> 621.8 KB</td>\n", "<td>9999.0</td>\n", "<td>4.0</td>\n", "<td>112.0</td></tr>\n", "<tr><td>mean</td>\n", "<td> 621.8 KB</td>\n", "<td>9999.0</td>\n", "<td>4.0</td>\n", "<td>112.0</td></tr>\n", "<tr><td>min</td>\n", "<td> 621.8 KB</td>\n", "<td>9999.0</td>\n", "<td>4.0</td>\n", "<td>112.0</td></tr>\n", "<tr><td>max</td>\n", "<td> 621.8 KB</td>\n", "<td>9999.0</td>\n", "<td>4.0</td>\n", "<td>112.0</td></tr>\n", "<tr><td>stddev</td>\n", "<td> 0 B</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td></tr>\n", "<tr><td>total</td>\n", "<td> 621.8 KB</td>\n", "<td>9999.0</td>\n", "<td>4.0</td>\n", "<td>112.0</td></tr></table></div>" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- -------- ---------------- ----------------------------- ------------------\n", "172.16.2.17:54321 621.8 KB 9999 4 112\n", "mean 621.8 KB 9999 4 112\n", "min 621.8 KB 9999 4 112\n", "max 621.8 KB 9999 4 112\n", "stddev 0 B 0 0 0\n", "total 621.8 KB 9999 4 112" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td></td>\n", "<td>ID</td>\n", "<td>Case Number</td>\n", "<td>Block</td>\n", "<td>IUCR</td>\n", "<td>Primary Type</td>\n", "<td>Description</td>\n", "<td>Location Description</td>\n", "<td>Arrest</td>\n", "<td>Domestic</td>\n", "<td>Beat</td>\n", "<td>District</td>\n", "<td>Ward</td>\n", "<td>Community Area</td>\n", "<td>FBI Code</td>\n", "<td>X Coordinate</td>\n", "<td>Y Coordinate</td>\n", "<td>Year</td>\n", "<td>Updated On</td>\n", "<td>Latitude</td>\n", "<td>Longitude</td>\n", "<td>Location</td>\n", "<td>Day</td>\n", "<td>Month</td>\n", "<td>WeekNum</td>\n", "<td>WeekDay</td>\n", "<td>HourOfDay</td>\n", "<td>Weekend</td>\n", "<td>Season</td></tr>\n", 
"<tr><td>type</td>\n", "<td>int</td>\n", "<td>string</td>\n", "<td>enum</td>\n", "<td>int</td>\n", "<td>enum</td>\n", "<td>enum</td>\n", "<td>enum</td>\n", "<td>enum</td>\n", "<td>enum</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>enum</td>\n", "<td>real</td>\n", "<td>real</td>\n", "<td>enum</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>enum</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>enum</td></tr>\n", "<tr><td>mins</td>\n", "<td>21735.0</td>\n", "<td>NaN</td>\n", "<td>0.0</td>\n", "<td>110.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>111.0</td>\n", "<td>1.0</td>\n", "<td>1.0</td>\n", "<td>1.0</td>\n", "<td>2.0</td>\n", "<td>1100317.0</td>\n", "<td>1814255.0</td>\n", "<td>3915.0</td>\n", "<td>0.0</td>\n", "<td>41.64507243</td>\n", "<td>-87.906463888</td>\n", "<td>0.0</td>\n", "<td>1.0</td>\n", "<td>2.0</td>\n", "<td>4.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td></tr>\n", "<tr><td>maxs</td>\n", "<td>9962898.0</td>\n", "<td>NaN</td>\n", "<td>6517.0</td>\n", "<td>5131.0</td>\n", "<td>26.0</td>\n", "<td>198.0</td>\n", "<td>90.0</td>\n", "<td>1.0</td>\n", "<td>1.0</td>\n", "<td>2535.0</td>\n", "<td>25.0</td>\n", "<td>50.0</td>\n", "<td>77.0</td>\n", "<td>26.0</td>\n", "<td>1205069.0</td>\n", "<td>1951533.0</td>\n", "<td>3915.0</td>\n", "<td>32.0</td>\n", "<td>42.022646183</td>\n", "<td>-87.524773286</td>\n", "<td>8603.0</td>\n", "<td>31.0</td>\n", "<td>3.0</td>\n", "<td>6.0</td>\n", "<td>6.0</td>\n", "<td>23.0</td>\n", "<td>1.0</td>\n", "<td>1.0</td></tr>\n", "<tr><td>sigma</td>\n", "<td>396787.564221</td>\n", "<td>NaN</td>\n", "<td>1915.88517194</td>\n", "<td>927.751435583</td>\n", "<td>9.16241735944</td>\n", "<td>60.1059382029</td>\n", "<td>25.5963972463</td>\n", "<td>0.455083515588</td>\n", "<td>0.35934414686</td>\n", 
"<td>695.76029875</td>\n", "<td>6.94547493301</td>\n", "<td>13.6495661144</td>\n", "<td>21.2748762223</td>\n", "<td>7.57423857911</td>\n", "<td>16496.4493681</td>\n", "<td>31274.0163199</td>\n", "<td>0.0</td>\n", "<td>10.0824464345</td>\n", "<td>0.0860186579359</td>\n", "<td>0.0600357970653</td>\n", "<td>2469.64729385</td>\n", "<td>11.1801043358</td>\n", "<td>0.493492406787</td>\n", "<td>0.738929830409</td>\n", "<td>1.93284056432</td>\n", "<td>6.47321735807</td>\n", "<td>0.365802434041</td>\n", "<td>0.493492406787</td></tr>\n", "<tr><td>zero_count</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>3</td>\n", "<td>0</td>\n", "<td>11</td>\n", "<td>933</td>\n", "<td>19</td>\n", "<td>7071</td>\n", "<td>8476</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>603</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>1</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>1038</td>\n", "<td>374</td>\n", "<td>8408</td>\n", "<td>5805</td></tr>\n", "<tr><td>missing_count</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>419</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>6</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>162</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>2557</td>\n", "<td>162</td>\n", "<td>162</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>162</td>\n", "<td>162</td>\n", "<td>162</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td></tr></table></div>" ], "text/plain": [ " ID Case Number Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay Weekend Season\n", "------------- ------------- ------------- ------------- ------------- -------------- ------------- ---------------------- -------------- ------------- ------------ 
------------- ------------- ---------------- ------------- -------------- -------------- ------ ------------- --------------- --------------- ------------- ------------- -------------- -------------- ------------- ------------- -------------- --------------\n", "type int string enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int int enum\n", "mins 21735.0 NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.0 0.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0 0.0 0.0\n", "maxs 9962898.0 NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.0 32.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0 1.0 1.0\n", "sigma 396787.564221 NaN 1915.88517194 927.751435583 9.16241735944 60.1059382029 25.5963972463 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 10.0824464345 0.0860186579359 0.0600357970653 2469.64729385 11.1801043358 0.493492406787 0.738929830409 1.93284056432 6.47321735807 0.365802434041 0.493492406787\n", "zero_count 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374 8408 5805\n", "missing_count 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0 0 0" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def refine_date_col(data, col, pattern):\n", " data[col] = data[col].as_date(pattern)\n", " data[\"Day\"] = data[col].day()\n", " data[\"Month\"] = data[col].month() + 1 # Since H2O indexes from 0\n", " data[\"Year\"] = data[col].year() + 1900 # Start of epoch is 1900\n", " data[\"WeekNum\"] = data[col].week()\n", " data[\"WeekDay\"] = data[col].dayOfWeek()\n", " data[\"HourOfDay\"] = data[col].hour()\n", " \n", " data.describe() # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.\n", " \n", " # Create weekend and season cols\n", " # Spring = Mar, Apr, May. Summer = Jun, Jul, Aug. 
Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.\n", " # data[\"Weekend\"] = [1 if x in (\"Sun\", \"Sat\") else 0 for x in data[\"WeekDay\"]]\n", " data[\"Weekend\"] = h2o.ifelse(data[\"WeekDay\"] == \"Sun\" or data[\"WeekDay\"] == \"Sat\", 1, 0)[0]\n", " data[\"Season\"] = data[\"Month\"].cut([0, 2, 5, 7, 10, 12], [\"Winter\", \"Spring\", \"Summer\", \"Autumn\", \"Winter\"])\n", " \n", "refine_date_col(crimes, \"Date\", \"%m/%d/%Y %I:%M:%S %p\")\n", "crimes = crimes.drop(\"Date\")\n", "crimes.describe()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "ename": "EnvironmentError", "evalue": "h2o-py got an unexpected HTTP status code:\n 412 Precondition Failed (method = POST; url = http://localhost:54321/99/Rapids). \ndetailed error messages: water.DException$DistributedException: from /172.16.2.17:54321; by class water.rapids.ASTMerge$MergeSet$MakeHash; class water.exceptions.H2OIllegalArgumentException: unimplemented", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mEnvironmentError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m<ipython-input-5-e946a6af6204>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mweather\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"day\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0m_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Day\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mweather\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"year\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0m_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"Year\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mcrimes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmerge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcensus\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mallLeft\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mallRite\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0mcrimes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmerge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mweather\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mallLeft\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mallRite\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/frame.pyc\u001b[0m in \u001b[0;36mmerge\u001b[0;34m(self, other, allLeft, allRite)\u001b[0m\n\u001b[1;32m 1022\u001b[0m \u001b[0mexpr2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"(, \"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mexpr\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\" (del %\"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mlkey\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\" #0) (del %\"\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mrkey\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\" #0) )\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1023\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1024\u001b[0;31m \u001b[0mh2o\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrapids\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpr2\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# merge in h2o\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1025\u001b[0m \u001b[0;31m# Make backing H2OVecs for the remote h2o vecs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1026\u001b[0m \u001b[0mj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh2o\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mframe\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtmp_key\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Fetch the frame as JSON\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 
"\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/h2o.pyc\u001b[0m in \u001b[0;36mrapids\u001b[0;34m(expr)\u001b[0m\n\u001b[1;32m 487\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mThe\u001b[0m \u001b[0mJSON\u001b[0m \u001b[0mresponse\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mRapids\u001b[0m \u001b[0mexecution\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 488\u001b[0m \"\"\"\n\u001b[0;32m--> 489\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mH2OConnection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpost_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Rapids\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mast\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murllib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquote\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_rest_version\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m99\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 490\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'error'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 491\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mEnvironmentError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"rapids expression not evaluated: {0}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'error'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc\u001b[0m in \u001b[0;36mpost_json\u001b[0;34m(url_suffix, file_upload_info, **kwargs)\u001b[0m\n\u001b[1;32m 360\u001b[0m 
\u001b[0;32mif\u001b[0m \u001b[0m__H2OCONN__\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 361\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"No h2o connection. Did you run `h2o.init()` ?\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 362\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m__H2OCONN__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_rest_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_suffix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"POST\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_upload_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 363\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_rest_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl_suffix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_upload_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc\u001b[0m in \u001b[0;36m_rest_json\u001b[0;34m(self, url_suffix, method, file_upload_info, **kwargs)\u001b[0m\n\u001b[1;32m 363\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_rest_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl_suffix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_upload_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 
365\u001b[0;31m \u001b[0mraw_txt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_do_raw_rest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_suffix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_upload_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 366\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_process_tables\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_txt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 367\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc\u001b[0m in \u001b[0;36m_do_raw_rest\u001b[0;34m(self, url_suffix, method, file_upload_info, **kwargs)\u001b[0m\n\u001b[1;32m 429\u001b[0m raise EnvironmentError((\"h2o-py got an unexpected HTTP status code:\\n {} {} (method = {}; url = {}). \\n\"+ \\\n\u001b[1;32m 430\u001b[0m \"detailed error messages: {}\")\n\u001b[0;32m--> 431\u001b[0;31m .format(http_result.status_code,http_result.reason,method,url,detailed_error_msgs))\n\u001b[0m\u001b[1;32m 432\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 433\u001b[0m \u001b[0;31m# TODO: is.logging? -> write to logs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mEnvironmentError\u001b[0m: h2o-py got an unexpected HTTP status code:\n 412 Precondition Failed (method = POST; url = http://localhost:54321/99/Rapids). 
\ndetailed error messages: water.DException$DistributedException: from /172.16.2.17:54321; by class water.rapids.ASTMerge$MergeSet$MakeHash; class water.exceptions.H2OIllegalArgumentException: unimplemented" ] } ], "source": [ "# Merge crimes data with weather and census\n", "census[\"Community Area Number\"]._name = \"Community Area\"\n", "weather[\"month\"]._name = \"Month\"\n", "weather[\"day\"] ._name = \"Day\"\n", "weather[\"year\"] ._name = \"Year\"\n", "crimes.merge(census, allLeft=True, allRite=False)\n", "crimes.merge(weather, allLeft=True, allRite=False)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "ename": "NameError", "evalue": "name 'data' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m<ipython-input-12-347776b381b3>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Create test/train split\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdata_split\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh2o\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit_frame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mratios\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m0.8\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m0.2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mtrain\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata_split\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtest\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mdata_split\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'data' is not defined" ] } ], "source": [ "# Create test/train split\n", "data_split = h2o.split_frame(data, ratios = [0.8,0.2])\n", "train = data_split[1]\n", "test = data_split[2]\n", "\n", "# Simple GBM - Predict Arrest\n", "data_gbm = h2o.gbm(x =train.drop(\"Arrest\"),\n", " y =train [\"Arrest\"],\n", " validation_x =test .drop(\"Arrest\"),\n", " validation_y =test [\"Arrest\"],\n", " ntrees =10,\n", " max_depth =6,\n", " distribution =\"bernoulli\")\n", "\n", "# Simple Deep Learning\n", "data_dl = h2o.deeplearning(x =train.drop(\"Arrest\"),\n", " y =train [\"Arrest\"],\n", " validation_x =test .drop(\"Arrest\"),\n", " validation_y =test [\"Arrest\"],\n", " variable_importances=True,\n", " loss =\"Automatic\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "ename": "NameError", "evalue": "name 'data_gbm' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m<ipython-input-2-f7c2ab3a3e26>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# GBM performance on train/test data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtrain_auc_gbm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata_gbm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_performance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mtest_auc_gbm\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mdata_gbm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_performance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mauc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m# Deep Learning performance on train/test data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'data_gbm' is not defined" ] } ], "source": [ "# GBM performance on train/test data\n", "train_auc_gbm = data_gbm.model_performance(train).auc()\n", "test_auc_gbm = data_gbm.model_performance(test) .auc()\n", "\n", "# Deep Learning performance on train/test data\n", "train_auc_dl = data_dl.model_performance(train).auc()\n", "test_auc_dl = data_dl.model_performance(test) .auc()\n", "\n", "# Make a pretty HTML table printout of the results\n", "header = [\"Model\", \"AUC Train\", \"AUC Test\"]\n", "table = [\n", " [\"GBM\", train_auc_gbm, test_auc_gbm],\n", " [\"DL \", train_auc_dl, test_auc_dl]\n", " ]\n", "h2o.H2ODisplay(table, header)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Parse Progress: [##################################################] 100%\n", "Uploaded py634b18a9-7e84-40ca-b265-b2fe43e064aa into cluster with 2 rows and 10 cols\n", "Rows: 2 Cols: 16\n", "\n", "Chunk compression summary:\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td>chunk_type</td>\n", "<td>chunk_name</td>\n", "<td>count</td>\n", "<td>count_percentage</td>\n", "<td>size</td>\n", "<td>size_percentage</td></tr>\n", "<tr><td>C0L</td>\n", "<td>Constant Integers</td>\n", "<td>7</td>\n", "<td>43.75</td>\n", "<td> 560 B</td>\n", "<td>43.818466</td></tr>\n", "<tr><td>C1N</td>\n", "<td>1-Byte Integers (w/o NAs)</td>\n", 
"<td>4</td>\n", "<td>25.0</td>\n", "<td> 280 B</td>\n", "<td>21.909233</td></tr>\n", "<tr><td>C2</td>\n", "<td>2-Byte Integers</td>\n", "<td>2</td>\n", "<td>12.5</td>\n", "<td> 144 B</td>\n", "<td>11.267606</td></tr>\n", "<tr><td>C2S</td>\n", "<td>2-Byte Fractions</td>\n", "<td>1</td>\n", "<td>6.25</td>\n", "<td> 88 B</td>\n", "<td>6.885759</td></tr>\n", "<tr><td>CStr</td>\n", "<td>String</td>\n", "<td>2</td>\n", "<td>12.5</td>\n", "<td> 206 B</td>\n", "<td>16.118937</td></tr></table></div>" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ ------ -----------------\n", "C0L Constant Integers 7 43.75 560 B 43.8185\n", "C1N 1-Byte Integers (w/o NAs) 4 25 280 B 21.9092\n", "C2 2-Byte Integers 2 12.5 144 B 11.2676\n", "C2S 2-Byte Fractions 1 6.25 88 B 6.88576\n", "CStr String 2 12.5 206 B 16.1189" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary:\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td></td>\n", "<td>size</td>\n", "<td>number_of_rows</td>\n", "<td>number_of_chunks_per_column</td>\n", "<td>number_of_chunks</td></tr>\n", "<tr><td>172.16.2.17:54321</td>\n", "<td> 1.2 KB</td>\n", "<td>2.0</td>\n", "<td>1.0</td>\n", "<td>16.0</td></tr>\n", "<tr><td>mean</td>\n", "<td> 1.2 KB</td>\n", "<td>2.0</td>\n", "<td>1.0</td>\n", "<td>16.0</td></tr>\n", "<tr><td>min</td>\n", "<td> 1.2 KB</td>\n", "<td>2.0</td>\n", "<td>1.0</td>\n", "<td>16.0</td></tr>\n", "<tr><td>max</td>\n", "<td> 1.2 KB</td>\n", "<td>2.0</td>\n", "<td>1.0</td>\n", "<td>16.0</td></tr>\n", "<tr><td>stddev</td>\n", "<td> 0 B</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td></tr>\n", "<tr><td>total</td>\n", "<td> 1.2 KB</td>\n", "<td>2.0</td>\n", "<td>1.0</td>\n", "<td>16.0</td></tr></table></div>" ], "text/plain": [ " size number_of_rows 
number_of_chunks_per_column number_of_chunks\n", "----------------- ------ ---------------- ----------------------------- ------------------\n", "172.16.2.17:54321 1.2 KB 2 1 16\n", "mean 1.2 KB 2 1 16\n", "min 1.2 KB 2 1 16\n", "max 1.2 KB 2 1 16\n", "stddev 0 B 0 0 0\n", "total 1.2 KB 2 1 16" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Column-by-Column Summary:\n", "\n" ] }, { "data": { "text/html": [ "<div style=\"overflow:auto\"><table style=\"width:50%\"><tr><td></td>\n", "<td>Location.Description</td>\n", "<td>FBI.Code</td>\n", "<td>Primary.Type</td>\n", "<td>Community.Area</td>\n", "<td>District</td>\n", "<td>Beat</td>\n", "<td>Domestic</td>\n", "<td>IUCR</td>\n", "<td>Date</td>\n", "<td>Ward</td>\n", "<td>Day</td>\n", "<td>Month</td>\n", "<td>Year</td>\n", "<td>WeekNum</td>\n", "<td>WeekDay</td>\n", "<td>HourOfDay</td></tr>\n", "<tr><td>type</td>\n", "<td>string</td>\n", "<td>int</td>\n", "<td>string</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>enum</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>int</td>\n", "<td>enum</td>\n", "<td>int</td></tr>\n", "<tr><td>mins</td>\n", "<td>NaN</td>\n", "<td>11.0</td>\n", "<td>NaN</td>\n", "<td>46.0</td>\n", "<td>4.0</td>\n", "<td>422.0</td>\n", "<td>0.0</td>\n", "<td>1150.0</td>\n", "<td>1.423465239e+12</td>\n", "<td>7.0</td>\n", "<td>8.0</td>\n", "<td>3.0</td>\n", "<td>3915.0</td>\n", "<td>6.0</td>\n", "<td>6.0</td>\n", "<td>23.0</td></tr>\n", "<tr><td>maxs</td>\n", "<td>NaN</td>\n", "<td>18.0</td>\n", "<td>NaN</td>\n", "<td>63.0</td>\n", "<td>9.0</td>\n", "<td>923.0</td>\n", "<td>0.0</td>\n", "<td>1811.0</td>\n", "<td>1.423467838e+12</td>\n", "<td>14.0</td>\n", "<td>8.0</td>\n", "<td>3.0</td>\n", "<td>3915.0</td>\n", "<td>6.0</td>\n", "<td>6.0</td>\n", "<td>23.0</td></tr>\n", "<tr><td>sigma</td>\n", "<td>NaN</td>\n", "<td>4.94974746831</td>\n", 
"<td>NaN</td>\n", "<td>12.0208152802</td>\n", "<td>3.53553390593</td>\n", "<td>354.260497374</td>\n", "<td>0.0</td>\n", "<td>467.397582364</td>\n", "<td>1837770.5243</td>\n", "<td>4.94974746831</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td>\n", "<td>0.0</td></tr>\n", "<tr><td>zero_count</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>2</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td></tr>\n", "<tr><td>missing_count</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td>\n", "<td>0</td></tr></table></div>" ], "text/plain": [ " Location.Description FBI.Code Primary.Type Community.Area District Beat Domestic IUCR Date Ward Day Month Year WeekNum WeekDay HourOfDay\n", "------------- ---------------------- ------------- -------------- ---------------- ------------- ------------- ---------- ------------- --------------- ------------- ----- ------- ------ --------- --------- -----------\n", "type string int string int int int enum int int int int int int int enum int\n", "mins NaN 11.0 NaN 46.0 4.0 422.0 0.0 1150.0 1.423465239e+12 7.0 8.0 3.0 3915.0 6.0 6.0 23.0\n", "maxs NaN 18.0 NaN 63.0 9.0 923.0 0.0 1811.0 1.423467838e+12 14.0 8.0 3.0 3915.0 6.0 6.0 23.0\n", "sigma NaN 4.94974746831 NaN 12.0208152802 3.53553390593 354.260497374 0.0 467.397582364 1837770.5243 4.94974746831 0.0 0.0 0.0 0.0 0.0 0.0\n", "zero_count 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0\n", "missing_count 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0" ] }, "metadata": {}, "output_type": "display_data" }, { "ename": "EnvironmentError", "evalue": "h2o-py got an unexpected HTTP status code:\n 412 Precondition Failed (method = POST; url = 
http://localhost:54321/99/Rapids). \ndetailed error messages: Data vector is constant!", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mEnvironmentError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m<ipython-input-6-85bb7c75c897>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0;31m# Refine date column and merge with census data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 18\u001b[0;31m \u001b[0mrefine_date_col\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcrime_examples\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Date\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"%m/%d/%Y %I:%M:%S %p\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 19\u001b[0m \u001b[0mcrime_examples\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Date\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0mcrime_examples\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmerge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcensus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mallLeft\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mallRite\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m<ipython-input-4-c2702228f9f1>\u001b[0m in \u001b[0;36mrefine_date_col\u001b[0;34m(data, col, pattern)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;31m# data[\"Weekend\"] = h2o.ifelse(data[\"WeekDay\"] in (\"Sun\", \"Sat\"), 1, 0)[0]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Weekend\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mh2o\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mifelse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"WeekDay\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"Sun\"\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"WeekDay\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"Sat\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Season\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Month\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcut\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m7\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m12\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"Winter\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Spring\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Summer\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Autumn\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Winter\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0mrefine_date_col\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcrimes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Date\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"%m/%d/%Y %I:%M:%S %p\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 
"\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/frame.pyc\u001b[0m in \u001b[0;36mcut\u001b[0;34m(self, breaks, labels, include_lowest, right, dig_lab)\u001b[0m\n\u001b[1;32m 1256\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1257\u001b[0m \u001b[0mexpr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"(cut '{}' {} {} {} {} #{}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbreaks_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"%TRUE\"\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minclude_lowest\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m\"%FALSE\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"%TRUE\"\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mright\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m\"%FALSE\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdig_lab\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1258\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh2o\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrapids\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1259\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mH2OVec\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mExpr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mop\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mres\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"vec_ids\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"name\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mlength\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mres\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"num_rows\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1260\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/h2o.pyc\u001b[0m in \u001b[0;36mrapids\u001b[0;34m(expr)\u001b[0m\n\u001b[1;32m 487\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mThe\u001b[0m \u001b[0mJSON\u001b[0m \u001b[0mresponse\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mRapids\u001b[0m \u001b[0mexecution\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 488\u001b[0m \"\"\"\n\u001b[0;32m--> 489\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mH2OConnection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpost_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Rapids\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mast\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0murllib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquote\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexpr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_rest_version\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m99\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 490\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'error'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 491\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mEnvironmentError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"rapids expression not evaluated: 
{0}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'error'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc\u001b[0m in \u001b[0;36mpost_json\u001b[0;34m(url_suffix, file_upload_info, **kwargs)\u001b[0m\n\u001b[1;32m 360\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0m__H2OCONN__\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 361\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"No h2o connection. Did you run `h2o.init()` ?\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 362\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m__H2OCONN__\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_rest_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_suffix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"POST\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_upload_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 363\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_rest_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl_suffix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_upload_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc\u001b[0m in \u001b[0;36m_rest_json\u001b[0;34m(self, url_suffix, method, file_upload_info, 
**kwargs)\u001b[0m\n\u001b[1;32m 363\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_rest_json\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl_suffix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_upload_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 365\u001b[0;31m \u001b[0mraw_txt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_do_raw_rest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_suffix\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfile_upload_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 366\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_process_tables\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_txt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 367\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc\u001b[0m in \u001b[0;36m_do_raw_rest\u001b[0;34m(self, url_suffix, method, file_upload_info, **kwargs)\u001b[0m\n\u001b[1;32m 429\u001b[0m raise EnvironmentError((\"h2o-py got an unexpected HTTP status code:\\n {} {} (method = {}; url = {}). \\n\"+ \\\n\u001b[1;32m 430\u001b[0m \"detailed error messages: {}\")\n\u001b[0;32m--> 431\u001b[0;31m .format(http_result.status_code,http_result.reason,method,url,detailed_error_msgs))\n\u001b[0m\u001b[1;32m 432\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 433\u001b[0m \u001b[0;31m# TODO: is.logging? 
-> write to logs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mEnvironmentError\u001b[0m: h2o-py got an unexpected HTTP status code:\n 412 Precondition Failed (method = POST; url = http://localhost:54321/99/Rapids). \ndetailed error messages: Data vector is constant!" ] } ], "source": [ "# Create new H2OFrame of crime observations\n", "examples = {\n", " \"Date\": [\"02/08/2015 11:43:58 PM\", \"02/08/2015 11:00:39 PM\"],\n", " \"IUCR\": [1811, 1150],\n", " \"Primary.Type\": [\"NARCOTICS\", \"DECEPTIVE PRACTICE\"],\n", " \"Location.Description\": [\"STREET\", \"RESIDENCE\"],\n", " \"Domestic\": [\"false\", \"false\"],\n", " \"Beat\": [422, 923],\n", " \"District\": [4, 9],\n", " \"Ward\": [7, 14],\n", " \"Community.Area\": [46, 63],\n", " \"FBI.Code\": [18, 11]\n", " }\n", "\n", "crime_examples = h2o.H2OFrame(python_obj=examples)\n", "\n", "# Refine date column and merge with census data\n", "# NOTE(review): the saved output of this cell is a failure inside refine_date_col:\n", "# data[\"Month\"].cut(...) is rejected with 'Data vector is constant!'. Both example\n", "# dates are in February, so Month presumably holds a single value -- confirm and\n", "# handle that case in refine_date_col before relying on this cell.\n", "refine_date_col(crime_examples, \"Date\", \"%m/%d/%Y %I:%M:%S %p\")\n", "\n", "# Keep the frames returned by drop() and merge(); as bare statements their results\n", "# were thrown away and the prediction cell would see the original frame.\n", "# NOTE(review): assumes both return new frames rather than mutating in place --\n", "# confirm against the installed h2o-py.\n", "crime_examples = crime_examples.drop(\"Date\")\n", "crime_examples = crime_examples.merge(census, allLeft=True, allRite=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Predict probability of arrest from new observations\n", "gbm_pred = data_gbm.predict(crime_examples)\n", "dl_pred = data_dl .predict(crime_examples)\n", "\n", "# TODO: Replace with a pretty HTML table\n", "gbm_pred.describe()\n", "dl_pred.describe()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }