{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import h2o\n",
"from h2o.estimators.gbm import H2OGradientBoostingEstimator\n",
"from h2o.estimators.deeplearning import H2ODeepLearningEstimator"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"
| H2O cluster uptime: | \n",
"1 hours 13 minutes 22 seconds 521 milliseconds |
\n",
"| H2O cluster version: | \n",
"3.7.0.99999 |
\n",
"| H2O cluster name: | \n",
"ludirehak |
\n",
"| H2O cluster total nodes: | \n",
"1 |
\n",
"| H2O cluster total free memory: | \n",
"3.24 GB |
\n",
"| H2O cluster total cores: | \n",
"8 |
\n",
"| H2O cluster allowed cores: | \n",
"8 |
\n",
"| H2O cluster healthy: | \n",
"True |
\n",
"| H2O Connection ip: | \n",
"127.0.0.1 |
\n",
"| H2O Connection port: | \n",
"54321 |
\n",
"| H2O Connection proxy: | \n",
"None |
\n",
"| Python Version: | \n",
"3.5.1 |
"
],
"text/plain": [
"------------------------------ ----------------------------------------------\n",
"H2O cluster uptime: 1 hours 13 minutes 22 seconds 521 milliseconds\n",
"H2O cluster version: 3.7.0.99999\n",
"H2O cluster name: ludirehak\n",
"H2O cluster total nodes: 1\n",
"H2O cluster total free memory: 3.24 GB\n",
"H2O cluster total cores: 8\n",
"H2O cluster allowed cores: 8\n",
"H2O cluster healthy: True\n",
"H2O Connection ip: 127.0.0.1\n",
"H2O Connection port: 54321\n",
"H2O Connection proxy:\n",
"Python Version: 3.5.1\n",
"------------------------------ ----------------------------------------------"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Attach this session to an H2O cluster; the cluster details are shown in the cell output.\n",
"h2o.init()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Import and Parse weather data\n",
"\n",
"Parse Progress: [##################################################] 100%\n",
"Rows:5,162 Cols:7\n",
"\n",
"Chunk compression summary: \n"
]
},
{
"data": {
"text/html": [
"| chunk_type | \n",
"chunk_name | \n",
"count | \n",
"count_percentage | \n",
"size | \n",
"size_percentage |
\n",
"| C0L | \n",
"Constant Integers | \n",
"19 | \n",
"8.225108 | \n",
" 1.5 KB | \n",
"3.3811588 |
\n",
"| C0D | \n",
"Constant Reals | \n",
"33 | \n",
"14.285715 | \n",
" 2.6 KB | \n",
"5.872539 |
\n",
"| C1 | \n",
"1-Byte Integers | \n",
"8 | \n",
"3.4632034 | \n",
" 1.7 KB | \n",
"3.9528418 |
\n",
"| C1N | \n",
"1-Byte Integers (w/o NAs) | \n",
"135 | \n",
"58.441555 | \n",
" 29.6 KB | \n",
"67.35625 |
\n",
"| C1S | \n",
"1-Byte Fractions | \n",
"36 | \n",
"15.584415 | \n",
" 8.5 KB | \n",
"19.437214 |
"
],
"text/plain": [
"chunk_type chunk_name count count_percentage size size_percentage\n",
"------------ ------------------------- ------- ------------------ ------- -----------------\n",
"C0L Constant Integers 19 8.22511 1.5 KB 3.38116\n",
"C0D Constant Reals 33 14.2857 2.6 KB 5.87254\n",
"C1 1-Byte Integers 8 3.4632 1.7 KB 3.95284\n",
"C1N 1-Byte Integers (w/o NAs) 135 58.4416 29.6 KB 67.3563\n",
"C1S 1-Byte Fractions 36 15.5844 8.5 KB 19.4372"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Frame distribution summary: \n"
]
},
{
"data": {
"text/html": [
" | \n",
"size | \n",
"number_of_rows | \n",
"number_of_chunks_per_column | \n",
"number_of_chunks |
\n",
"| 172.16.2.43:54321 | \n",
" 43.9 KB | \n",
"5162.0 | \n",
"33.0 | \n",
"231.0 |
\n",
"| mean | \n",
" 43.9 KB | \n",
"5162.0 | \n",
"33.0 | \n",
"231.0 |
\n",
"| min | \n",
" 43.9 KB | \n",
"5162.0 | \n",
"33.0 | \n",
"231.0 |
\n",
"| max | \n",
" 43.9 KB | \n",
"5162.0 | \n",
"33.0 | \n",
"231.0 |
\n",
"| stddev | \n",
" 0 B | \n",
"0.0 | \n",
"0.0 | \n",
"0.0 |
\n",
"| total | \n",
" 43.9 KB | \n",
"5162.0 | \n",
"33.0 | \n",
"231.0 |
"
],
"text/plain": [
" size number_of_rows number_of_chunks_per_column number_of_chunks\n",
"----------------- ------- ---------------- ----------------------------- ------------------\n",
"172.16.2.43:54321 43.9 KB 5162 33 231\n",
"mean 43.9 KB 5162 33 231\n",
"min 43.9 KB 5162 33 231\n",
"max 43.9 KB 5162 33 231\n",
"stddev 0 B 0 0 0\n",
"total 43.9 KB 5162 33 231"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"| | date | month | day | year | maxTemp | meanTemp | minTemp |
\n",
"| type | time | int | int | int | int | int | int |
\n",
"| mins | NaN | 1.0 | 1.0 | 2001.0 | -2.0 | -9.0 | -18.0 |
\n",
"| mean | 0.0 | 6.47442851607904 | 15.708252615265401 | 2007.5714839209609 | 58.871042920955524 | 50.31035152456788 | 41.4812584967955 |
\n",
"| maxs | NaN | 12.0 | 31.0 | 2015.0 | 103.0 | 93.0 | 82.0 |
\n",
"| sigma | -0.0 | 3.469051716937685 | 8.798951739966594 | 4.077340905700527 | 21.482977723685387 | 19.930239926608884 | 19.020729712312264 |
\n",
"| zeros | -5162 | 0 | 0 | 0 | 0 | 2 | 16 |
\n",
"| missing | 5162 | 0 | 0 | 0 | 13 | 13 | 13 |
\n",
"| 0 | nan | 1.0 | 1.0 | 2001.0 | 23.0 | 14.0 | 6.0 |
\n",
"| 1 | nan | 1.0 | 2.0 | 2001.0 | 18.0 | 12.0 | 6.0 |
\n",
"| 2 | nan | 1.0 | 3.0 | 2001.0 | 28.0 | 18.0 | 8.0 |
\n",
"| 3 | nan | 1.0 | 4.0 | 2001.0 | 30.0 | 24.0 | 19.0 |
\n",
"| 4 | nan | 1.0 | 5.0 | 2001.0 | 36.0 | 30.0 | 21.0 |
\n",
"| 5 | nan | 1.0 | 6.0 | 2001.0 | 33.0 | 26.0 | 19.0 |
\n",
"| 6 | nan | 1.0 | 7.0 | 2001.0 | 34.0 | 28.0 | 21.0 |
\n",
"| 7 | nan | 1.0 | 8.0 | 2001.0 | 26.0 | 20.0 | 14.0 |
\n",
"| 8 | nan | 1.0 | 9.0 | 2001.0 | 23.0 | 16.0 | 10.0 |
\n",
"| 9 | nan | 1.0 | 10.0 | 2001.0 | 34.0 | 26.0 | 19.0 |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Import and Parse census data\n",
"\n",
"Parse Progress: [##################################################] 100%\n",
"Rows:79 Cols:9\n",
"\n",
"Chunk compression summary: \n"
]
},
{
"data": {
"text/html": [
"| chunk_type | \n",
"chunk_name | \n",
"count | \n",
"count_percentage | \n",
"size | \n",
"size_percentage |
\n",
"| C1 | \n",
"1-Byte Integers | \n",
"3 | \n",
"33.333336 | \n",
" 441 B | \n",
"22.546013 |
\n",
"| C1S | \n",
"1-Byte Fractions | \n",
"1 | \n",
"11.111112 | \n",
" 163 B | \n",
"8.333334 |
\n",
"| C2S | \n",
"2-Byte Fractions | \n",
"4 | \n",
"44.444447 | \n",
" 968 B | \n",
"49.488754 |
\n",
"| C4 | \n",
"4-Byte Integers | \n",
"1 | \n",
"11.111112 | \n",
" 384 B | \n",
"19.6319 |
"
],
"text/plain": [
"chunk_type chunk_name count count_percentage size size_percentage\n",
"------------ ---------------- ------- ------------------ ------ -----------------\n",
"C1 1-Byte Integers 3 33.3333 441 B 22.546\n",
"C1S 1-Byte Fractions 1 11.1111 163 B 8.33333\n",
"C2S 2-Byte Fractions 4 44.4444 968 B 49.4888\n",
"C4 4-Byte Integers 1 11.1111 384 B 19.6319"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Frame distribution summary: \n"
]
},
{
"data": {
"text/html": [
" | \n",
"size | \n",
"number_of_rows | \n",
"number_of_chunks_per_column | \n",
"number_of_chunks |
\n",
"| 172.16.2.43:54321 | \n",
" 1.9 KB | \n",
"79.0 | \n",
"1.0 | \n",
"9.0 |
\n",
"| mean | \n",
" 1.9 KB | \n",
"79.0 | \n",
"1.0 | \n",
"9.0 |
\n",
"| min | \n",
" 1.9 KB | \n",
"79.0 | \n",
"1.0 | \n",
"9.0 |
\n",
"| max | \n",
" 1.9 KB | \n",
"79.0 | \n",
"1.0 | \n",
"9.0 |
\n",
"| stddev | \n",
" 0 B | \n",
"0.0 | \n",
"0.0 | \n",
"0.0 |
\n",
"| total | \n",
" 1.9 KB | \n",
"79.0 | \n",
"1.0 | \n",
"9.0 |
"
],
"text/plain": [
" size number_of_rows number_of_chunks_per_column number_of_chunks\n",
"----------------- ------ ---------------- ----------------------------- ------------------\n",
"172.16.2.43:54321 1.9 KB 79 1 9\n",
"mean 1.9 KB 79 1 9\n",
"min 1.9 KB 79 1 9\n",
"max 1.9 KB 79 1 9\n",
"stddev 0 B 0 0 0\n",
"total 1.9 KB 79 1 9"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"| | Community Area Number | COMMUNITY AREA NAME | PERCENT OF HOUSING CROWDED | PERCENT HOUSEHOLDS BELOW POVERTY | PERCENT AGED 16 UNEMPLOYED | PERCENT AGED 25 WITHOUT HIGH SCHOOL DIPLOMA | PERCENT AGED UNDER 18 OR OVER 64 | PER CAPITA INCOME | HARDSHIP INDEX |
\n",
"| type | int | enum | real | real | real | real | real | int | int |
\n",
"| mins | 1.0 | 0.0 | 0.30000000000000004 | 3.3000000000000003 | 4.7 | 2.5 | 13.5 | 8201.0 | 1.0 |
\n",
"| mean | 39.0 | NaN | 4.920512820512822 | 21.73974358974359 | 15.341025641025642 | 20.33076923076924 | 35.71794871794871 | 25597.000000000004 | 49.506493506493506 |
\n",
"| maxs | 77.0 | 78.0 | 15.8 | 56.5 | 35.9 | 54.800000000000004 | 51.5 | 88669.0 | 98.0 |
\n",
"| sigma | 22.371857321197094 | NaN | 3.6589814413502006 | 11.457230912971083 | 7.49949670860991 | 11.746514351100048 | 7.284421084944952 | 15196.405541331917 | 28.69055565156158 |
\n",
"| zeros | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
\n",
"| missing | 2 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 2 |
\n",
"| 0 | nan | COMMUNITY AREA NAME | nan | nan | nan | nan | nan | nan | nan |
\n",
"| 1 | 1.0 | Rogers Park | 7.7 | 23.6 | 8.700000000000001 | 18.2 | 27.5 | 23939.0 | 39.0 |
\n",
"| 2 | 2.0 | West Ridge | 7.800000000000001 | 17.2 | 8.8 | 20.8 | 38.5 | 23040.0 | 46.0 |
\n",
"| 3 | 3.0 | Uptown | 3.8000000000000003 | 24.0 | 8.9 | 11.8 | 22.200000000000003 | 35787.0 | 20.0 |
\n",
"| 4 | 4.0 | Lincoln Square | 3.4000000000000004 | 10.9 | 8.200000000000001 | 13.4 | 25.5 | 37524.0 | 17.0 |
\n",
"| 5 | 5.0 | North Center | 0.30000000000000004 | 7.5 | 5.2 | 4.5 | 26.200000000000003 | 57123.0 | 6.0 |
\n",
"| 6 | 6.0 | Lake View | 1.1 | 11.4 | 4.7 | 2.6 | 17.0 | 60058.0 | 5.0 |
\n",
"| 7 | 7.0 | Lincoln Park | 0.8 | 12.3 | 5.1000000000000005 | 3.6 | 21.5 | 71551.0 | 2.0 |
\n",
"| 8 | 8.0 | Near North Side | 1.9000000000000001 | 12.9 | 7.0 | 2.5 | 22.6 | 88669.0 | 1.0 |
\n",
"| 9 | 9.0 | Edison Park | 1.1 | 3.3000000000000003 | 6.5 | 7.4 | 35.300000000000004 | 40959.0 | 8.0 |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Import and Parse crimes data\n",
"\n",
"Parse Progress: [##################################################] 100%\n",
"Rows:9,999 Cols:22\n",
"\n",
"Chunk compression summary: \n"
]
},
{
"data": {
"text/html": [
"| chunk_type | \n",
"chunk_name | \n",
"count | \n",
"count_percentage | \n",
"size | \n",
"size_percentage |
\n",
"| C0L | \n",
"Constant Integers | \n",
"1 | \n",
"4.5454545 | \n",
" 80 B | \n",
"0.0092869 |
\n",
"| C1 | \n",
"1-Byte Integers | \n",
"8 | \n",
"36.363636 | \n",
" 78.6 KB | \n",
"9.349084 |
\n",
"| C1N | \n",
"1-Byte Integers (w/o NAs) | \n",
"2 | \n",
"9.090909 | \n",
" 19.7 KB | \n",
"2.337271 |
\n",
"| C2 | \n",
"2-Byte Integers | \n",
"4 | \n",
"18.181818 | \n",
" 78.4 KB | \n",
"9.317509 |
\n",
"| C4 | \n",
"4-Byte Integers | \n",
"3 | \n",
"13.636364 | \n",
" 117.4 KB | \n",
"13.952581 |
\n",
"| CStr | \n",
"String | \n",
"2 | \n",
"9.090909 | \n",
" 390.7 KB | \n",
"46.446617 |
\n",
"| C8D | \n",
"64-bit Reals | \n",
"2 | \n",
"9.090909 | \n",
" 156.4 KB | \n",
"18.587654 |
"
],
"text/plain": [
"chunk_type chunk_name count count_percentage size size_percentage\n",
"------------ ------------------------- ------- ------------------ -------- -----------------\n",
"C0L Constant Integers 1 4.54545 80 B 0.00928686\n",
"C1 1-Byte Integers 8 36.3636 78.6 KB 9.34908\n",
"C1N 1-Byte Integers (w/o NAs) 2 9.09091 19.7 KB 2.33727\n",
"C2 2-Byte Integers 4 18.1818 78.4 KB 9.31751\n",
"C4 4-Byte Integers 3 13.6364 117.4 KB 13.9526\n",
"CStr String 2 9.09091 390.7 KB 46.4466\n",
"C8D 64-bit Reals 2 9.09091 156.4 KB 18.5877"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Frame distribution summary: \n"
]
},
{
"data": {
"text/html": [
" | \n",
"size | \n",
"number_of_rows | \n",
"number_of_chunks_per_column | \n",
"number_of_chunks |
\n",
"| 172.16.2.43:54321 | \n",
" 841.2 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"22.0 |
\n",
"| mean | \n",
" 841.2 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"22.0 |
\n",
"| min | \n",
" 841.2 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"22.0 |
\n",
"| max | \n",
" 841.2 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"22.0 |
\n",
"| stddev | \n",
" 0 B | \n",
"0.0 | \n",
"0.0 | \n",
"0.0 |
\n",
"| total | \n",
" 841.2 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"22.0 |
"
],
"text/plain": [
" size number_of_rows number_of_chunks_per_column number_of_chunks\n",
"----------------- -------- ---------------- ----------------------------- ------------------\n",
"172.16.2.43:54321 841.2 KB 9999 1 22\n",
"mean 841.2 KB 9999 1 22\n",
"min 841.2 KB 9999 1 22\n",
"max 841.2 KB 9999 1 22\n",
"stddev 0 B 0 0 0\n",
"total 841.2 KB 9999 1 22"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"| | ID | Case Number | Date | Block | IUCR | Primary Type | Description | Location Description | Arrest | Domestic | Beat | District | Ward | Community Area | FBI Code | X Coordinate | Y Coordinate | Year | Updated On | Latitude | Longitude | Location |
\n",
"| type | int | string | string | enum | int | enum | enum | enum | enum | enum | int | int | int | int | int | int | int | int | enum | real | real | enum |
\n",
"| mins | 21735.0 | NaN | NaN | 0.0 | 110.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 111.0 | 1.0 | 1.0 | 1.0 | 2.0 | 1100317.0 | 1814255.0 | 2015.0 | 0.0 | 41.64507243 | -87.906463888 | 0.0 |
\n",
"| mean | 9931318.737373699 | NaN | NaN | NaN | 1189.676513569939 | NaN | NaN | NaN | 0.29282928292829274 | 0.15231523152315235 | 1159.6180618061765 | 11.348988512757918 | 22.954095409541008 | 37.447644764476536 | 12.740123622682114 | 1163880.5981498407 | 1885916.1498424308 | 2015.0 | NaN | 41.842565224673535 | -87.67414052209607 | NaN |
\n",
"| maxs | 9962898.0 | NaN | NaN | 6517.0 | 5131.0 | 26.0 | 198.0 | 90.0 | 1.0 | 1.0 | 2535.0 | 25.0 | 50.0 | 77.0 | 26.0 | 1205069.0 | 1951533.0 | 2015.0 | 32.0 | 42.022646183 | -87.524773286 | 8603.0 |
\n",
"| sigma | 396787.5642214295 | NaN | NaN | NaN | 927.7514355826443 | NaN | NaN | NaN | 0.4550835155878833 | 0.3593441468595258 | 695.7602987498396 | 6.945474933012859 | 13.649566114361296 | 21.274876222320856 | 7.574238579108433 | 16496.449368147238 | 31274.01631985589 | 0.0 | NaN | 0.08601865793584824 | 0.06003579706529789 | NaN |
\n",
"| zeros | 0 | 0 | 0 | 3 | 0 | 11 | 933 | 19 | 7071 | 8476 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 603 | 0 | 0 | 1 |
\n",
"| missing | 0 | 0 | 0 | 0 | 419 | 0 | 0 | 6 | 0 | 0 | 0 | 162 | 0 | 0 | 2557 | 162 | 162 | 0 | 0 | 162 | 162 | 162 |
\n",
"| 0 | 9955810.0 | HY144797 | 02/08/2015 11:43:40 PM | 081XX S COLES AVE | 1811.0 | NARCOTICS | POSS: CANNABIS 30GMS OR LESS | STREET | true | false | 422.0 | 4.0 | 7.0 | 46.0 | 18.0 | 1198273.0 | 1851626.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.747693646 | -87.54903538900001 | (41.747693646, -87.549035389) |
\n",
"| 1 | 9955861.0 | HY144838 | 02/08/2015 11:41:42 PM | 118XX S STATE ST | 486.0 | BATTERY | DOMESTIC BATTERY SIMPLE | APARTMENT | true | true | 522.0 | 5.0 | 34.0 | 53.0 | nan | 1178335.0 | 1826581.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.679442289 | -87.622850758 | (41.679442289, -87.622850758) |
\n",
"| 2 | 9955801.0 | HY144779 | 02/08/2015 11:30:22 PM | 002XX S LARAMIE AVE | 2026.0 | NARCOTICS | POSS: PCP | SIDEWALK | true | false | 1522.0 | 15.0 | 29.0 | 25.0 | 18.0 | 1141717.0 | 1898581.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.877773330000004 | -87.755117993 | (41.87777333, -87.755117993) |
\n",
"| 3 | 9956197.0 | HY144787 | 02/08/2015 11:30:23 PM | 006XX E 67TH ST | 1811.0 | NARCOTICS | POSS: CANNABIS 30GMS OR LESS | STREET | true | false | 321.0 | nan | 6.0 | 42.0 | 18.0 | nan | nan | 2015.0 | 02/15/2015 12:43:39 PM | nan | nan | |
\n",
"| 4 | 9955846.0 | HY144829 | 02/08/2015 11:30:58 PM | 0000X S MAYFIELD AVE | 610.0 | BURGLARY | FORCIBLE ENTRY | APARTMENT | false | false | 1513.0 | 15.0 | 29.0 | 25.0 | 5.0 | 1137239.0 | 1899372.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.880025548000006 | -87.77154132400001 | (41.880025548, -87.771541324) |
\n",
"| 5 | 9955835.0 | HY144778 | 02/08/2015 11:30:21 PM | 010XX W 48TH ST | 486.0 | BATTERY | DOMESTIC BATTERY SIMPLE | APARTMENT | false | true | 933.0 | 9.0 | 3.0 | 61.0 | nan | 1169986.0 | 1873019.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.807059405000004 | -87.65206589 | (41.807059405, -87.65206589) |
\n",
"| 6 | 9955872.0 | HY144822 | 02/08/2015 11:27:24 PM | 015XX W ARTHUR AVE | 1320.0 | CRIMINAL DAMAGE | TO VEHICLE | STREET | false | false | 2432.0 | 24.0 | 40.0 | 1.0 | 14.0 | 1164732.0 | 1943222.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.999814056000005 | -87.669342967 | (41.999814056, -87.669342967) |
\n",
"| 7 | 21752.0 | HY144738 | 02/08/2015 11:26:12 PM | 060XX W GRAND AVE | 110.0 | HOMICIDE | FIRST DEGREE MURDER | STREET | true | false | 2512.0 | 25.0 | 37.0 | 19.0 | nan | 1135910.0 | 1914206.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.920755683 | -87.776067514 | (41.920755683, -87.776067514) |
\n",
"| 8 | 9955808.0 | HY144775 | 02/08/2015 11:20:33 PM | 001XX W WACKER DR | 460.0 | BATTERY | SIMPLE | OTHER | false | false | 122.0 | 1.0 | 42.0 | 32.0 | nan | 1175384.0 | 1902088.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.886707818000005 | -87.63139635600001 | (41.886707818, -87.631396356) |
\n",
"| 9 | 9958275.0 | HY146732 | 02/08/2015 11:15:36 PM | 001XX W WACKER DR | 460.0 | BATTERY | SIMPLE | HOTEL/MOTEL | false | false | 122.0 | 1.0 | 42.0 | 32.0 | nan | 1175384.0 | 1902088.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.886707818000005 | -87.63139635600001 | (41.886707818, -87.631396356) |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# `_locate` is a private h2o helper; it is used to find files within the h2o git project directory.\n",
"from h2o.utils.shared_utils import _locate\n",
"\n",
"weather_path = _locate(\"smalldata/chicago/chicagoAllWeather.csv\")\n",
"census_path = _locate(\"smalldata/chicago/chicagoCensus.csv\")\n",
"crimes_path = _locate(\"smalldata/chicago/chicagoCrimes10k.csv.zip\")\n",
"\n",
"# Weather: the first column is parsed as time, the remaining six as numeric.\n",
"print(\"Import and Parse weather data\")\n",
"weather = h2o.import_file(\n",
"    path=weather_path,\n",
"    col_types=[\"time\"] + [\"numeric\"] * 6\n",
")\n",
"# NOTE(review): the return value of drop() is discarded, and the saved describe()\n",
"# output of this cell still lists a `date` column (Cols:7), so this call appears to\n",
"# leave `weather` unchanged. Later cells are not visible here -- confirm that nothing\n",
"# depends on the current column positions before rebinding the result to `weather`.\n",
"weather.drop(\"date\")\n",
"weather.describe()\n",
"\n",
"# Census: first column numeric, second column enum, remaining seven numeric.\n",
"print(\"Import and Parse census data\")\n",
"census = h2o.import_file(\n",
"    path=census_path,\n",
"    col_types=[\"numeric\", \"enum\"] + [\"numeric\"] * 7\n",
")\n",
"census.describe()\n",
"\n",
"# Crimes: no explicit col_types are passed for this file.\n",
"print(\"Import and Parse crimes data\")\n",
"crimes = h2o.import_file(path=crimes_path)\n",
"crimes.describe()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rows:9,999 Cols:27\n",
"\n",
"Chunk compression summary: \n"
]
},
{
"data": {
"text/html": [
"| chunk_type | \n",
"chunk_name | \n",
"count | \n",
"count_percentage | \n",
"size | \n",
"size_percentage |
\n",
"| C0L | \n",
"Constant Integers | \n",
"1 | \n",
"3.7037036 | \n",
" 80 B | \n",
"0.0110837 |
\n",
"| C1 | \n",
"1-Byte Integers | \n",
"8 | \n",
"29.62963 | \n",
" 78.6 KB | \n",
"11.157955 |
\n",
"| C1N | \n",
"1-Byte Integers (w/o NAs) | \n",
"7 | \n",
"25.925926 | \n",
" 68.8 KB | \n",
"9.763211 |
\n",
"| C2 | \n",
"2-Byte Integers | \n",
"4 | \n",
"14.814815 | \n",
" 78.4 KB | \n",
"11.12027 |
\n",
"| C4 | \n",
"4-Byte Integers | \n",
"3 | \n",
"11.111112 | \n",
" 117.4 KB | \n",
"16.652143 |
\n",
"| C8 | \n",
"64-bit Integers | \n",
"1 | \n",
"3.7037036 | \n",
" 78.2 KB | \n",
"11.092008 |
\n",
"| CStr | \n",
"String | \n",
"1 | \n",
"3.7037036 | \n",
" 127.0 KB | \n",
"18.019316 |
\n",
"| C8D | \n",
"64-bit Reals | \n",
"2 | \n",
"7.4074073 | \n",
" 156.4 KB | \n",
"22.184013 |
"
],
"text/plain": [
"chunk_type chunk_name count count_percentage size size_percentage\n",
"------------ ------------------------- ------- ------------------ -------- -----------------\n",
"C0L Constant Integers 1 3.7037 80 B 0.0110837\n",
"C1 1-Byte Integers 8 29.6296 78.6 KB 11.158\n",
"C1N 1-Byte Integers (w/o NAs) 7 25.9259 68.8 KB 9.76321\n",
"C2 2-Byte Integers 4 14.8148 78.4 KB 11.1203\n",
"C4 4-Byte Integers 3 11.1111 117.4 KB 16.6521\n",
"C8 64-bit Integers 1 3.7037 78.2 KB 11.092\n",
"CStr String 1 3.7037 127.0 KB 18.0193\n",
"C8D 64-bit Reals 2 7.40741 156.4 KB 22.184"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Frame distribution summary: \n"
]
},
{
"data": {
"text/html": [
" | \n",
"size | \n",
"number_of_rows | \n",
"number_of_chunks_per_column | \n",
"number_of_chunks |
\n",
"| 172.16.2.43:54321 | \n",
" 704.9 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"27.0 |
\n",
"| mean | \n",
" 704.9 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"27.0 |
\n",
"| min | \n",
" 704.9 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"27.0 |
\n",
"| max | \n",
" 704.9 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"27.0 |
\n",
"| stddev | \n",
" 0 B | \n",
"0.0 | \n",
"0.0 | \n",
"0.0 |
\n",
"| total | \n",
" 704.9 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"27.0 |
"
],
"text/plain": [
" size number_of_rows number_of_chunks_per_column number_of_chunks\n",
"----------------- -------- ---------------- ----------------------------- ------------------\n",
"172.16.2.43:54321 704.9 KB 9999 1 27\n",
"mean 704.9 KB 9999 1 27\n",
"min 704.9 KB 9999 1 27\n",
"max 704.9 KB 9999 1 27\n",
"stddev 0 B 0 0 0\n",
"total 704.9 KB 9999 1 27"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"| | ID | Case Number | Date | Block | IUCR | Primary Type | Description | Location Description | Arrest | Domestic | Beat | District | Ward | Community Area | FBI Code | X Coordinate | Y Coordinate | Year | Updated On | Latitude | Longitude | Location | Day | Month | WeekNum | WeekDay | HourOfDay |
\n",
"| type | int | string | int | enum | int | enum | enum | enum | enum | enum | int | int | int | int | int | int | int | int | enum | real | real | enum | int | int | int | enum | int |
\n",
"| mins | 21735.0 | NaN | 1422030630000.0 | 0.0 | 110.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 111.0 | 1.0 | 1.0 | 1.0 | 2.0 | 1100317.0 | 1814255.0 | 3915.0 | 0.0 | 41.64507243 | -87.906463888 | 0.0 | 1.0 | 2.0 | 4.0 | 0.0 | 0.0 |
\n",
"| mean | 9931318.737373699 | NaN | 1422714450809.2847 | NaN | 1189.676513569939 | NaN | NaN | NaN | 0.29282928292829274 | 0.15231523152315235 | 1159.6180618061765 | 11.348988512757918 | 22.954095409541008 | 37.447644764476536 | 12.740123622682114 | 1163880.5981498407 | 1885916.1498424308 | 3915.0 | NaN | 41.842565224673535 | -87.67414052209607 | NaN | 17.683968396839663 | 2.419441944194423 | 5.1808180818082 | NaN | 13.631963196319662 |
\n",
"| maxs | 9962898.0 | NaN | 1423467820000.0 | 6517.0 | 5131.0 | 26.0 | 198.0 | 90.0 | 1.0 | 1.0 | 2535.0 | 25.0 | 50.0 | 77.0 | 26.0 | 1205069.0 | 1951533.0 | 3915.0 | 32.0 | 42.022646183 | -87.524773286 | 8603.0 | 31.0 | 3.0 | 6.0 | 6.0 | 23.0 |
\n",
"| sigma | 396787.5642214295 | NaN | 433879245.1905283 | NaN | 927.7514355826443 | NaN | NaN | NaN | 0.4550835155878833 | 0.3593441468595258 | 695.7602987498396 | 6.945474933012859 | 13.649566114361296 | 21.274876222320856 | 7.574238579108433 | 16496.449368147238 | 31274.01631985589 | 0.0 | NaN | 0.08601865793584824 | 0.06003579706529789 | NaN | 11.180104335827702 | 0.4934924067865386 | 0.7389298304087689 | NaN | 6.4732173580715475 |
\n",
"| zeros | 0 | 0 | 0 | 3 | 0 | 11 | 933 | 19 | 7071 | 8476 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 603 | 0 | 0 | 1 | 0 | 0 | 0 | 1038 | 374 |
\n",
"| missing | 0 | 0 | 0 | 0 | 419 | 0 | 0 | 6 | 0 | 0 | 0 | 162 | 0 | 0 | 2557 | 162 | 162 | 0 | 0 | 162 | 162 | 162 | 0 | 0 | 0 | 0 | 0 |
\n",
"| 0 | 9955810.0 | HY144797 | 1423467820000.0 | 081XX S COLES AVE | 1811.0 | NARCOTICS | POSS: CANNABIS 30GMS OR LESS | STREET | true | false | 422.0 | 4.0 | 7.0 | 46.0 | 18.0 | 1198273.0 | 1851626.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.747693646 | -87.54903538900001 | (41.747693646, -87.549035389) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
\n",
"| 1 | 9955861.0 | HY144838 | 1423467702000.0 | 118XX S STATE ST | 486.0 | BATTERY | DOMESTIC BATTERY SIMPLE | APARTMENT | true | true | 522.0 | 5.0 | 34.0 | 53.0 | nan | 1178335.0 | 1826581.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.679442289 | -87.622850758 | (41.679442289, -87.622850758) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
\n",
"| 2 | 9955801.0 | HY144779 | 1423467022000.0 | 002XX S LARAMIE AVE | 2026.0 | NARCOTICS | POSS: PCP | SIDEWALK | true | false | 1522.0 | 15.0 | 29.0 | 25.0 | 18.0 | 1141717.0 | 1898581.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.877773330000004 | -87.755117993 | (41.87777333, -87.755117993) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
\n",
"| 3 | 9956197.0 | HY144787 | 1423467023000.0 | 006XX E 67TH ST | 1811.0 | NARCOTICS | POSS: CANNABIS 30GMS OR LESS | STREET | true | false | 321.0 | nan | 6.0 | 42.0 | 18.0 | nan | nan | 3915.0 | 02/15/2015 12:43:39 PM | nan | nan | | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
\n",
"| 4 | 9955846.0 | HY144829 | 1423467058000.0 | 0000X S MAYFIELD AVE | 610.0 | BURGLARY | FORCIBLE ENTRY | APARTMENT | false | false | 1513.0 | 15.0 | 29.0 | 25.0 | 5.0 | 1137239.0 | 1899372.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.880025548000006 | -87.77154132400001 | (41.880025548, -87.771541324) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
\n",
"| 5 | 9955835.0 | HY144778 | 1423467021000.0 | 010XX W 48TH ST | 486.0 | BATTERY | DOMESTIC BATTERY SIMPLE | APARTMENT | false | true | 933.0 | 9.0 | 3.0 | 61.0 | nan | 1169986.0 | 1873019.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.807059405000004 | -87.65206589 | (41.807059405, -87.65206589) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
\n",
"| 6 | 9955872.0 | HY144822 | 1423466844000.0 | 015XX W ARTHUR AVE | 1320.0 | CRIMINAL DAMAGE | TO VEHICLE | STREET | false | false | 2432.0 | 24.0 | 40.0 | 1.0 | 14.0 | 1164732.0 | 1943222.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.999814056000005 | -87.669342967 | (41.999814056, -87.669342967) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
\n",
"| 7 | 21752.0 | HY144738 | 1423466772000.0 | 060XX W GRAND AVE | 110.0 | HOMICIDE | FIRST DEGREE MURDER | STREET | true | false | 2512.0 | 25.0 | 37.0 | 19.0 | nan | 1135910.0 | 1914206.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.920755683 | -87.776067514 | (41.920755683, -87.776067514) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
\n",
"| 8 | 9955808.0 | HY144775 | 1423466433000.0 | 001XX W WACKER DR | 460.0 | BATTERY | SIMPLE | OTHER | false | false | 122.0 | 1.0 | 42.0 | 32.0 | nan | 1175384.0 | 1902088.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.886707818000005 | -87.63139635600001 | (41.886707818, -87.631396356) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
\n",
"| 9 | 9958275.0 | HY146732 | 1423466136000.0 | 001XX W WACKER DR | 460.0 | BATTERY | SIMPLE | HOTEL/MOTEL | false | false | 122.0 | 1.0 | 42.0 | 32.0 | nan | 1175384.0 | 1902088.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.886707818000005 | -87.63139635600001 | (41.886707818, -87.631396356) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rows:9,999 Cols:28\n",
"\n",
"Chunk compression summary: \n"
]
},
{
"data": {
"text/html": [
"| chunk_type | \n",
"chunk_name | \n",
"count | \n",
"count_percentage | \n",
"size | \n",
"size_percentage |
\n",
"| C0L | \n",
"Constant Integers | \n",
"1 | \n",
"3.5714288 | \n",
" 80 B | \n",
"0.0124154 |
\n",
"| CBS | \n",
"Bits | \n",
"2 | \n",
"7.1428576 | \n",
" 2.6 KB | \n",
"0.4097082 |
\n",
"| C1 | \n",
"1-Byte Integers | \n",
"8 | \n",
"28.57143 | \n",
" 78.6 KB | \n",
"12.498584 |
\n",
"| C1N | \n",
"1-Byte Integers (w/o NAs) | \n",
"7 | \n",
"25.0 | \n",
" 68.8 KB | \n",
"10.936261 |
\n",
"| C2 | \n",
"2-Byte Integers | \n",
"4 | \n",
"14.285715 | \n",
" 78.4 KB | \n",
"12.456371 |
\n",
"| C4 | \n",
"4-Byte Integers | \n",
"3 | \n",
"10.714286 | \n",
" 117.4 KB | \n",
"18.652899 |
\n",
"| CStr | \n",
"String | \n",
"1 | \n",
"3.5714288 | \n",
" 127.0 KB | \n",
"20.184338 |
\n",
"| C8D | \n",
"64-bit Reals | \n",
"2 | \n",
"7.1428576 | \n",
" 156.4 KB | \n",
"24.849424 |
"
],
"text/plain": [
"chunk_type chunk_name count count_percentage size size_percentage\n",
"------------ ------------------------- ------- ------------------ -------- -----------------\n",
"C0L Constant Integers 1 3.57143 80 B 0.0124154\n",
"CBS Bits 2 7.14286 2.6 KB 0.409708\n",
"C1 1-Byte Integers 8 28.5714 78.6 KB 12.4986\n",
"C1N 1-Byte Integers (w/o NAs) 7 25 68.8 KB 10.9363\n",
"C2 2-Byte Integers 4 14.2857 78.4 KB 12.4564\n",
"C4 4-Byte Integers 3 10.7143 117.4 KB 18.6529\n",
"CStr String 1 3.57143 127.0 KB 20.1843\n",
"C8D 64-bit Reals 2 7.14286 156.4 KB 24.8494"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Frame distribution summary: \n"
]
},
{
"data": {
"text/html": [
" | \n",
"size | \n",
"number_of_rows | \n",
"number_of_chunks_per_column | \n",
"number_of_chunks |
\n",
"| 172.16.2.43:54321 | \n",
" 629.3 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"28.0 |
\n",
"| mean | \n",
" 629.3 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"28.0 |
\n",
"| min | \n",
" 629.3 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"28.0 |
\n",
"| max | \n",
" 629.3 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"28.0 |
\n",
"| stddev | \n",
" 0 B | \n",
"0.0 | \n",
"0.0 | \n",
"0.0 |
\n",
"| total | \n",
" 629.3 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"28.0 |
"
],
"text/plain": [
" size number_of_rows number_of_chunks_per_column number_of_chunks\n",
"----------------- -------- ---------------- ----------------------------- ------------------\n",
"172.16.2.43:54321 629.3 KB 9999 1 28\n",
"mean 629.3 KB 9999 1 28\n",
"min 629.3 KB 9999 1 28\n",
"max 629.3 KB 9999 1 28\n",
"stddev 0 B 0 0 0\n",
"total 629.3 KB 9999 1 28"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"| | ID | Case Number | Block | IUCR | Primary Type | Description | Location Description | Arrest | Domestic | Beat | District | Ward | Community Area | FBI Code | X Coordinate | Y Coordinate | Year | Updated On | Latitude | Longitude | Location | Day | Month | WeekNum | WeekDay | HourOfDay | Weekend | Season |
\n",
"| type | int | string | enum | int | enum | enum | enum | enum | enum | int | int | int | int | int | int | int | int | enum | real | real | enum | int | int | int | enum | int | int | enum |
\n",
"| mins | 21735.0 | NaN | 0.0 | 110.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 111.0 | 1.0 | 1.0 | 1.0 | 2.0 | 1100317.0 | 1814255.0 | 3915.0 | 0.0 | 41.64507243 | -87.906463888 | 0.0 | 1.0 | 2.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 |
\n",
"| mean | 9931318.737373699 | NaN | NaN | 1189.676513569939 | NaN | NaN | NaN | 0.29282928292829274 | 0.15231523152315235 | 1159.6180618061765 | 11.348988512757918 | 22.954095409541008 | 37.447644764476536 | 12.740123622682114 | 1163880.5981498407 | 1885916.1498424308 | 3915.0 | NaN | 41.842565224673535 | -87.67414052209607 | NaN | 17.683968396839663 | 2.419441944194423 | 5.1808180818082 | NaN | 13.631963196319662 | 0.35753575357535755 | NaN |
\n",
"| maxs | 9962898.0 | NaN | 6517.0 | 5131.0 | 26.0 | 198.0 | 90.0 | 1.0 | 1.0 | 2535.0 | 25.0 | 50.0 | 77.0 | 26.0 | 1205069.0 | 1951533.0 | 3915.0 | 32.0 | 42.022646183 | -87.524773286 | 8603.0 | 31.0 | 3.0 | 6.0 | 6.0 | 23.0 | 1.0 | 1.0 |
\n",
"| sigma | 396787.5642214295 | NaN | NaN | 927.7514355826443 | NaN | NaN | NaN | 0.4550835155878833 | 0.3593441468595258 | 695.7602987498396 | 6.945474933012859 | 13.649566114361296 | 21.274876222320856 | 7.574238579108433 | 16496.449368147238 | 31274.01631985589 | 0.0 | NaN | 0.08601865793584824 | 0.06003579706529789 | NaN | 11.180104335827702 | 0.4934924067865386 | 0.7389298304087689 | NaN | 6.4732173580715475 | 0.47929835538994453 | NaN |
\n",
"| zeros | 0 | 0 | 3 | 0 | 11 | 933 | 19 | 7071 | 8476 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 603 | 0 | 0 | 1 | 0 | 0 | 0 | 1038 | 374 | 6424 | 5805 |
\n",
"| missing | 0 | 0 | 0 | 419 | 0 | 0 | 6 | 0 | 0 | 0 | 162 | 0 | 0 | 2557 | 162 | 162 | 0 | 0 | 162 | 162 | 162 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
\n",
"| 0 | 9955810.0 | HY144797 | 081XX S COLES AVE | 1811.0 | NARCOTICS | POSS: CANNABIS 30GMS OR LESS | STREET | true | false | 422.0 | 4.0 | 7.0 | 46.0 | 18.0 | 1198273.0 | 1851626.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.747693646 | -87.54903538900001 | (41.747693646, -87.549035389) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 1 | 9955861.0 | HY144838 | 118XX S STATE ST | 486.0 | BATTERY | DOMESTIC BATTERY SIMPLE | APARTMENT | true | true | 522.0 | 5.0 | 34.0 | 53.0 | nan | 1178335.0 | 1826581.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.679442289 | -87.622850758 | (41.679442289, -87.622850758) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 2 | 9955801.0 | HY144779 | 002XX S LARAMIE AVE | 2026.0 | NARCOTICS | POSS: PCP | SIDEWALK | true | false | 1522.0 | 15.0 | 29.0 | 25.0 | 18.0 | 1141717.0 | 1898581.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.877773330000004 | -87.755117993 | (41.87777333, -87.755117993) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 3 | 9956197.0 | HY144787 | 006XX E 67TH ST | 1811.0 | NARCOTICS | POSS: CANNABIS 30GMS OR LESS | STREET | true | false | 321.0 | nan | 6.0 | 42.0 | 18.0 | nan | nan | 3915.0 | 02/15/2015 12:43:39 PM | nan | nan | | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 4 | 9955846.0 | HY144829 | 0000X S MAYFIELD AVE | 610.0 | BURGLARY | FORCIBLE ENTRY | APARTMENT | false | false | 1513.0 | 15.0 | 29.0 | 25.0 | 5.0 | 1137239.0 | 1899372.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.880025548000006 | -87.77154132400001 | (41.880025548, -87.771541324) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 5 | 9955835.0 | HY144778 | 010XX W 48TH ST | 486.0 | BATTERY | DOMESTIC BATTERY SIMPLE | APARTMENT | false | true | 933.0 | 9.0 | 3.0 | 61.0 | nan | 1169986.0 | 1873019.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.807059405000004 | -87.65206589 | (41.807059405, -87.65206589) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 6 | 9955872.0 | HY144822 | 015XX W ARTHUR AVE | 1320.0 | CRIMINAL DAMAGE | TO VEHICLE | STREET | false | false | 2432.0 | 24.0 | 40.0 | 1.0 | 14.0 | 1164732.0 | 1943222.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.999814056000005 | -87.669342967 | (41.999814056, -87.669342967) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 7 | 21752.0 | HY144738 | 060XX W GRAND AVE | 110.0 | HOMICIDE | FIRST DEGREE MURDER | STREET | true | false | 2512.0 | 25.0 | 37.0 | 19.0 | nan | 1135910.0 | 1914206.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.920755683 | -87.776067514 | (41.920755683, -87.776067514) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 8 | 9955808.0 | HY144775 | 001XX W WACKER DR | 460.0 | BATTERY | SIMPLE | OTHER | false | false | 122.0 | 1.0 | 42.0 | 32.0 | nan | 1175384.0 | 1902088.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.886707818000005 | -87.63139635600001 | (41.886707818, -87.631396356) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 9 | 9958275.0 | HY146732 | 001XX W WACKER DR | 460.0 | BATTERY | SIMPLE | HOTEL/MOTEL | false | false | 122.0 | 1.0 | 42.0 | 32.0 | nan | 1175384.0 | 1902088.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.886707818000005 | -87.63139635600001 | (41.886707818, -87.631396356) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def refine_date_col(data, col, pattern):\n",
"    \"\"\"Add calendar feature columns derived from a date column, in place.\n",
"\n",
"    Adds Day, Month, Year, WeekNum, WeekDay, HourOfDay, Weekend and Season\n",
"    columns to `data`, all computed from `data[col]`.\n",
"\n",
"    Args:\n",
"        data: H2OFrame to modify; `data[col]` is expected to be parsed as a date already.\n",
"        col: name of the date column to read.\n",
"        pattern: unused; kept so existing calls keep working. It was the format\n",
"            handed to as_date() before H2O parsed dates on import.\n",
"    \"\"\"\n",
"    # data[col] = data[col].as_date(pattern)  # As of 5/29/2016 H2O parses dates by default\n",
"    data[\"Day\"] = data[col].day()\n",
"    # month() is already 1-based and year() is already the full calendar year.\n",
"    # Adding 1 / 1900 turned 02/08/2015 into Month 3, Year 3915 (and Season Spring).\n",
"    data[\"Month\"] = data[col].month()\n",
"    data[\"Year\"] = data[col].year()\n",
"    data[\"WeekNum\"] = data[col].week()\n",
"    data[\"WeekDay\"] = data[col].dayOfWeek()\n",
"    data[\"HourOfDay\"] = data[col].hour()\n",
"\n",
"    data.describe()  # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.\n",
"\n",
"    # Weekend is 1 when WeekDay is Sat or Sun, else 0 (computed column-wise on the cluster).\n",
"    data[\"Weekend\"] = ((data[\"WeekDay\"] == \"Sun\") | (data[\"WeekDay\"] == \"Sat\"))\n",
"    # Season from Month using (lo, hi] bins (cut's default, right-closed):\n",
"    # Winter = Jan-Feb and Nov-Dec, Spring = Mar-May, Summer = Jun-Jul, Autumn = Aug-Oct.\n",
"    data[\"Season\"] = data[\"Month\"].cut([0, 2, 5, 7, 10, 12], [\"Winter\", \"Spring\", \"Summer\", \"Autumn\", \"Winter\"])\n",
" \n",
"# Derive the calendar features for the crimes frame, then discard the raw timestamp column\n",
"refine_date_col(data=crimes, col=\"Date\", pattern=\"%m/%d/%Y %I:%M:%S %p\")\n",
"crimes = crimes.drop(\"Date\")\n",
"crimes.describe()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rows:9,999 Cols:28\n",
"\n",
"Chunk compression summary: \n"
]
},
{
"data": {
"text/html": [
"| chunk_type | \n",
"chunk_name | \n",
"count | \n",
"count_percentage | \n",
"size | \n",
"size_percentage |
\n",
"| C0L | \n",
"Constant Integers | \n",
"1 | \n",
"3.5714288 | \n",
" 80 B | \n",
"0.0124154 |
\n",
"| CBS | \n",
"Bits | \n",
"2 | \n",
"7.1428576 | \n",
" 2.6 KB | \n",
"0.4097082 |
\n",
"| C1 | \n",
"1-Byte Integers | \n",
"8 | \n",
"28.57143 | \n",
" 78.6 KB | \n",
"12.498584 |
\n",
"| C1N | \n",
"1-Byte Integers (w/o NAs) | \n",
"7 | \n",
"25.0 | \n",
" 68.8 KB | \n",
"10.936261 |
\n",
"| C2 | \n",
"2-Byte Integers | \n",
"4 | \n",
"14.285715 | \n",
" 78.4 KB | \n",
"12.456371 |
\n",
"| C4 | \n",
"4-Byte Integers | \n",
"3 | \n",
"10.714286 | \n",
" 117.4 KB | \n",
"18.652899 |
\n",
"| CStr | \n",
"String | \n",
"1 | \n",
"3.5714288 | \n",
" 127.0 KB | \n",
"20.184338 |
\n",
"| C8D | \n",
"64-bit Reals | \n",
"2 | \n",
"7.1428576 | \n",
" 156.4 KB | \n",
"24.849424 |
"
],
"text/plain": [
"chunk_type chunk_name count count_percentage size size_percentage\n",
"------------ ------------------------- ------- ------------------ -------- -----------------\n",
"C0L Constant Integers 1 3.57143 80 B 0.0124154\n",
"CBS Bits 2 7.14286 2.6 KB 0.409708\n",
"C1 1-Byte Integers 8 28.5714 78.6 KB 12.4986\n",
"C1N 1-Byte Integers (w/o NAs) 7 25 68.8 KB 10.9363\n",
"C2 2-Byte Integers 4 14.2857 78.4 KB 12.4564\n",
"C4 4-Byte Integers 3 10.7143 117.4 KB 18.6529\n",
"CStr String 1 3.57143 127.0 KB 20.1843\n",
"C8D 64-bit Reals 2 7.14286 156.4 KB 24.8494"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Frame distribution summary: \n"
]
},
{
"data": {
"text/html": [
" | \n",
"size | \n",
"number_of_rows | \n",
"number_of_chunks_per_column | \n",
"number_of_chunks |
\n",
"| 172.16.2.43:54321 | \n",
" 629.3 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"28.0 |
\n",
"| mean | \n",
" 629.3 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"28.0 |
\n",
"| min | \n",
" 629.3 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"28.0 |
\n",
"| max | \n",
" 629.3 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"28.0 |
\n",
"| stddev | \n",
" 0 B | \n",
"0.0 | \n",
"0.0 | \n",
"0.0 |
\n",
"| total | \n",
" 629.3 KB | \n",
"9999.0 | \n",
"1.0 | \n",
"28.0 |
"
],
"text/plain": [
" size number_of_rows number_of_chunks_per_column number_of_chunks\n",
"----------------- -------- ---------------- ----------------------------- ------------------\n",
"172.16.2.43:54321 629.3 KB 9999 1 28\n",
"mean 629.3 KB 9999 1 28\n",
"min 629.3 KB 9999 1 28\n",
"max 629.3 KB 9999 1 28\n",
"stddev 0 B 0 0 0\n",
"total 629.3 KB 9999 1 28"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"| | ID | Case Number | Block | IUCR | Primary Type | Description | Location Description | Arrest | Domestic | Beat | District | Ward | Community Area | FBI Code | X Coordinate | Y Coordinate | Year | Updated On | Latitude | Longitude | Location | Day | Month | WeekNum | WeekDay | HourOfDay | Weekend | Season |
\n",
"| type | int | string | enum | int | enum | enum | enum | enum | enum | int | int | int | int | int | int | int | int | enum | real | real | enum | int | int | int | enum | int | int | enum |
\n",
"| mins | 21735.0 | NaN | 0.0 | 110.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 111.0 | 1.0 | 1.0 | 1.0 | 2.0 | 1100317.0 | 1814255.0 | 3915.0 | 0.0 | 41.64507243 | -87.906463888 | 0.0 | 1.0 | 2.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 |
\n",
"| mean | 9931318.737373699 | NaN | NaN | 1189.676513569939 | NaN | NaN | NaN | 0.29282928292829274 | 0.15231523152315235 | 1159.6180618061765 | 11.348988512757918 | 22.954095409541008 | 37.447644764476536 | 12.740123622682114 | 1163880.5981498407 | 1885916.1498424308 | 3915.0 | NaN | 41.842565224673535 | -87.67414052209607 | NaN | 17.683968396839663 | 2.419441944194423 | 5.1808180818082 | NaN | 13.631963196319662 | 0.35753575357535755 | NaN |
\n",
"| maxs | 9962898.0 | NaN | 6517.0 | 5131.0 | 26.0 | 198.0 | 90.0 | 1.0 | 1.0 | 2535.0 | 25.0 | 50.0 | 77.0 | 26.0 | 1205069.0 | 1951533.0 | 3915.0 | 32.0 | 42.022646183 | -87.524773286 | 8603.0 | 31.0 | 3.0 | 6.0 | 6.0 | 23.0 | 1.0 | 1.0 |
\n",
"| sigma | 396787.5642214295 | NaN | NaN | 927.7514355826443 | NaN | NaN | NaN | 0.4550835155878833 | 0.3593441468595258 | 695.7602987498396 | 6.945474933012859 | 13.649566114361296 | 21.274876222320856 | 7.574238579108433 | 16496.449368147238 | 31274.01631985589 | 0.0 | NaN | 0.08601865793584824 | 0.06003579706529789 | NaN | 11.180104335827702 | 0.4934924067865386 | 0.7389298304087689 | NaN | 6.4732173580715475 | 0.47929835538994453 | NaN |
\n",
"| zeros | 0 | 0 | 3 | 0 | 11 | 933 | 19 | 7071 | 8476 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 603 | 0 | 0 | 1 | 0 | 0 | 0 | 1038 | 374 | 6424 | 5805 |
\n",
"| missing | 0 | 0 | 0 | 419 | 0 | 0 | 6 | 0 | 0 | 0 | 162 | 0 | 0 | 2557 | 162 | 162 | 0 | 0 | 162 | 162 | 162 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
\n",
"| 0 | 9955810.0 | HY144797 | 081XX S COLES AVE | 1811.0 | NARCOTICS | POSS: CANNABIS 30GMS OR LESS | STREET | true | false | 422.0 | 4.0 | 7.0 | 46.0 | 18.0 | 1198273.0 | 1851626.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.747693646 | -87.54903538900001 | (41.747693646, -87.549035389) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 1 | 9955861.0 | HY144838 | 118XX S STATE ST | 486.0 | BATTERY | DOMESTIC BATTERY SIMPLE | APARTMENT | true | true | 522.0 | 5.0 | 34.0 | 53.0 | nan | 1178335.0 | 1826581.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.679442289 | -87.622850758 | (41.679442289, -87.622850758) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 2 | 9955801.0 | HY144779 | 002XX S LARAMIE AVE | 2026.0 | NARCOTICS | POSS: PCP | SIDEWALK | true | false | 1522.0 | 15.0 | 29.0 | 25.0 | 18.0 | 1141717.0 | 1898581.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.877773330000004 | -87.755117993 | (41.87777333, -87.755117993) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 3 | 9956197.0 | HY144787 | 006XX E 67TH ST | 1811.0 | NARCOTICS | POSS: CANNABIS 30GMS OR LESS | STREET | true | false | 321.0 | nan | 6.0 | 42.0 | 18.0 | nan | nan | 3915.0 | 02/15/2015 12:43:39 PM | nan | nan | | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 4 | 9955846.0 | HY144829 | 0000X S MAYFIELD AVE | 610.0 | BURGLARY | FORCIBLE ENTRY | APARTMENT | false | false | 1513.0 | 15.0 | 29.0 | 25.0 | 5.0 | 1137239.0 | 1899372.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.880025548000006 | -87.77154132400001 | (41.880025548, -87.771541324) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 5 | 9955835.0 | HY144778 | 010XX W 48TH ST | 486.0 | BATTERY | DOMESTIC BATTERY SIMPLE | APARTMENT | false | true | 933.0 | 9.0 | 3.0 | 61.0 | nan | 1169986.0 | 1873019.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.807059405000004 | -87.65206589 | (41.807059405, -87.65206589) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 6 | 9955872.0 | HY144822 | 015XX W ARTHUR AVE | 1320.0 | CRIMINAL DAMAGE | TO VEHICLE | STREET | false | false | 2432.0 | 24.0 | 40.0 | 1.0 | 14.0 | 1164732.0 | 1943222.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.999814056000005 | -87.669342967 | (41.999814056, -87.669342967) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 7 | 21752.0 | HY144738 | 060XX W GRAND AVE | 110.0 | HOMICIDE | FIRST DEGREE MURDER | STREET | true | false | 2512.0 | 25.0 | 37.0 | 19.0 | nan | 1135910.0 | 1914206.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.920755683 | -87.776067514 | (41.920755683, -87.776067514) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 8 | 9955808.0 | HY144775 | 001XX W WACKER DR | 460.0 | BATTERY | SIMPLE | OTHER | false | false | 122.0 | 1.0 | 42.0 | 32.0 | nan | 1175384.0 | 1902088.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.886707818000005 | -87.63139635600001 | (41.886707818, -87.631396356) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 9 | 9958275.0 | HY146732 | 001XX W WACKER DR | 460.0 | BATTERY | SIMPLE | HOTEL/MOTEL | false | false | 122.0 | 1.0 | 42.0 | 32.0 | nan | 1175384.0 | 1902088.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.886707818000005 | -87.63139635600001 | (41.886707818, -87.631396356) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Merge crimes data with weather and census\n",
"# Rename the join columns so they match the crimes frame's column names\n",
"census.set_name(0,\"Community Area\")\n",
"weather.set_name(1,\"Month\")\n",
"weather.set_name(2,\"Day\")\n",
"weather.set_name(3,\"Year\")\n",
"# merge() returns a new frame; without the assignment the joins were silently discarded\n",
"# (the frame stayed at 28 columns). all_x=True keeps every crime row (left join).\n",
"crimes = crimes.merge(census, all_x=True, all_y=False)\n",
"crimes = crimes.merge(weather, all_x=True, all_y=False)\n",
"crimes.describe()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"gbm Model Build Progress: [##################################################] 100%\n",
"\n",
"deeplearning Model Build Progress: [##################################################] 100%\n"
]
}
],
"source": [
"# 80/20 train/test split driven by one uniform random column (seed 1234)\n",
"split_draw = crimes[\"Arrest\"].runif(1234)\n",
"train = crimes[split_draw < 0.8]\n",
"test = crimes[split_draw >= 0.8]\n",
"\n",
"# Predictors: every column except the response\n",
"crimes_names_x = [name for name in crimes.names if name != \"Arrest\"]\n",
"\n",
"# Simple GBM - Predict Arrest\n",
"gbm_params = dict(ntrees=10, max_depth=6, distribution=\"bernoulli\")\n",
"data_gbm = H2OGradientBoostingEstimator(**gbm_params)\n",
"data_gbm.train(\n",
"    x=crimes_names_x,\n",
"    y=\"Arrest\",\n",
"    training_frame=train,\n",
"    validation_frame=test,\n",
")\n",
"\n",
"# Simple Deep Learning - Predict Arrest (currently disabled)\n",
"# data_dl = H2ODeepLearningEstimator(variable_importances=True,\n",
"#                                    loss=\"Automatic\")\n",
"#\n",
"# data_dl.train(x=crimes_names_x,\n",
"#               y=\"Arrest\",\n",
"#               training_frame=train,\n",
"#               validation_frame=test)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"| Model | \n",
"AUC Train | \n",
"AUC Test |
\n",
"| GBM | \n",
"0.9568221 | \n",
"0.9307979 |
\n",
"| DL | \n",
"0.8956055 | \n",
"0.8841564 |
"
],
"text/plain": [
"Model AUC Train AUC Test\n",
"------- ----------- ----------\n",
"GBM 0.956822 0.930798\n",
"DL 0.895605 0.884156"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"| Model | \n",
"AUC Train | \n",
"AUC Test |
\n",
"| GBM | \n",
"0.9568221 | \n",
"0.9307979 |
\n",
"| DL | \n",
"0.8956055 | \n",
"0.8841564 |
"
],
"text/plain": []
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# GBM AUC on the training frame and on the held-out test frame\n",
"train_auc_gbm, test_auc_gbm = (\n",
"    data_gbm.model_performance(frame).auc() for frame in (train, test)\n",
")\n",
"\n",
"# Deep Learning performance on train/test data (disabled together with the DL model)\n",
"# train_auc_dl = data_dl.model_performance(train).auc()\n",
"# test_auc_dl = data_dl.model_performance(test).auc()\n",
"\n",
"# Render the scores as an HTML table; one row per model\n",
"header = [\"Model\", \"AUC Train\", \"AUC Test\"]\n",
"gbm_row = [\"GBM\", train_auc_gbm, test_auc_gbm]\n",
"# dl_row = [\"DL \", train_auc_dl, test_auc_dl]\n",
"table = [gbm_row]\n",
"h2o.display.H2OTableDisplay(table, columns_labels=header)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Parse Progress: [##################################################] 100%\n",
"Rows:2 Cols:16\n",
"\n",
"Chunk compression summary: \n"
]
},
{
"data": {
"text/html": [
"| chunk_type | \n",
"chunk_name | \n",
"count | \n",
"count_percentage | \n",
"size | \n",
"size_percentage |
\n",
"| C0L | \n",
"Constant Integers | \n",
"7 | \n",
"43.75 | \n",
" 560 B | \n",
"43.75 |
\n",
"| C1N | \n",
"1-Byte Integers (w/o NAs) | \n",
"4 | \n",
"25.0 | \n",
" 280 B | \n",
"21.875 |
\n",
"| C2 | \n",
"2-Byte Integers | \n",
"2 | \n",
"12.5 | \n",
" 144 B | \n",
"11.25 |
\n",
"| C2S | \n",
"2-Byte Fractions | \n",
"1 | \n",
"6.25 | \n",
" 88 B | \n",
"6.875 |
\n",
"| CStr | \n",
"String | \n",
"2 | \n",
"12.5 | \n",
" 208 B | \n",
"16.25 |
"
],
"text/plain": [
"chunk_type chunk_name count count_percentage size size_percentage\n",
"------------ ------------------------- ------- ------------------ ------ -----------------\n",
"C0L Constant Integers 7 43.75 560 B 43.75\n",
"C1N 1-Byte Integers (w/o NAs) 4 25 280 B 21.875\n",
"C2 2-Byte Integers 2 12.5 144 B 11.25\n",
"C2S 2-Byte Fractions 1 6.25 88 B 6.875\n",
"CStr String 2 12.5 208 B 16.25"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Frame distribution summary: \n"
]
},
{
"data": {
"text/html": [
" | \n",
"size | \n",
"number_of_rows | \n",
"number_of_chunks_per_column | \n",
"number_of_chunks |
\n",
"| 172.16.2.43:54321 | \n",
" 1.3 KB | \n",
"2.0 | \n",
"1.0 | \n",
"16.0 |
\n",
"| mean | \n",
" 1.3 KB | \n",
"2.0 | \n",
"1.0 | \n",
"16.0 |
\n",
"| min | \n",
" 1.3 KB | \n",
"2.0 | \n",
"1.0 | \n",
"16.0 |
\n",
"| max | \n",
" 1.3 KB | \n",
"2.0 | \n",
"1.0 | \n",
"16.0 |
\n",
"| stddev | \n",
" 0 B | \n",
"0.0 | \n",
"0.0 | \n",
"0.0 |
\n",
"| total | \n",
" 1.3 KB | \n",
"2.0 | \n",
"1.0 | \n",
"16.0 |
"
],
"text/plain": [
" size number_of_rows number_of_chunks_per_column number_of_chunks\n",
"----------------- ------ ---------------- ----------------------------- ------------------\n",
"172.16.2.43:54321 1.3 KB 2 1 16\n",
"mean 1.3 KB 2 1 16\n",
"min 1.3 KB 2 1 16\n",
"max 1.3 KB 2 1 16\n",
"stddev 0 B 0 0 0\n",
"total 1.3 KB 2 1 16"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"| | Primary.Type | Domestic | FBI.Code | Ward | District | Community.Area | Location.Description | Date | IUCR | Beat | Day | Month | Year | WeekNum | WeekDay | HourOfDay |
\n",
"| type | string | enum | int | int | int | int | string | int | int | int | int | int | int | int | enum | int |
\n",
"| mins | NaN | 0.0 | 11.0 | 7.0 | 4.0 | 46.0 | NaN | 1423465239000.0 | 1150.0 | 422.0 | 8.0 | 3.0 | 3915.0 | 6.0 | 6.0 | 23.0 |
\n",
"| mean | NaN | 0.0 | 14.5 | 10.5 | 6.5 | 54.5 | NaN | 1423466538500.0 | 1480.5 | 672.5 | 8.0 | 3.0 | 3915.0 | 6.0 | NaN | 23.0 |
\n",
"| maxs | NaN | 0.0 | 18.0 | 14.0 | 9.0 | 63.0 | NaN | 1423467838000.0 | 1811.0 | 923.0 | 8.0 | 3.0 | 3915.0 | 6.0 | 6.0 | 23.0 |
\n",
"| sigma | NaN | 0.0 | 4.949747468305833 | 4.949747468305833 | 3.5355339059327378 | 12.020815280171307 | NaN | 1837770.524303837 | 467.39758236430794 | 354.26049737446033 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 |
\n",
"| zeros | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
\n",
"| missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
\n",
"| 0 | NARCOTICS | false | 18.0 | 7.0 | 4.0 | 46.0 | STREET | 1423467838000.0 | 1811.0 | 422.0 | 8.0 | 3.0 | 3915.0 | 6.0 | Sun | 23.0 |
\n",
"| 1 | DECEPTIVE PRACTICE | false | 11.0 | 14.0 | 9.0 | 63.0 | RESIDENCE | 1423465239000.0 | 1150.0 | 923.0 | 8.0 | 3.0 | 3915.0 | 6.0 | Sun | 23.0 |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Rows:2 Cols:18\n",
"\n",
"Chunk compression summary: \n"
]
},
{
"data": {
"text/html": [
"| chunk_type | \n",
"chunk_name | \n",
"count | \n",
"count_percentage | \n",
"size | \n",
"size_percentage |
\n",
"| C0L | \n",
"Constant Integers | \n",
"9 | \n",
"50.0 | \n",
" 720 B | \n",
"50.0 |
\n",
"| C1N | \n",
"1-Byte Integers (w/o NAs) | \n",
"4 | \n",
"22.222223 | \n",
" 280 B | \n",
"19.444445 |
\n",
"| C2 | \n",
"2-Byte Integers | \n",
"2 | \n",
"11.111112 | \n",
" 144 B | \n",
"10.0 |
\n",
"| C2S | \n",
"2-Byte Fractions | \n",
"1 | \n",
"5.555556 | \n",
" 88 B | \n",
"6.111111 |
\n",
"| CStr | \n",
"String | \n",
"2 | \n",
"11.111112 | \n",
" 208 B | \n",
"14.444445 |
"
],
"text/plain": [
"chunk_type chunk_name count count_percentage size size_percentage\n",
"------------ ------------------------- ------- ------------------ ------ -----------------\n",
"C0L Constant Integers 9 50 720 B 50\n",
"C1N 1-Byte Integers (w/o NAs) 4 22.2222 280 B 19.4444\n",
"C2 2-Byte Integers 2 11.1111 144 B 10\n",
"C2S 2-Byte Fractions 1 5.55556 88 B 6.11111\n",
"CStr String 2 11.1111 208 B 14.4444"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Frame distribution summary: \n"
]
},
{
"data": {
"text/html": [
" | \n",
"size | \n",
"number_of_rows | \n",
"number_of_chunks_per_column | \n",
"number_of_chunks |
\n",
"| 172.16.2.43:54321 | \n",
" 1.4 KB | \n",
"2.0 | \n",
"1.0 | \n",
"18.0 |
\n",
"| mean | \n",
" 1.4 KB | \n",
"2.0 | \n",
"1.0 | \n",
"18.0 |
\n",
"| min | \n",
" 1.4 KB | \n",
"2.0 | \n",
"1.0 | \n",
"18.0 |
\n",
"| max | \n",
" 1.4 KB | \n",
"2.0 | \n",
"1.0 | \n",
"18.0 |
\n",
"| stddev | \n",
" 0 B | \n",
"0.0 | \n",
"0.0 | \n",
"0.0 |
\n",
"| total | \n",
" 1.4 KB | \n",
"2.0 | \n",
"1.0 | \n",
"18.0 |
"
],
"text/plain": [
" size number_of_rows number_of_chunks_per_column number_of_chunks\n",
"----------------- ------ ---------------- ----------------------------- ------------------\n",
"172.16.2.43:54321 1.4 KB 2 1 18\n",
"mean 1.4 KB 2 1 18\n",
"min 1.4 KB 2 1 18\n",
"max 1.4 KB 2 1 18\n",
"stddev 0 B 0 0 0\n",
"total 1.4 KB 2 1 18"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"| | Primary.Type | Domestic | FBI.Code | Ward | District | Community.Area | Location.Description | Date | IUCR | Beat | Day | Month | Year | WeekNum | WeekDay | HourOfDay | Weekend | Season |
\n",
"| type | string | enum | int | int | int | int | string | int | int | int | int | int | int | int | enum | int | int | enum |
\n",
"| mins | NaN | 0.0 | 11.0 | 7.0 | 4.0 | 46.0 | NaN | 1423465239000.0 | 1150.0 | 422.0 | 8.0 | 3.0 | 3915.0 | 6.0 | 6.0 | 23.0 | 1.0 | 1.0 |
\n",
"| mean | NaN | 0.0 | 14.5 | 10.5 | 6.5 | 54.5 | NaN | 1423466538500.0 | 1480.5 | 672.5 | 8.0 | 3.0 | 3915.0 | 6.0 | NaN | 23.0 | 1.0 | NaN |
\n",
"| maxs | NaN | 0.0 | 18.0 | 14.0 | 9.0 | 63.0 | NaN | 1423467838000.0 | 1811.0 | 923.0 | 8.0 | 3.0 | 3915.0 | 6.0 | 6.0 | 23.0 | 1.0 | 1.0 |
\n",
"| sigma | NaN | 0.0 | 4.949747468305833 | 4.949747468305833 | 3.5355339059327378 | 12.020815280171307 | NaN | 1837770.524303837 | 467.39758236430794 | 354.26049737446033 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | NaN |
\n",
"| zeros | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
\n",
"| missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
\n",
"| 0 | NARCOTICS | false | 18.0 | 7.0 | 4.0 | 46.0 | STREET | 1423467838000.0 | 1811.0 | 422.0 | 8.0 | 3.0 | 3915.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"| 1 | DECEPTIVE PRACTICE | false | 11.0 | 14.0 | 9.0 | 63.0 | RESIDENCE | 1423465239000.0 | 1150.0 | 923.0 | 8.0 | 3.0 | 3915.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
\n",
"
"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Create new H2OFrame of crime observations\n",
"# NOTE(review): these keys use dots (\"Primary.Type\") while the crimes frame the model is\n",
"# trained on uses spaces (\"Primary Type\"); confirm the model can match them at predict time.\n",
"examples = {\n",
"    \"Date\": [\"02/08/2015 11:43:58 PM\", \"02/08/2015 11:00:39 PM\"],\n",
"    \"IUCR\": [1811, 1150],\n",
"    \"Primary.Type\": [\"NARCOTICS\", \"DECEPTIVE PRACTICE\"],\n",
"    \"Location.Description\": [\"STREET\", \"RESIDENCE\"],\n",
"    \"Domestic\": [\"false\", \"false\"],\n",
"    \"Beat\": [422, 923],\n",
"    \"District\": [4, 9],\n",
"    \"Ward\": [7, 14],\n",
"    \"Community.Area\": [46, 63],\n",
"    \"FBI.Code\": [18, 11]\n",
"    }\n",
"\n",
"crime_examples = h2o.H2OFrame(examples)\n",
"\n",
"# Refine date column and merge with census data\n",
"refine_date_col(crime_examples, \"Date\", \"%m/%d/%Y %I:%M:%S %p\")\n",
"# drop() and merge() return new frames; keep the results (as is done for crimes),\n",
"# otherwise Date stays in the frame and the census columns are never added.\n",
"crime_examples = crime_examples.drop(\"Date\")\n",
"census.set_name(0,\"Community.Area\")\n",
"crime_examples = crime_examples.merge(census, all_x=True, all_y=False)\n",
"crime_examples.describe()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"| FBI Code | \n",
"GBM Arrest Prob | \n",
"DL Arrest Prob |
\n",
"| 18 | \n",
"0.1199714 | \n",
"0.3047381 |
\n",
"| 11 | \n",
"0.1199714 | \n",
"0.2496035 |
"
],
"text/plain": [
" FBI Code GBM Arrest Prob DL Arrest Prob\n",
"---------- ----------------- ----------------\n",
" 18 0.119971 0.304738\n",
" 11 0.119971 0.249603"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"| FBI Code | \n",
"GBM Arrest Prob | \n",
"DL Arrest Prob |
\n",
"| 18 | \n",
"0.1199714 | \n",
"0.3047381 |
\n",
"| 11 | \n",
"0.1199714 | \n",
"0.2496035 |
"
],
"text/plain": []
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Score the example observations with the GBM\n",
"gbm_pred = data_gbm.predict(crime_examples)\n",
"# dl_pred = data_dl .predict(crime_examples)\n",
"\n",
"# Variant of the table with the Deep Learning column (disabled together with the DL model):\n",
"# header = [\"FBI Code\", \"GBM Arrest Prob\", \"DL Arrest Prob\"]\n",
"# table = [\n",
"#     [examples[\"FBI.Code\"][0], gbm_pred[0,\"true\"], dl_pred[0,\"true\"]],\n",
"#     [examples[\"FBI.Code\"][1], gbm_pred[1,\"true\"], dl_pred[1,\"true\"]]\n",
"# ]\n",
"\n",
"# One row per example: its FBI code next to the predicted probability of the \"true\" class\n",
"header = [\"FBI Code\", \"GBM Arrest Prob\"]\n",
"table = [\n",
"    [fbi_code, gbm_pred[row, \"true\"]]\n",
"    for row, fbi_code in enumerate(examples[\"FBI.Code\"])\n",
"]\n",
"h2o.display.H2OTableDisplay(table, columns_labels=header)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}