{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import h2o\n", "from h2o.estimators.gbm import H2OGradientBoostingEstimator\n", "from h2o.estimators.deeplearning import H2ODeepLearningEstimator" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
H2O cluster uptime: 1 hours 13 minutes 22 seconds 521 milliseconds
H2O cluster version: 3.7.0.99999
H2O cluster name: ludirehak
H2O cluster total nodes: 1
H2O cluster total free memory: 3.24 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321
H2O Connection proxy: None
Python Version: 3.5.1
" ], "text/plain": [ "------------------------------ ----------------------------------------------\n", "H2O cluster uptime: 1 hours 13 minutes 22 seconds 521 milliseconds\n", "H2O cluster version: 3.7.0.99999\n", "H2O cluster name: ludirehak\n", "H2O cluster total nodes: 1\n", "H2O cluster total free memory: 3.24 GB\n", "H2O cluster total cores: 8\n", "H2O cluster allowed cores: 8\n", "H2O cluster healthy: True\n", "H2O Connection ip: 127.0.0.1\n", "H2O Connection port: 54321\n", "H2O Connection proxy:\n", "Python Version: 3.5.1\n", "------------------------------ ----------------------------------------------" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Connect to a cluster\n", "h2o.init()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Import and Parse weather data\n", "\n", "Parse Progress: [##################################################] 100%\n", "Rows:5,162 Cols:7\n", "\n", "Chunk compression summary: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers198.225108 1.5 KB3.3811588
C0DConstant Reals3314.285715 2.6 KB5.872539
C11-Byte Integers83.4632034 1.7 KB3.9528418
C1N1-Byte Integers (w/o NAs)13558.441555 29.6 KB67.35625
C1S1-Byte Fractions3615.584415 8.5 KB19.437214
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ ------- -----------------\n", "C0L Constant Integers 19 8.22511 1.5 KB 3.38116\n", "C0D Constant Reals 33 14.2857 2.6 KB 5.87254\n", "C1 1-Byte Integers 8 3.4632 1.7 KB 3.95284\n", "C1N 1-Byte Integers (w/o NAs) 135 58.4416 29.6 KB 67.3563\n", "C1S 1-Byte Fractions 36 15.5844 8.5 KB 19.4372" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.43:54321 43.9 KB5162.033.0231.0
mean 43.9 KB5162.033.0231.0
min 43.9 KB5162.033.0231.0
max 43.9 KB5162.033.0231.0
stddev 0 B0.00.00.0
total 43.9 KB5162.033.0231.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------- ---------------- ----------------------------- ------------------\n", "172.16.2.43:54321 43.9 KB 5162 33 231\n", "mean 43.9 KB 5162 33 231\n", "min 43.9 KB 5162 33 231\n", "max 43.9 KB 5162 33 231\n", "stddev 0 B 0 0 0\n", "total 43.9 KB 5162 33 231" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
date month day year maxTemp meanTemp minTemp
type time int int int int int int
mins NaN 1.0 1.0 2001.0 -2.0 -9.0 -18.0
mean 0.0 6.47442851607904 15.7082526152654012007.571483920960958.87104292095552450.31035152456788 41.4812584967955
maxs NaN 12.0 31.0 2015.0 103.0 93.0 82.0
sigma -0.0 3.4690517169376858.798951739966594 4.077340905700527 21.48297772368538719.93023992660888419.020729712312264
zeros -5162 0 0 0 0 2 16
missing5162 0 0 0 13 13 13
0 nan 1.0 1.0 2001.0 23.0 14.0 6.0
1 nan 1.0 2.0 2001.0 18.0 12.0 6.0
2 nan 1.0 3.0 2001.0 28.0 18.0 8.0
3 nan 1.0 4.0 2001.0 30.0 24.0 19.0
4 nan 1.0 5.0 2001.0 36.0 30.0 21.0
5 nan 1.0 6.0 2001.0 33.0 26.0 19.0
6 nan 1.0 7.0 2001.0 34.0 28.0 21.0
7 nan 1.0 8.0 2001.0 26.0 20.0 14.0
8 nan 1.0 9.0 2001.0 23.0 16.0 10.0
9 nan 1.0 10.0 2001.0 34.0 26.0 19.0
" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Import and Parse census data\n", "\n", "Parse Progress: [##################################################] 100%\n", "Rows:79 Cols:9\n", "\n", "Chunk compression summary: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C11-Byte Integers333.333336 441 B22.546013
C1S1-Byte Fractions111.111112 163 B8.333334
C2S2-Byte Fractions444.444447 968 B49.488754
C44-Byte Integers111.111112 384 B19.6319
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ---------------- ------- ------------------ ------ -----------------\n", "C1 1-Byte Integers 3 33.3333 441 B 22.546\n", "C1S 1-Byte Fractions 1 11.1111 163 B 8.33333\n", "C2S 2-Byte Fractions 4 44.4444 968 B 49.4888\n", "C4 4-Byte Integers 1 11.1111 384 B 19.6319" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.43:54321 1.9 KB79.01.09.0
mean 1.9 KB79.01.09.0
min 1.9 KB79.01.09.0
max 1.9 KB79.01.09.0
stddev 0 B0.00.00.0
total 1.9 KB79.01.09.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------ ---------------- ----------------------------- ------------------\n", "172.16.2.43:54321 1.9 KB 79 1 9\n", "mean 1.9 KB 79 1 9\n", "min 1.9 KB 79 1 9\n", "max 1.9 KB 79 1 9\n", "stddev 0 B 0 0 0\n", "total 1.9 KB 79 1 9" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Community Area Number COMMUNITY AREA NAME PERCENT OF HOUSING CROWDED PERCENT HOUSEHOLDS BELOW POVERTY PERCENT AGED 16 UNEMPLOYED PERCENT AGED 25 WITHOUT HIGH SCHOOL DIPLOMA PERCENT AGED UNDER 18 OR OVER 64 PER CAPITA INCOME HARDSHIP INDEX
type int enum real real real real real int int
mins 1.0 0.0 0.30000000000000004 3.3000000000000003 4.7 2.5 13.5 8201.0 1.0
mean 39.0 NaN 4.920512820512822 21.73974358974359 15.341025641025642 20.33076923076924 35.71794871794871 25597.000000000004 49.506493506493506
maxs 77.0 78.0 15.8 56.5 35.9 54.800000000000004 51.5 88669.0 98.0
sigma 22.371857321197094 NaN 3.6589814413502006 11.457230912971083 7.49949670860991 11.746514351100048 7.284421084944952 15196.405541331917 28.69055565156158
zeros 0 1 0 0 0 0 0 0 0
missing2 0 1 1 1 1 1 1 2
0 nan COMMUNITY AREA NAME nan nan nan nan nan nan nan
1 1.0 Rogers Park 7.7 23.6 8.700000000000001 18.2 27.5 23939.0 39.0
2 2.0 West Ridge 7.800000000000001 17.2 8.8 20.8 38.5 23040.0 46.0
3 3.0 Uptown 3.8000000000000003 24.0 8.9 11.8 22.200000000000003 35787.0 20.0
4 4.0 Lincoln Square 3.4000000000000004 10.9 8.200000000000001 13.4 25.5 37524.0 17.0
5 5.0 North Center 0.30000000000000004 7.5 5.2 4.5 26.200000000000003 57123.0 6.0
6 6.0 Lake View 1.1 11.4 4.7 2.6 17.0 60058.0 5.0
7 7.0 Lincoln Park 0.8 12.3 5.1000000000000005 3.6 21.5 71551.0 2.0
8 8.0 Near North Side 1.9000000000000001 12.9 7.0 2.5 22.6 88669.0 1.0
9 9.0 Edison Park 1.1 3.3000000000000003 6.5 7.4 35.300000000000004 40959.0 8.0
" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Import and Parse crimes data\n", "\n", "Parse Progress: [##################################################] 100%\n", "Rows:9,999 Cols:22\n", "\n", "Chunk compression summary: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers14.5454545 80 B0.0092869
C11-Byte Integers836.363636 78.6 KB9.349084
C1N1-Byte Integers (w/o NAs)29.090909 19.7 KB2.337271
C22-Byte Integers418.181818 78.4 KB9.317509
C44-Byte Integers313.636364 117.4 KB13.952581
CStrString29.090909 390.7 KB46.446617
C8D64-bit Reals29.090909 156.4 KB18.587654
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ -------- -----------------\n", "C0L Constant Integers 1 4.54545 80 B 0.00928686\n", "C1 1-Byte Integers 8 36.3636 78.6 KB 9.34908\n", "C1N 1-Byte Integers (w/o NAs) 2 9.09091 19.7 KB 2.33727\n", "C2 2-Byte Integers 4 18.1818 78.4 KB 9.31751\n", "C4 4-Byte Integers 3 13.6364 117.4 KB 13.9526\n", "CStr String 2 9.09091 390.7 KB 46.4466\n", "C8D 64-bit Reals 2 9.09091 156.4 KB 18.5877" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.43:54321 841.2 KB9999.01.022.0
mean 841.2 KB9999.01.022.0
min 841.2 KB9999.01.022.0
max 841.2 KB9999.01.022.0
stddev 0 B0.00.00.0
total 841.2 KB9999.01.022.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- -------- ---------------- ----------------------------- ------------------\n", "172.16.2.43:54321 841.2 KB 9999 1 22\n", "mean 841.2 KB 9999 1 22\n", "min 841.2 KB 9999 1 22\n", "max 841.2 KB 9999 1 22\n", "stddev 0 B 0 0 0\n", "total 841.2 KB 9999 1 22" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location
type int string string enum int enum enum enum enum enum int int int int int int int int enum real real enum
mins 21735.0 NaN NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 2015.00.0 41.64507243 -87.906463888 0.0
mean 9931318.737373699NaN NaN NaN 1189.676513569939NaN NaN NaN 0.292829282928292740.152315231523152351159.618061806176511.34898851275791822.95409540954100837.44764476447653612.7401236226821141163880.59814984071885916.14984243082015.0NaN 41.842565224673535 -87.67414052209607 NaN
maxs 9962898.0 NaN NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 2015.032.0 42.022646183 -87.524773286 8603.0
sigma 396787.5642214295NaN NaN NaN 927.7514355826443NaN NaN NaN 0.4550835155878833 0.3593441468595258 695.7602987498396 6.945474933012859 13.64956611436129621.2748762223208567.574238579108433 16496.44936814723831274.01631985589 0.0 NaN 0.086018657935848240.06003579706529789NaN
zeros 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1
missing0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162
0 9955810.0 HY144797 02/08/2015 11:43:40 PM081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 2015.002/15/2015 12:43:39 PM41.747693646 -87.54903538900001 (41.747693646, -87.549035389)
1 9955861.0 HY144838 02/08/2015 11:41:42 PM118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 2015.002/15/2015 12:43:39 PM41.679442289 -87.622850758 (41.679442289, -87.622850758)
2 9955801.0 HY144779 02/08/2015 11:30:22 PM002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 2015.002/15/2015 12:43:39 PM41.877773330000004 -87.755117993 (41.87777333, -87.755117993)
3 9956197.0 HY144787 02/08/2015 11:30:23 PM006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 321.0 nan 6.0 42.0 18.0 nan nan 2015.002/15/2015 12:43:39 PMnan nan
4 9955846.0 HY144829 02/08/2015 11:30:58 PM0000X S MAYFIELD AVE610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 2015.002/15/2015 12:43:39 PM41.880025548000006 -87.77154132400001 (41.880025548, -87.771541324)
5 9955835.0 HY144778 02/08/2015 11:30:21 PM010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 2015.002/15/2015 12:43:39 PM41.807059405000004 -87.65206589 (41.807059405, -87.65206589)
6 9955872.0 HY144822 02/08/2015 11:27:24 PM015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGETO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 2015.002/15/2015 12:43:39 PM41.999814056000005 -87.669342967 (41.999814056, -87.669342967)
7 21752.0 HY144738 02/08/2015 11:26:12 PM060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 2015.002/15/2015 12:43:39 PM41.920755683 -87.776067514 (41.920755683, -87.776067514)
8 9955808.0 HY144775 02/08/2015 11:20:33 PM001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 2015.002/15/2015 12:43:39 PM41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)
9 9958275.0 HY146732 02/08/2015 11:15:36 PM001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 2015.002/15/2015 12:43:39 PM41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.\n", "weather_path = _locate(\"smalldata/chicago/chicagoAllWeather.csv\")\n", "census_path = _locate(\"smalldata/chicago/chicagoCensus.csv\")\n", "crimes_path = _locate(\"smalldata/chicago/chicagoCrimes10k.csv.zip\")\n", "\n", "print(\"Import and Parse weather data\")\n", "weather = h2o.import_file(path=weather_path, col_types = [\"time\"] + [\"numeric\"]*6)\n", "weather.drop(\"date\")\n", "weather.describe()\n", "\n", "print(\"Import and Parse census data\")\n", "census = h2o.import_file(path=census_path, col_types = [\"numeric\", \"enum\"] + [\"numeric\"]*7)\n", "census.describe()\n", "\n", "print(\"Import and Parse crimes data\")\n", "crimes = h2o.import_file(path=crimes_path)\n", "crimes.describe()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows:9,999 Cols:27\n", "\n", "Chunk compression summary: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers13.7037036 80 B0.0110837
C11-Byte Integers829.62963 78.6 KB11.157955
C1N1-Byte Integers (w/o NAs)725.925926 68.8 KB9.763211
C22-Byte Integers414.814815 78.4 KB11.12027
C44-Byte Integers311.111112 117.4 KB16.652143
C864-bit Integers13.7037036 78.2 KB11.092008
CStrString13.7037036 127.0 KB18.019316
C8D64-bit Reals27.4074073 156.4 KB22.184013
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ -------- -----------------\n", "C0L Constant Integers 1 3.7037 80 B 0.0110837\n", "C1 1-Byte Integers 8 29.6296 78.6 KB 11.158\n", "C1N 1-Byte Integers (w/o NAs) 7 25.9259 68.8 KB 9.76321\n", "C2 2-Byte Integers 4 14.8148 78.4 KB 11.1203\n", "C4 4-Byte Integers 3 11.1111 117.4 KB 16.6521\n", "C8 64-bit Integers 1 3.7037 78.2 KB 11.092\n", "CStr String 1 3.7037 127.0 KB 18.0193\n", "C8D 64-bit Reals 2 7.40741 156.4 KB 22.184" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.43:54321 704.9 KB9999.01.027.0
mean 704.9 KB9999.01.027.0
min 704.9 KB9999.01.027.0
max 704.9 KB9999.01.027.0
stddev 0 B0.00.00.0
total 704.9 KB9999.01.027.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- -------- ---------------- ----------------------------- ------------------\n", "172.16.2.43:54321 704.9 KB 9999 1 27\n", "mean 704.9 KB 9999 1 27\n", "min 704.9 KB 9999 1 27\n", "max 704.9 KB 9999 1 27\n", "stddev 0 B 0 0 0\n", "total 704.9 KB 9999 1 27" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay
type int string int enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int
mins 21735.0 NaN 1422030630000.0 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.00.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0
mean 9931318.737373699NaN 1422714450809.2847NaN 1189.676513569939NaN NaN NaN 0.292829282928292740.152315231523152351159.618061806176511.34898851275791822.95409540954100837.44764476447653612.7401236226821141163880.59814984071885916.14984243083915.0NaN 41.842565224673535 -87.67414052209607 NaN 17.6839683968396632.419441944194423 5.1808180818082 NaN 13.631963196319662
maxs 9962898.0 NaN 1423467820000.0 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.032.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0
sigma 396787.5642214295NaN 433879245.1905283 NaN 927.7514355826443NaN NaN NaN 0.4550835155878833 0.3593441468595258 695.7602987498396 6.945474933012859 13.64956611436129621.2748762223208567.574238579108433 16496.44936814723831274.01631985589 0.0 NaN 0.086018657935848240.06003579706529789NaN 11.1801043358277020.49349240678653860.7389298304087689NaN 6.4732173580715475
zeros 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374
missing0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0
0 9955810.0 HY144797 1423467820000.0 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 3915.002/15/2015 12:43:39 PM41.747693646 -87.54903538900001 (41.747693646, -87.549035389)8.0 3.0 6.0 Sun 23.0
1 9955861.0 HY144838 1423467702000.0 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 3915.002/15/2015 12:43:39 PM41.679442289 -87.622850758 (41.679442289, -87.622850758)8.0 3.0 6.0 Sun 23.0
2 9955801.0 HY144779 1423467022000.0 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 3915.002/15/2015 12:43:39 PM41.877773330000004 -87.755117993 (41.87777333, -87.755117993) 8.0 3.0 6.0 Sun 23.0
3 9956197.0 HY144787 1423467023000.0 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 321.0 nan 6.0 42.0 18.0 nan nan 3915.002/15/2015 12:43:39 PMnan nan 8.0 3.0 6.0 Sun 23.0
4 9955846.0 HY144829 1423467058000.0 0000X S MAYFIELD AVE610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 3915.002/15/2015 12:43:39 PM41.880025548000006 -87.77154132400001 (41.880025548, -87.771541324)8.0 3.0 6.0 Sun 23.0
5 9955835.0 HY144778 1423467021000.0 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 3915.002/15/2015 12:43:39 PM41.807059405000004 -87.65206589 (41.807059405, -87.65206589) 8.0 3.0 6.0 Sun 23.0
6 9955872.0 HY144822 1423466844000.0 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGETO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 3915.002/15/2015 12:43:39 PM41.999814056000005 -87.669342967 (41.999814056, -87.669342967)8.0 3.0 6.0 Sun 23.0
7 21752.0 HY144738 1423466772000.0 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 3915.002/15/2015 12:43:39 PM41.920755683 -87.776067514 (41.920755683, -87.776067514)8.0 3.0 6.0 Sun 23.0
8 9955808.0 HY144775 1423466433000.0 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0
9 9958275.0 HY146732 1423466136000.0 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0
" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Rows:9,999 Cols:28\n", "\n", "Chunk compression summary: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers13.5714288 80 B0.0124154
CBSBits27.1428576 2.6 KB0.4097082
C11-Byte Integers828.57143 78.6 KB12.498584
C1N1-Byte Integers (w/o NAs)725.0 68.8 KB10.936261
C22-Byte Integers414.285715 78.4 KB12.456371
C44-Byte Integers310.714286 117.4 KB18.652899
CStrString13.5714288 127.0 KB20.184338
C8D64-bit Reals27.1428576 156.4 KB24.849424
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ -------- -----------------\n", "C0L Constant Integers 1 3.57143 80 B 0.0124154\n", "CBS Bits 2 7.14286 2.6 KB 0.409708\n", "C1 1-Byte Integers 8 28.5714 78.6 KB 12.4986\n", "C1N 1-Byte Integers (w/o NAs) 7 25 68.8 KB 10.9363\n", "C2 2-Byte Integers 4 14.2857 78.4 KB 12.4564\n", "C4 4-Byte Integers 3 10.7143 117.4 KB 18.6529\n", "CStr String 1 3.57143 127.0 KB 20.1843\n", "C8D 64-bit Reals 2 7.14286 156.4 KB 24.8494" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.43:54321 629.3 KB9999.01.028.0
mean 629.3 KB9999.01.028.0
min 629.3 KB9999.01.028.0
max 629.3 KB9999.01.028.0
stddev 0 B0.00.00.0
total 629.3 KB9999.01.028.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- -------- ---------------- ----------------------------- ------------------\n", "172.16.2.43:54321 629.3 KB 9999 1 28\n", "mean 629.3 KB 9999 1 28\n", "min 629.3 KB 9999 1 28\n", "max 629.3 KB 9999 1 28\n", "stddev 0 B 0 0 0\n", "total 629.3 KB 9999 1 28" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
ID Case Number Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay Weekend Season
type int string enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int int enum
mins 21735.0 NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.00.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0 0.0 0.0
mean 9931318.737373699NaN NaN 1189.676513569939NaN NaN NaN 0.292829282928292740.152315231523152351159.618061806176511.34898851275791822.95409540954100837.44764476447653612.7401236226821141163880.59814984071885916.14984243083915.0NaN 41.842565224673535 -87.67414052209607 NaN 17.6839683968396632.419441944194423 5.1808180818082 NaN 13.6319631963196620.35753575357535755NaN
maxs 9962898.0 NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.032.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0 1.0 1.0
sigma 396787.5642214295NaN NaN 927.7514355826443NaN NaN NaN 0.4550835155878833 0.3593441468595258 695.7602987498396 6.945474933012859 13.64956611436129621.2748762223208567.574238579108433 16496.44936814723831274.01631985589 0.0 NaN 0.086018657935848240.06003579706529789NaN 11.1801043358277020.49349240678653860.7389298304087689NaN 6.47321735807154750.47929835538994453NaN
zeros 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374 6424 5805
missing0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0 0 0
0 9955810.0 HY144797 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 3915.002/15/2015 12:43:39 PM41.747693646 -87.54903538900001 (41.747693646, -87.549035389)8.0 3.0 6.0 Sun 23.0 1.0 Spring
1 9955861.0 HY144838 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 3915.002/15/2015 12:43:39 PM41.679442289 -87.622850758 (41.679442289, -87.622850758)8.0 3.0 6.0 Sun 23.0 1.0 Spring
2 9955801.0 HY144779 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 3915.002/15/2015 12:43:39 PM41.877773330000004 -87.755117993 (41.87777333, -87.755117993) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
3 9956197.0 HY144787 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 321.0 nan 6.0 42.0 18.0 nan nan 3915.002/15/2015 12:43:39 PMnan nan 8.0 3.0 6.0 Sun 23.0 1.0 Spring
4 9955846.0 HY144829 0000X S MAYFIELD AVE610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 3915.002/15/2015 12:43:39 PM41.880025548000006 -87.77154132400001 (41.880025548, -87.771541324)8.0 3.0 6.0 Sun 23.0 1.0 Spring
5 9955835.0 HY144778 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 3915.002/15/2015 12:43:39 PM41.807059405000004 -87.65206589 (41.807059405, -87.65206589) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
6 9955872.0 HY144822 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGETO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 3915.002/15/2015 12:43:39 PM41.999814056000005 -87.669342967 (41.999814056, -87.669342967)8.0 3.0 6.0 Sun 23.0 1.0 Spring
7 21752.0 HY144738 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 3915.002/15/2015 12:43:39 PM41.920755683 -87.776067514 (41.920755683, -87.776067514)8.0 3.0 6.0 Sun 23.0 1.0 Spring
8 9955808.0 HY144775 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0 1.0 Spring
9 9958275.0 HY146732 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0 1.0 Spring
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def refine_date_col(data, col, pattern):\n", " #data[col] = data[col].as_date(pattern) # As of 5/29/2106 H2O defaults parse as a date\n", " data[\"Day\"] = data[col].day()\n", " data[\"Month\"] = data[col].month() + 1 # Since H2O indexes from 0\n", " data[\"Year\"] = data[col].year() + 1900 # Start of epoch is 1900\n", " data[\"WeekNum\"] = data[col].week()\n", " data[\"WeekDay\"] = data[col].dayOfWeek()\n", " data[\"HourOfDay\"] = data[col].hour()\n", " \n", " data.describe() # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.\n", " \n", " # Create weekend and season cols\n", " # Spring = Mar, Apr, May. Summer = Jun, Jul, Aug. Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.\n", " # data[\"Weekend\"] = [1 if x in (\"Sun\", \"Sat\") else 0 for x in data[\"WeekDay\"]]\n", " data[\"Weekend\"] = ((data[\"WeekDay\"] == \"Sun\") | (data[\"WeekDay\"] == \"Sat\"))\n", " data[\"Season\"] = data[\"Month\"].cut([0, 2, 5, 7, 10, 12], [\"Winter\", \"Spring\", \"Summer\", \"Autumn\", \"Winter\"])\n", " \n", "refine_date_col(crimes, \"Date\", \"%m/%d/%Y %I:%M:%S %p\")\n", "crimes = crimes.drop(\"Date\")\n", "crimes.describe()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rows:9,999 Cols:28\n", "\n", "Chunk compression summary: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers13.5714288 80 B0.0124154
CBSBits27.1428576 2.6 KB0.4097082
C11-Byte Integers828.57143 78.6 KB12.498584
C1N1-Byte Integers (w/o NAs)725.0 68.8 KB10.936261
C22-Byte Integers414.285715 78.4 KB12.456371
C44-Byte Integers310.714286 117.4 KB18.652899
CStrString13.5714288 127.0 KB20.184338
C8D64-bit Reals27.1428576 156.4 KB24.849424
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ -------- -----------------\n", "C0L Constant Integers 1 3.57143 80 B 0.0124154\n", "CBS Bits 2 7.14286 2.6 KB 0.409708\n", "C1 1-Byte Integers 8 28.5714 78.6 KB 12.4986\n", "C1N 1-Byte Integers (w/o NAs) 7 25 68.8 KB 10.9363\n", "C2 2-Byte Integers 4 14.2857 78.4 KB 12.4564\n", "C4 4-Byte Integers 3 10.7143 117.4 KB 18.6529\n", "CStr String 1 3.57143 127.0 KB 20.1843\n", "C8D 64-bit Reals 2 7.14286 156.4 KB 24.8494" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.43:54321 629.3 KB9999.01.028.0
mean 629.3 KB9999.01.028.0
min 629.3 KB9999.01.028.0
max 629.3 KB9999.01.028.0
stddev 0 B0.00.00.0
total 629.3 KB9999.01.028.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- -------- ---------------- ----------------------------- ------------------\n", "172.16.2.43:54321 629.3 KB 9999 1 28\n", "mean 629.3 KB 9999 1 28\n", "min 629.3 KB 9999 1 28\n", "max 629.3 KB 9999 1 28\n", "stddev 0 B 0 0 0\n", "total 629.3 KB 9999 1 28" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
ID Case Number Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay Weekend Season
type int string enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int int enum
mins 21735.0 NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.00.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0 0.0 0.0
mean 9931318.737373699NaN NaN 1189.676513569939NaN NaN NaN 0.292829282928292740.152315231523152351159.618061806176511.34898851275791822.95409540954100837.44764476447653612.7401236226821141163880.59814984071885916.14984243083915.0NaN 41.842565224673535 -87.67414052209607 NaN 17.6839683968396632.419441944194423 5.1808180818082 NaN 13.6319631963196620.35753575357535755NaN
maxs 9962898.0 NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.032.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0 1.0 1.0
sigma 396787.5642214295NaN NaN 927.7514355826443NaN NaN NaN 0.4550835155878833 0.3593441468595258 695.7602987498396 6.945474933012859 13.64956611436129621.2748762223208567.574238579108433 16496.44936814723831274.01631985589 0.0 NaN 0.086018657935848240.06003579706529789NaN 11.1801043358277020.49349240678653860.7389298304087689NaN 6.47321735807154750.47929835538994453NaN
zeros 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374 6424 5805
missing0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0 0 0
0 9955810.0 HY144797 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 3915.002/15/2015 12:43:39 PM41.747693646 -87.54903538900001 (41.747693646, -87.549035389)8.0 3.0 6.0 Sun 23.0 1.0 Spring
1 9955861.0 HY144838 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 3915.002/15/2015 12:43:39 PM41.679442289 -87.622850758 (41.679442289, -87.622850758)8.0 3.0 6.0 Sun 23.0 1.0 Spring
2 9955801.0 HY144779 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 3915.002/15/2015 12:43:39 PM41.877773330000004 -87.755117993 (41.87777333, -87.755117993) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
3 9956197.0 HY144787 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 321.0 nan 6.0 42.0 18.0 nan nan 3915.002/15/2015 12:43:39 PMnan nan 8.0 3.0 6.0 Sun 23.0 1.0 Spring
4 9955846.0 HY144829 0000X S MAYFIELD AVE610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 3915.002/15/2015 12:43:39 PM41.880025548000006 -87.77154132400001 (41.880025548, -87.771541324)8.0 3.0 6.0 Sun 23.0 1.0 Spring
5 9955835.0 HY144778 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 3915.002/15/2015 12:43:39 PM41.807059405000004 -87.65206589 (41.807059405, -87.65206589) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
6 9955872.0 HY144822 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGETO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 3915.002/15/2015 12:43:39 PM41.999814056000005 -87.669342967 (41.999814056, -87.669342967)8.0 3.0 6.0 Sun 23.0 1.0 Spring
7 21752.0 HY144738 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 3915.002/15/2015 12:43:39 PM41.920755683 -87.776067514 (41.920755683, -87.776067514)8.0 3.0 6.0 Sun 23.0 1.0 Spring
8 9955808.0 HY144775 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0 1.0 Spring
9 9958275.0 HY146732 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0 1.0 Spring
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Merge crimes data with weather and census\n", "census.set_name(0,\"Community Area\")\n", "weather.set_name(1,\"Month\")\n", "weather.set_name(2,\"Day\")\n", "weather.set_name(3,\"Year\")\n", "crimes.merge(census, all_x=True, all_y=False)\n", "crimes.merge(weather, all_x=True, all_y=False)\n", "crimes.describe()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "gbm Model Build Progress: [##################################################] 100%\n", "\n", "deeplearning Model Build Progress: [##################################################] 100%\n" ] } ], "source": [ "# Create test/train split\n", "r = crimes[\"Arrest\"].runif(1234)\n", "train = crimes[r < 0.8]\n", "test = crimes[r >= 0.8]\n", "\n", "# Simple GBM - Predict Arrest\n", "crimes_names_x = crimes.names[:]\n", "crimes_names_x.remove(\"Arrest\")\n", "data_gbm = H2OGradientBoostingEstimator(ntrees =10,\n", " max_depth =6,\n", " distribution =\"bernoulli\")\n", "\n", "data_gbm.train(x =crimes_names_x,\n", " y =\"Arrest\",\n", " training_frame =train,\n", " validation_frame=test)\n", "\n", "# Simple Deep Learning - Predict Arrest\n", "# data_dl = H2ODeepLearningEstimator(variable_importances=True,\n", "# loss =\"Automatic\")\n", "\n", "# data_dl.train(x =crimes_names_x,\n", "# y =\"Arrest\",\n", "# training_frame =train,\n", "# validation_frame=test)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
ModelAUC TrainAUC Test
GBM0.95682210.9307979
DL 0.89560550.8841564
" ], "text/plain": [ "Model AUC Train AUC Test\n", "------- ----------- ----------\n", "GBM 0.956822 0.930798\n", "DL 0.895605 0.884156" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
ModelAUC TrainAUC Test
GBM0.95682210.9307979
DL 0.89560550.8841564
" ], "text/plain": [] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# GBM performance on train/test data\n", "train_auc_gbm = data_gbm.model_performance(train).auc()\n", "test_auc_gbm = data_gbm.model_performance(test) .auc()\n", "\n", "# Deep Learning performance on train/test data\n", "# train_auc_dl = data_dl.model_performance(train).auc()\n", "# test_auc_dl = data_dl.model_performance(test) .auc()\n", "\n", "# Make a pretty HTML table printout of the results\n", "header = [\"Model\", \"AUC Train\", \"AUC Test\"]\n", "table = [\n", " [\"GBM\", train_auc_gbm, test_auc_gbm],\n", "# [\"DL \", train_auc_dl, test_auc_dl]\n", " ]\n", "h2o.display.H2OTableDisplay(table, columns_labels=header)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Parse Progress: [##################################################] 100%\n", "Rows:2 Cols:16\n", "\n", "Chunk compression summary: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers743.75 560 B43.75
C1N1-Byte Integers (w/o NAs)425.0 280 B21.875
C22-Byte Integers212.5 144 B11.25
C2S2-Byte Fractions16.25 88 B6.875
CStrString212.5 208 B16.25
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ ------ -----------------\n", "C0L Constant Integers 7 43.75 560 B 43.75\n", "C1N 1-Byte Integers (w/o NAs) 4 25 280 B 21.875\n", "C2 2-Byte Integers 2 12.5 144 B 11.25\n", "C2S 2-Byte Fractions 1 6.25 88 B 6.875\n", "CStr String 2 12.5 208 B 16.25" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.43:54321 1.3 KB2.01.016.0
mean 1.3 KB2.01.016.0
min 1.3 KB2.01.016.0
max 1.3 KB2.01.016.0
stddev 0 B0.00.00.0
total 1.3 KB2.01.016.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------ ---------------- ----------------------------- ------------------\n", "172.16.2.43:54321 1.3 KB 2 1 16\n", "mean 1.3 KB 2 1 16\n", "min 1.3 KB 2 1 16\n", "max 1.3 KB 2 1 16\n", "stddev 0 B 0 0 0\n", "total 1.3 KB 2 1 16" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Primary.Type Domestic FBI.Code Ward District Community.Area Location.Description Date IUCR Beat Day Month Year WeekNum WeekDay HourOfDay
type string enum int int int int string int int int int int int int enum int
mins NaN 0.0 11.0 7.0 4.0 46.0 NaN 1423465239000.0 1150.0 422.0 8.0 3.0 3915.06.0 6.0 23.0
mean NaN 0.0 14.5 10.5 6.5 54.5 NaN 1423466538500.0 1480.5 672.5 8.0 3.0 3915.06.0 NaN 23.0
maxs NaN 0.0 18.0 14.0 9.0 63.0 NaN 1423467838000.0 1811.0 923.0 8.0 3.0 3915.06.0 6.0 23.0
sigma NaN 0.0 4.9497474683058334.9497474683058333.535533905932737812.020815280171307NaN 1837770.524303837467.39758236430794354.260497374460330.0 0.0 0.0 0.0 NaN 0.0
zeros 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0
missing0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 NARCOTICS false 18.0 7.0 4.0 46.0 STREET 1423467838000.0 1811.0 422.0 8.0 3.0 3915.06.0 Sun 23.0
1 DECEPTIVE PRACTICEfalse 11.0 14.0 9.0 63.0 RESIDENCE 1423465239000.0 1150.0 923.0 8.0 3.0 3915.06.0 Sun 23.0
" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Rows:2 Cols:18\n", "\n", "Chunk compression summary: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
chunk_typechunk_namecountcount_percentagesizesize_percentage
C0LConstant Integers950.0 720 B50.0
C1N1-Byte Integers (w/o NAs)422.222223 280 B19.444445
C22-Byte Integers211.111112 144 B10.0
C2S2-Byte Fractions15.555556 88 B6.111111
CStrString211.111112 208 B14.444445
" ], "text/plain": [ "chunk_type chunk_name count count_percentage size size_percentage\n", "------------ ------------------------- ------- ------------------ ------ -----------------\n", "C0L Constant Integers 9 50 720 B 50\n", "C1N 1-Byte Integers (w/o NAs) 4 22.2222 280 B 19.4444\n", "C2 2-Byte Integers 2 11.1111 144 B 10\n", "C2S 2-Byte Fractions 1 5.55556 88 B 6.11111\n", "CStr String 2 11.1111 208 B 14.4444" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Frame distribution summary: \n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
sizenumber_of_rowsnumber_of_chunks_per_columnnumber_of_chunks
172.16.2.43:54321 1.4 KB2.01.018.0
mean 1.4 KB2.01.018.0
min 1.4 KB2.01.018.0
max 1.4 KB2.01.018.0
stddev 0 B0.00.00.0
total 1.4 KB2.01.018.0
" ], "text/plain": [ " size number_of_rows number_of_chunks_per_column number_of_chunks\n", "----------------- ------ ---------------- ----------------------------- ------------------\n", "172.16.2.43:54321 1.4 KB 2 1 18\n", "mean 1.4 KB 2 1 18\n", "min 1.4 KB 2 1 18\n", "max 1.4 KB 2 1 18\n", "stddev 0 B 0 0 0\n", "total 1.4 KB 2 1 18" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
Primary.Type Domestic FBI.Code Ward District Community.Area Location.Description Date IUCR Beat Day Month Year WeekNum WeekDay HourOfDay Weekend Season
type string enum int int int int string int int int int int int int enum int int enum
mins NaN 0.0 11.0 7.0 4.0 46.0 NaN 1423465239000.0 1150.0 422.0 8.0 3.0 3915.06.0 6.0 23.0 1.0 1.0
mean NaN 0.0 14.5 10.5 6.5 54.5 NaN 1423466538500.0 1480.5 672.5 8.0 3.0 3915.06.0 NaN 23.0 1.0 NaN
maxs NaN 0.0 18.0 14.0 9.0 63.0 NaN 1423467838000.0 1811.0 923.0 8.0 3.0 3915.06.0 6.0 23.0 1.0 1.0
sigma NaN 0.0 4.9497474683058334.9497474683058333.535533905932737812.020815280171307NaN 1837770.524303837467.39758236430794354.260497374460330.0 0.0 0.0 0.0 NaN 0.0 0.0 NaN
zeros 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
missing0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 NARCOTICS false 18.0 7.0 4.0 46.0 STREET 1423467838000.0 1811.0 422.0 8.0 3.0 3915.06.0 Sun 23.0 1.0 Spring
1 DECEPTIVE PRACTICEfalse 11.0 14.0 9.0 63.0 RESIDENCE 1423465239000.0 1150.0 923.0 8.0 3.0 3915.06.0 Sun 23.0 1.0 Spring
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create new H2OFrame of crime observations\n", "examples = {\n", " \"Date\": [\"02/08/2015 11:43:58 PM\", \"02/08/2015 11:00:39 PM\"],\n", " \"IUCR\": [1811, 1150],\n", " \"Primary.Type\": [\"NARCOTICS\", \"DECEPTIVE PRACTICE\"],\n", " \"Location.Description\": [\"STREET\", \"RESIDENCE\"],\n", " \"Domestic\": [\"false\", \"false\"],\n", " \"Beat\": [422, 923],\n", " \"District\": [4, 9],\n", " \"Ward\": [7, 14],\n", " \"Community.Area\": [46, 63],\n", " \"FBI.Code\": [18, 11]\n", " }\n", "\n", "crime_examples = h2o.H2OFrame(examples)\n", "\n", "# Refine date column and merge with census data\n", "refine_date_col(crime_examples, \"Date\", \"%m/%d/%Y %I:%M:%S %p\")\n", "crime_examples.drop(\"Date\")\n", "census.set_name(0,\"Community.Area\")\n", "crime_examples.merge(census, all_x=True, all_y=False)\n", "crime_examples.describe()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
FBI CodeGBM Arrest ProbDL Arrest Prob
180.11997140.3047381
110.11997140.2496035
" ], "text/plain": [ " FBI Code GBM Arrest Prob DL Arrest Prob\n", "---------- ----------------- ----------------\n", " 18 0.119971 0.304738\n", " 11 0.119971 0.249603" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
FBI CodeGBM Arrest ProbDL Arrest Prob
180.11997140.3047381
110.11997140.2496035
" ], "text/plain": [] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Predict probability of arrest from new observations\n", "gbm_pred = data_gbm.predict(crime_examples)\n", "# dl_pred = data_dl .predict(crime_examples)\n", "\n", "# Make a pretty HTML table printout of the results\n", "# header = [\"FBI Code\", \"GBM Arrest Prob\", \"DL Arrest Prob\"]\n", "# table = [\n", "# [examples[\"FBI.Code\"][0], gbm_pred[0,\"true\"], dl_pred[0,\"true\"]],\n", "# [examples[\"FBI.Code\"][1], gbm_pred[1,\"true\"], dl_pred[1,\"true\"]]\n", "# ]\n", "header = [\"FBI Code\", \"GBM Arrest Prob\"]\n", "table = [\n", " [examples[\"FBI.Code\"][0], gbm_pred[0,\"true\"]],\n", " [examples[\"FBI.Code\"][1], gbm_pred[1,\"true\"]],\n", " ]\n", "h2o.display.H2OTableDisplay(table, columns_labels=header)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.0" } }, "nbformat": 4, "nbformat_minor": 0 }